diff --git a/Makefile b/Makefile index 025942349..a0aa5b5b0 100644 --- a/Makefile +++ b/Makefile @@ -197,7 +197,7 @@ TARGETS_ALL := $(notdir $(wildcard $(SRC_DIR)/*)) # Temporarily filter out programs that do not build (yet) with CUDA 11 TARGETS_ALL := $(filter-out alpakatest alpaka,$(TARGETS_ALL)) # Temporarily filter out programs that do not run (yet) with CUDA 11 -TARGETS_ALL := $(filter-out cuda cudauvm,$(TARGETS_ALL)) +TARGETS_ALL := $(filter-out cudauvm,$(TARGETS_ALL)) # Split targets by required toolchain TARGETS_GCC := fwtest diff --git a/README.md b/README.md index df8df18d5..bd21906eb 100644 --- a/README.md +++ b/README.md @@ -22,23 +22,46 @@ The purpose of this package is to explore various performance portability solutions with the [Patatrack](https://patatrack.web.cern.ch/patatrack/wiki/) pixel tracking application. The version here corresponds to -[CMSSW_11_1_0_pre4_Patatrack](https://github.com/cms-patatrack/cmssw/tree/CMSSW_11_1_0_pre4_Patatrack). +[CMSSW_11_2_0_pre8_Patatrack](https://github.com/cms-patatrack/cmssw/tree/CMSSW_11_2_0_pre8_Patatrack). -The application is designed to require minimal dependencies on the system: +The application is designed to require minimal dependencies on the system. All programs require * GNU Make, `curl`, `md5sum`, `tar` - * CMake for `kokkostest` and `kokkos` programs -* C++17 capable compiler that works with `nvcc`, in the current setup this pretty much means GCC 8 -* CUDA 11.0 runtime and drivers (real drivers are not needed for building) -* [Intel oneAPI Base Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) - -All other external dependencies (listed below) are downloaded and built automatically. -* [TBB](https://github.com/intel/tbb) (all programs) -* [CUB](https://nvlabs.github.io/cub/) (`cudatest` and `cuda` programs) -* [Eigen](http://eigen.tuxfamily.org/) (`cuda` program) -* [Kokkos](https://github.com/kokkos/kokkos) (`kokkostest` and `kokkos` programs) -* [Boost](https://www.boost.org/) (`alpakatest` and `alpaka` programs) - * Boost libraries from the system can also be used, but they need to be newer than 1.65.1 -* [Alpaka](https://github.com/alpaka-group/alpaka) (`alpakatest` and `alpaka` programs) +* C++17 capable compiler. 
For programs using CUDA that must work with `nvcc`, in the current setup this means GCC 8 or 9, possibly 10 with CUDA 11.1 + * testing is currently done with GCC 8 + +In addition, the individual programs assume the following to be found on the system + +| Application | CMake (>= 3.10) | CUDA 11 runtime and drivers | [Intel oneAPI Base Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) | +|--------------|--------------------|-----------------------------|------------------------------------------------------------------------------------------------------------------| +| `cudatest` | | :heavy_check_mark: | | +| `cuda` | | :heavy_check_mark: | | +| `cudadev` | | :heavy_check_mark: | | +| `cudauvm` | | :heavy_check_mark: | | +| `kokkostest` | :heavy_check_mark: | :heavy_check_mark: | | +| `kokkos` | :heavy_check_mark: | :heavy_check_mark: | | +| `alpakatest` | | :heavy_check_mark: | | +| `alpaka` | | :heavy_check_mark: | | +| `sycltest` | | | :heavy_check_mark: | + + +All other dependencies (listed below) are downloaded and built automatically. + + +| Application | [TBB](https://github.com/intel/tbb) | [Eigen](http://eigen.tuxfamily.org/) | [Kokkos](https://github.com/kokkos/kokkos) | [Boost](https://www.boost.org/) (*) | [Alpaka](https://github.com/alpaka-group/alpaka) | +|--------------|-------------------------------------|--------------------------------------|--------------------------------------------|-------------------------------------|--------------------------------------------------| +| `fwtest` | :heavy_check_mark: | | | | | +| `cudatest` | :heavy_check_mark: | | | | | +| `cuda` | :heavy_check_mark: | :heavy_check_mark: | | | | +| `cudadev` | :heavy_check_mark: | :heavy_check_mark: | | | | +| `cudauvm` | :heavy_check_mark: | :heavy_check_mark: | | | | +| `kokkostest` | :heavy_check_mark: | | :heavy_check_mark: | | | +| `kokkos` | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | +| `alpakatest` | :heavy_check_mark: | | | :heavy_check_mark: | :heavy_check_mark: | +| `alpaka` | :heavy_check_mark: | | | :heavy_check_mark: | :heavy_check_mark: | +| `sycltest` | :heavy_check_mark: | | | | | + + +* (*) Boost libraries from the system can also be used, but they need to be newer than 1.65.1 The input data set consists of a minimal binary dump of 1000 events of ttbar+PU events from of @@ -49,21 +72,23 @@ downloaded automatically during the build process. 
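For example, each application listed in the tables above is built with a `make` target of the same name (a minimal sketch; the authoritative commands and options are in the Quick recipe below, and the run line here assumes the executable is placed at the top of the working tree):

```bash
# Build the frozen CUDA application; any first-column name above is a valid target.
# The other dependencies (TBB, Eigen, ...) and the input data are downloaded and
# built automatically, so only the system packages listed above are needed beforehand.
make -j8 cuda
# Run it (assumed path; see the Quick recipe below for the exact invocation).
./cuda
```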
## Status -| Application | Description | Framework | Device framework | Test code | Raw2Cluster | RecHit | Pixel tracking | Vertex | Transfers to CPU | -|--------------|----------------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| -| `fwtest` | Framework test | :heavy_check_mark: | | :heavy_check_mark: | | | | | | -| `cudatest` | CUDA FW test | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | | | -| `cuda` | CUDA version (frozen) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| `cudadev` | CUDA version (development) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| `cudauvm` | CUDA version with managed memory | :heavy_check_mark: | :heavy_check_mark: | | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| `kokkostest` | Kokkos FW test | :heavy_check_mark: | :white_check_mark: | :heavy_check_mark: | | | | | | -| `kokkos` | Kokkos version | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| `alpakatest` | Alpaka FW test | :heavy_check_mark: | | :white_check_mark: | | | | | | -| `alpaka` | Alpaka version | :white_check_mark: | | | :white_check_mark: | | | | | -| `sycltest` | SYCL/oneAPI FW test | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | | | +| Application | Description | Framework | Device framework | Test code | Raw2Cluster | RecHit | Pixel tracking | Vertex | Transfers to CPU | Validation code | Validated | +|--------------|----------------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| +| `fwtest` | Framework test | :heavy_check_mark: | | :heavy_check_mark: | | | | | | | | +| `cudatest` | CUDA FW test | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | | | | | +| `cuda` | CUDA version (frozen) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| `cudadev` | CUDA version (development) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| `cudauvm` | CUDA version with managed memory | :heavy_check_mark: | :heavy_check_mark: | | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | | +| `kokkostest` | Kokkos FW test | :heavy_check_mark: | :white_check_mark: | :heavy_check_mark: | | | | | | | | +| `kokkos` | Kokkos version | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | +| `alpakatest` | Alpaka FW test | :heavy_check_mark: | | :white_check_mark: | | | | | | | | +| `alpaka` | Alpaka version | :white_check_mark: | | | :white_check_mark: | | | | | | | +| `sycltest` | SYCL/oneAPI FW test | 
:heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | | | | | The "Device framework" refers to a mechanism similar to [`cms::cuda::Product`](src/cuda/CUDACore/Product.h) and [`cms::cuda::ScopedContext`](src/cuda/CUDACore/ScopedContext.h) to support chains of modules to use the same device and the same work queue. +The column "Validated" means that the program produces the same histograms as the reference `cuda` program within numerical precision (judged "by eye"). + ## Quick recipe ```bash @@ -126,11 +151,11 @@ The printouts can be disabled with `-DFWTEST_SILENT` build flag (e.g. `make ... #### `cuda` -This program is frozen to correspond to CMSSW_11_1_0_pre4. +This program is frozen to correspond to CMSSW_11_2_0_pre8_Patatrack. #### `cudadev` -This program currently contains code from CMSSW_11_2_0_pre8_Patatrack. +This program currently contains code from CMSSW_11_2_0_pre8_Patatrack (currently equivalent to `cuda`). #### `cudauvm` diff --git a/src/cuda/CUDACore/AtomicPairCounter.h b/src/cuda/CUDACore/AtomicPairCounter.h index 9864493ad..19b2781e0 100644 --- a/src/cuda/CUDACore/AtomicPairCounter.h +++ b/src/cuda/CUDACore/AtomicPairCounter.h @@ -5,48 +5,54 @@ #include "CUDACore/cudaCompat.h" -class AtomicPairCounter { -public: - using c_type = unsigned long long int; +namespace cms { + namespace cuda { - AtomicPairCounter() {} - AtomicPairCounter(c_type i) { counter.ac = i; } + class AtomicPairCounter { + public: + using c_type = unsigned long long int; - __device__ __host__ AtomicPairCounter& operator=(c_type i) { - counter.ac = i; - return *this; - } + AtomicPairCounter() {} + AtomicPairCounter(c_type i) { counter.ac = i; } - struct Counters { - uint32_t n; // in a "One to Many" association is the number of "One" - uint32_t m; // in a "One to Many" association is the total number of associations - }; + __device__ __host__ AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } - union Atomic2 { - Counters counters; - c_type ac; - }; + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; - static constexpr c_type incr = 1UL << 32; + union Atomic2 { + Counters counters; + c_type ac; + }; - __device__ __host__ Counters get() const { return counter.counters; } + static constexpr c_type incr = 1UL << 32; - // increment n by 1 and m by i. return previous value - __host__ __device__ __forceinline__ Counters add(uint32_t i) { - c_type c = i; - c += incr; - Atomic2 ret; + __device__ __host__ Counters get() const { return counter.counters; } + + // increment n by 1 and m by i. 
return previous value + __host__ __device__ __forceinline__ Counters add(uint32_t i) { + c_type c = i; + c += incr; + Atomic2 ret; #ifdef __CUDA_ARCH__ - ret.ac = atomicAdd(&counter.ac, c); + ret.ac = atomicAdd(&counter.ac, c); #else - ret.ac = counter.ac; - counter.ac += c; + ret.ac = counter.ac; + counter.ac += c; #endif - return ret.counters; - } + return ret.counters; + } + + private: + Atomic2 counter; + }; -private: - Atomic2 counter; -}; + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/src/cuda/CUDACore/CUDAHostAllocator.h b/src/cuda/CUDACore/CUDAHostAllocator.h deleted file mode 100644 index 1cd290f13..000000000 --- a/src/cuda/CUDACore/CUDAHostAllocator.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_CUDAHostAllocator_h -#define HeterogeneousCore_CUDAUtilities_CUDAHostAllocator_h - -#include -#include -#include - -class cuda_bad_alloc : public std::bad_alloc { -public: - cuda_bad_alloc(cudaError_t error) noexcept : error_(error) {} - - const char* what() const noexcept override { return cudaGetErrorString(error_); } - -private: - cudaError_t error_; -}; - -template -class CUDAHostAllocator { -public: - using value_type = T; - - template - struct rebind { - using other = CUDAHostAllocator; - }; - - T* allocate(std::size_t n) const __attribute__((warn_unused_result)) __attribute__((malloc)) - __attribute__((returns_nonnull)) { - void* ptr = nullptr; - cudaError_t status = cudaMallocHost(&ptr, n * sizeof(T), FLAGS); - if (status != cudaSuccess) { - throw cuda_bad_alloc(status); - } - if (ptr == nullptr) { - throw std::bad_alloc(); - } - return static_cast(ptr); - } - - void deallocate(T* p, std::size_t n) const { - cudaError_t status = cudaFreeHost(p); - if (status != cudaSuccess) { - throw cuda_bad_alloc(status); - } - } -}; - -#endif // HeterogeneousCore_CUDAUtilities_CUDAHostAllocator_h diff --git a/src/cuda/CUDACore/CachingDeviceAllocator.h b/src/cuda/CUDACore/CachingDeviceAllocator.h index 075d568f2..50c1ebdb2 100644 --- a/src/cuda/CUDACore/CachingDeviceAllocator.h +++ b/src/cuda/CUDACore/CachingDeviceAllocator.h @@ -41,9 +41,10 @@ #include #include #include +#include -#include -#include +#include "CUDACore/cudaCheck.h" +#include "CUDACore/deviceAllocatorStatus.h" /// CUB namespace namespace notcub { @@ -122,6 +123,7 @@ namespace notcub { struct BlockDescriptor { void *d_ptr; // Device pointer size_t bytes; // Size of allocation in bytes + size_t bytesRequested; // CMS: requested allocation size (for monitoring only) unsigned int bin; // Bin enumeration int device; // device ordinal cudaStream_t associated_stream; // Associated associated_stream @@ -129,12 +131,19 @@ // Constructor (suitable for searching maps for a specific block, given its pointer and device) BlockDescriptor(void *d_ptr, int device) - : d_ptr(d_ptr), bytes(0), bin(INVALID_BIN), device(device), associated_stream(nullptr), ready_event(nullptr) {} + : d_ptr(d_ptr), + bytes(0), + bytesRequested(0), // CMS + bin(INVALID_BIN), + device(device), + associated_stream(nullptr), + ready_event(nullptr) {} // Constructor (suitable for searching maps for a range of suitable blocks, given a device) BlockDescriptor(int device) : d_ptr(nullptr), bytes(0), + bytesRequested(0), // CMS bin(INVALID_BIN), device(device), associated_stream(nullptr), ready_event(nullptr) {} @@ -160,12 +169,7 @@ namespace notcub { /// BlockDescriptor comparator function interface typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); - 
class TotalBytes { - public: - size_t free; - size_t live; - TotalBytes() { free = live = 0; } - }; + // CMS: Moved TotalBytes to deviceAllocatorStatus.h /// Set type for cached blocks (ordered by size) typedef std::multiset CachedBlocks; @@ -174,7 +178,8 @@ namespace notcub { typedef std::multiset BusyBlocks; /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; + // CMS: Moved definition to deviceAllocatorStatus.h + using GpuCachedBytes = cms::cuda::allocator::GpuCachedBytes; //--------------------------------------------------------------------- // Utility functions @@ -219,7 +224,8 @@ namespace notcub { // Fields //--------------------------------------------------------------------- - cub::Mutex mutex; /// Mutex for thread-safety + // CMS: use std::mutex instead of cub::Mutex, declare mutable + mutable std::mutex mutex; /// Mutex for thread-safety unsigned int bin_growth; /// Geometric growth factor for bin-sizes unsigned int min_bin; /// Minimum bin enumeration @@ -298,17 +304,19 @@ namespace notcub { */ cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) { // Lock - mutex.Lock(); + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex); if (debug) - _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", - (long long)this->max_cached_bytes, - (long long)max_cached_bytes); + // CMS: use raw printf + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); this->max_cached_bytes = max_cached_bytes; - // Unlock - mutex.Unlock(); + // Unlock (redundant, kept for style uniformity) + mutex_locker.unlock(); return cudaSuccess; } @@ -326,19 +334,22 @@ namespace notcub { size_t bytes, ///< [in] Minimum number of bytes for the allocation cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation { + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex, std::defer_lock); *d_ptr = nullptr; int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); device = entrypoint_device; } // Create a block descriptor for the requested allocation bool found = false; BlockDescriptor search_key(device); + search_key.bytesRequested = bytes; // CMS search_key.associated_stream = active_stream; NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); @@ -350,7 +361,7 @@ namespace notcub { search_key.bytes = bytes; } else { // Search for a suitable cached allocation: lock - mutex.Lock(); + mutex_locker.lock(); if (search_key.bin < min_bin) { // Bin is less than minimum bin: round up @@ -376,10 +387,12 @@ namespace notcub { // Remove from free blocks cached_bytes[device].free -= search_key.bytes; cached_bytes[device].live += search_key.bytes; + cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS if (debug) // CMS: improved debug message - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously " "associated with stream %lld, event %lld).\n", device, @@ -398,24 +411,25 @@ namespace notcub { } // Done searching: unlock - mutex.Unlock(); + mutex_locker.unlock(); } // Allocate the block if necessary if (!found) { // Set runtime's current device to 
specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + cudaCheck(error = cudaSetDevice(device)); } // Attempt to allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { + // CMS: silently ignore errors and retry or pass them to the caller + if ((error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry if (debug) - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", device, (long long)search_key.bytes, @@ -425,7 +439,7 @@ namespace notcub { cudaGetLastError(); // Reset CUDART's error // Lock - mutex.Lock(); + mutex_locker.lock(); // Iterate the range of free blocks on the same device BlockDescriptor free_key(device); @@ -437,16 +451,18 @@ namespace notcub { // on the current device // Free device memory and destroy stream event. - if (CubDebug(error = cudaFree(block_itr->d_ptr))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaFree(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) + if ((error = cudaEventDestroy(block_itr->ready_event))) break; // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; if (debug) - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " "(%lld bytes) outstanding.\n", device, @@ -462,41 +478,42 @@ namespace notcub { } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); // Return under error if (error) return error; // Try to allocate again - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)); } // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); // Insert into live blocks - mutex.Lock(); + mutex_locker.lock(); live_blocks.insert(search_key); cached_bytes[device].live += search_key.bytes; - mutex.Unlock(); + cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS + mutex_locker.unlock(); if (debug) // CMS: improved debug message - _CubLog( - "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n", - device, - search_key.d_ptr, - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event); + // CMS: use raw printf + printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n", + device, + search_key.d_ptr, + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.ready_event); // Attempt to revert back to previous device if necessary if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + // CMS: throw exception on error 
+ cudaCheck(error = cudaSetDevice(entrypoint_device)); } } @@ -504,11 +521,12 @@ namespace notcub { *d_ptr = search_key.d_ptr; if (debug) - _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long)cached_blocks.size(), - (long long)cached_bytes[device].free, - (long long)live_blocks.size(), - (long long)cached_bytes[device].live); + // CMS: use raw printf + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); return error; } @@ -538,15 +556,17 @@ namespace notcub { cudaError_t DeviceFree(int device, void *d_ptr) { int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex, std::defer_lock); if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); device = entrypoint_device; } // Lock - mutex.Lock(); + mutex_locker.lock(); // Find corresponding block descriptor bool recached = false; @@ -557,6 +577,7 @@ namespace notcub { search_key = *block_itr; live_blocks.erase(block_itr); cached_bytes[device].live -= search_key.bytes; + cached_bytes[device].liveRequested -= search_key.bytesRequested; // CMS // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { @@ -567,7 +588,8 @@ namespace notcub { if (debug) // CMS: improved debug message - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", device, @@ -584,31 +606,29 @@ namespace notcub { // First set to specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + cudaCheck(error = cudaSetDevice(device)); } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)); } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); if (!recached) { // Free the allocation from the runtime and cleanup the event. 
- if (CubDebug(error = cudaFree(d_ptr))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaFree(d_ptr)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); if (debug) // CMS: improved debug message - _CubLog( + printf( "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", device, @@ -624,8 +644,8 @@ namespace notcub { // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; @@ -647,8 +667,8 @@ namespace notcub { cudaError_t error = cudaSuccess; int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; - - mutex.Lock(); + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex); while (!cached_blocks.empty()) { // Get first block @@ -656,28 +676,31 @@ namespace notcub { // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaGetDevice(&entrypoint_device))) break; } // Set current device ordinal if necessary if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaSetDevice(begin->device))) break; current_device = begin->device; } // Free device memory - if (CubDebug(error = cudaFree(begin->d_ptr))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaFree(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) + if ((error = cudaEventDestroy(begin->ready_event))) break; // Reduce balance and erase entry cached_bytes[current_device].free -= begin->bytes; if (debug) - _CubLog( + printf( "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", current_device, @@ -690,21 +713,28 @@ namespace notcub { cached_blocks.erase(begin); } - mutex.Unlock(); + mutex_locker.unlock(); // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; } + // CMS: give access to cache allocation status + GpuCachedBytes CacheStatus() const { + std::unique_lock mutex_locker(mutex); + return cached_bytes; + } + /** * \brief Destructor */ - virtual ~CachingDeviceAllocator() { + // CMS: make the destructor not virtual + ~CachingDeviceAllocator() { if (!skip_cleanup) FreeAllCached(); } diff --git a/src/cuda/CUDACore/CachingHostAllocator.h b/src/cuda/CUDACore/CachingHostAllocator.h index 53901e1f1..a206b2da1 100644 --- a/src/cuda/CUDACore/CachingHostAllocator.h +++ b/src/cuda/CUDACore/CachingHostAllocator.h @@ -41,9 +41,9 @@ #include #include #include +#include -#include -#include +#include "CUDACore/cudaCheck.h" /// CUB namespace namespace notcub { @@ -212,7 +212,7 @@ namespace notcub { // Fields 
//--------------------------------------------------------------------- - cub::Mutex mutex; /// Mutex for thread-safety + std::mutex mutex; /// Mutex for thread-safety unsigned int bin_growth; /// Geometric growth factor for bin-sizes unsigned int min_bin; /// Minimum bin enumeration @@ -291,17 +291,17 @@ namespace notcub { */ void SetMaxCachedBytes(size_t max_cached_bytes) { // Lock - mutex.Lock(); + std::unique_lock mutex_locker(mutex); if (debug) - _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", - (long long)this->max_cached_bytes, - (long long)max_cached_bytes); + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); this->max_cached_bytes = max_cached_bytes; - // Unlock - mutex.Unlock(); + // Unlock (redundant, kept for style uniformity) + mutex_locker.unlock(); } /** @@ -314,12 +314,12 @@ namespace notcub { size_t bytes, ///< [in] Minimum number of bytes for the allocation cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation { + std::unique_lock mutex_locker(mutex, std::defer_lock); *d_ptr = nullptr; int device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; - if (CubDebug(error = cudaGetDevice(&device))) - return error; + cudaCheck(error = cudaGetDevice(&device)); // Create a block descriptor for the requested allocation bool found = false; @@ -336,7 +336,7 @@ namespace notcub { search_key.bytes = bytes; } else { // Search for a suitable cached allocation: lock - mutex.Lock(); + mutex_locker.lock(); if (search_key.bin < min_bin) { // Bin is less than minimum bin: round up @@ -356,14 +356,10 @@ namespace notcub { search_key.associated_stream = active_stream; if (search_key.device != device) { // If "associated" device changes, need to re-create the event on the right device - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; + cudaCheck(error = cudaSetDevice(search_key.device)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); + cudaCheck(error = cudaSetDevice(device)); + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); search_key.device = device; } @@ -374,7 +370,7 @@ namespace notcub { cached_bytes.live += search_key.bytes; if (debug) - _CubLog( + printf( "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld " "(previously associated with stream %lld, event %lld).\n", search_key.d_ptr, @@ -393,18 +389,18 @@ namespace notcub { } // Done searching: unlock - mutex.Unlock(); + mutex_locker.unlock(); } // Allocate the block if necessary if (!found) { // Attempt to allocate // TODO: eventually support allocation flags - if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == + if ((error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry if (debug) - _CubLog( + printf( "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached " "allocations", (long long)search_key.bytes, @@ -415,7 +411,7 @@ namespace notcub { cudaGetLastError(); // Reset CUDART's error // Lock - mutex.Lock(); + 
mutex_locker.lock(); // Iterate the range of free blocks CachedBlocks::iterator block_itr = cached_blocks.begin(); @@ -426,16 +422,16 @@ namespace notcub { // on the current device // Free pinned host memory. - if (CubDebug(error = cudaFreeHost(block_itr->d_ptr))) + if ((error = cudaFreeHost(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) + if ((error = cudaEventDestroy(block_itr->ready_event))) break; // Reduce balance and erase entry cached_bytes.free -= block_itr->bytes; if (debug) - _CubLog( + printf( "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", (long long)block_itr->bytes, @@ -450,29 +446,27 @@ namespace notcub { } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); // Return under error if (error) return error; // Try to allocate again - if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault))) - return error; + cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)); } // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); // Insert into live blocks - mutex.Lock(); + mutex_locker.lock(); live_blocks.insert(search_key); cached_bytes.live += search_key.bytes; - mutex.Unlock(); + mutex_locker.unlock(); if (debug) - _CubLog( + printf( "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device " "%lld).\n", search_key.d_ptr, @@ -486,11 +480,11 @@ namespace notcub { *d_ptr = search_key.d_ptr; if (debug) - _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); return error; } @@ -505,7 +499,7 @@ namespace notcub { cudaError_t error = cudaSuccess; // Lock - mutex.Lock(); + std::unique_lock mutex_locker(mutex); // Find corresponding block descriptor bool recached = false; @@ -525,7 +519,7 @@ namespace notcub { cached_bytes.free += search_key.bytes; if (debug) - _CubLog( + printf( "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld " "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", (long long)search_key.bytes, @@ -539,31 +533,26 @@ namespace notcub { } } - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; + cudaCheck(error = cudaGetDevice(&entrypoint_device)); if (entrypoint_device != search_key.device) { - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; + cudaCheck(error = cudaSetDevice(search_key.device)); } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) - return error; + cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)); } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); if (!recached) { // Free the allocation from the runtime and cleanup the event. 
- if (CubDebug(error = cudaFreeHost(d_ptr))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; + cudaCheck(error = cudaFreeHost(d_ptr)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); if (debug) - _CubLog( + printf( "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", (long long)search_key.bytes, @@ -578,8 +567,7 @@ namespace notcub { // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; @@ -593,7 +581,7 @@ namespace notcub { int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; - mutex.Lock(); + std::unique_lock mutex_locker(mutex); while (!cached_blocks.empty()) { // Get first block @@ -601,28 +589,28 @@ namespace notcub { // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + if ((error = cudaGetDevice(&entrypoint_device))) break; } // Set current device ordinal if necessary if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) + if ((error = cudaSetDevice(begin->device))) break; current_device = begin->device; } // Free host memory - if (CubDebug(error = cudaFreeHost(begin->d_ptr))) + if ((error = cudaFreeHost(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) + if ((error = cudaEventDestroy(begin->ready_event))) break; // Reduce balance and erase entry cached_bytes.free -= begin->bytes; if (debug) - _CubLog( + printf( "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", (long long)begin->bytes, @@ -634,12 +622,11 @@ namespace notcub { cached_blocks.erase(begin); } - mutex.Unlock(); + mutex_locker.unlock(); // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; diff --git a/src/cuda/CUDACore/GPUSimpleVector.h b/src/cuda/CUDACore/GPUSimpleVector.h deleted file mode 100644 index 7acea74d9..000000000 --- a/src/cuda/CUDACore/GPUSimpleVector.h +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h -#define HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h - -// author: Felice Pantaleo, CERN, 2018 - -#include -#include - -#include "CUDACore/cudaCompat.h" - -namespace GPU { - template - struct SimpleVector { - constexpr SimpleVector() = default; - - // ownership of m_data stays within the caller - constexpr void construct(int capacity, T *data) { - m_size = 0; - m_capacity = capacity; - m_data = data; - } - - inline constexpr int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } - } - - template - constexpr int emplace_back_unsafe(Ts &&... 
args) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } - } - - __device__ inline T &back() { return m_data[m_size - 1]; } - - __device__ inline const T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - template - __device__ int emplace_back(Ts &&... args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - // thread safe version of resize - __device__ int extend(int size = 1) { - auto previousSize = atomicAdd(&m_size, size); - if (previousSize < m_capacity) { - return previousSize; - } else { - atomicSub(&m_size, size); - return -1; - } - } - - __device__ int shrink(int size = 1) { - auto previousSize = atomicSub(&m_size, size); - if (previousSize >= size) { - return previousSize - size; - } else { - atomicAdd(&m_size, size); - return -1; - } - } - - inline constexpr bool empty() const { return m_size <= 0; } - inline constexpr bool full() const { return m_size >= m_capacity; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline constexpr int size() const { return m_size; } - inline constexpr int capacity() const { return m_capacity; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr void set_data(T *data) { m_data = data; } - - private: - int m_size; - int m_capacity; - - T *m_data; - }; - - // ownership of m_data stays within the caller - template - SimpleVector make_SimpleVector(int capacity, T *data) { - SimpleVector ret; - ret.construct(capacity, data); - return ret; - } - - // ownership of m_data stays within the caller - template - SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { - auto ret = new (mem) SimpleVector(); - ret->construct(capacity, data); - return ret; - } - -} // namespace GPU - -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h diff --git a/src/cuda/CUDACore/GPUVecArray.h b/src/cuda/CUDACore/GPUVecArray.h deleted file mode 100644 index 79fdfa604..000000000 --- a/src/cuda/CUDACore/GPUVecArray.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h -#define HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h - -// -// Author: Felice Pantaleo, CERN -// - -#include "CUDACore/cudaCompat.h" - -namespace GPU { - - template - class VecArray { - public: - using self = VecArray; - using value_t = T; - - inline constexpr int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } - } - - template - constexpr int emplace_back_unsafe(Ts &&... 
args) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } - } - - inline constexpr T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - template - __device__ int emplace_back(Ts &&... args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - inline constexpr T pop_back() { - if (m_size > 0) { - auto previousSize = m_size--; - return m_data[previousSize - 1]; - } else - return T(); - } - - inline constexpr T const *begin() const { return m_data; } - inline constexpr T const *end() const { return m_data + m_size; } - inline constexpr T *begin() { return m_data; } - inline constexpr T *end() { return m_data + m_size; } - inline constexpr int size() const { return m_size; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline static constexpr int capacity() { return maxSize; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr bool empty() const { return 0 == m_size; } - inline constexpr bool full() const { return maxSize == m_size; } - - private: - T m_data[maxSize]; - - int m_size; - }; - -} // namespace GPU - -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h diff --git a/src/cuda/CUDACore/HistoContainer.h b/src/cuda/CUDACore/HistoContainer.h index 5ff12f387..c2ac3308d 100644 --- a/src/cuda/CUDACore/HistoContainer.h +++ b/src/cuda/CUDACore/HistoContainer.h @@ -9,10 +9,6 @@ #include #include -#ifdef __CUDACC__ -#include -#endif - #include "CUDACore/AtomicPairCounter.h" #include "CUDACore/cudaCheck.h" #include "CUDACore/cuda_assert.h" @@ -55,54 +51,52 @@ namespace cms { } template - inline void launchZero(Histo *__restrict__ h, - cudaStream_t stream + inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif ) { - uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); + assert(size >= int(sizeof(uint32_t) * Histo::totbins())); #ifdef __CUDACC__ - cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream)); + cudaCheck(cudaMemsetAsync(poff, 0, size, stream)); #else - ::memset(off, 0, 4 * Histo::totbins()); + ::memset(poff, 0, size); #endif } template - inline void launchFinalize(Histo *__restrict__ h, - uint8_t *__restrict__ ws + inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault -#endif - , - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif ) { #ifdef __CUDACC__ - assert(ws); - uint32_t *off 
= (uint32_t *)((char *)(h) + offsetof(Histo, off)); - size_t wss = Histo::wsSize(); - assert(wss > 0); - CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); + auto nthreads = 1024; + auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>( + poff, poff, Histo::totbins(), ppsws); + cudaCheck(cudaGetLastError()); #else h->finalize(); #endif } template - inline void fillManyFromVector(Histo *__restrict__ h, - uint8_t *__restrict__ ws, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - uint32_t totSize, - int nthreads, - cudaStream_t stream + inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + int nthreads, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif ) { launchZero(h, stream); @@ -110,7 +104,7 @@ auto nblocks = (totSize + nthreads - 1) / nthreads; countFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); - launchFinalize(h, ws, stream); + launchFinalize(h, stream); fillFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); #else @@ -125,216 +119,205 @@ assoc->bulkFinalizeFill(*apc); } - } // namespace cuda -} // namespace cms + // iterate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } -// iteratate over N bins left and right of the one containing "v" -template -__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins() - 1), bs + n); - bs = std::max(0, bs - n); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } -} - -// iteratate over bins containing all values in window wmin, wmax -template -__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } -} - -template -class HistoContainer { -public: - using Counter = uint32_t; - - using CountersOnly = HistoContainer; - - using index_type = I; - using UT = typename std::make_unsigned::type; - - static constexpr uint32_t ilog2(uint32_t v) { - constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; - constexpr uint32_t s[] = {1, 2, 4, 8, 16}; - - uint32_t r = 0; // result of log2(v) will go here - for (auto i = 4; i >= 0; i--) - if (v & b[i]) { - v >>= s[i]; - r |= s[i]; + // iterate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); } - return r; - } + } - static constexpr uint32_t sizeT() { return S; } - 
static constexpr uint32_t nbins() { return NBINS; } - static constexpr uint32_t nhists() { return NHISTS; } - static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } - static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } - static constexpr uint32_t capacity() { return SIZE; } + template + class HistoContainer { + public: + using Counter = uint32_t; + + using CountersOnly = HistoContainer; + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } - static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr uint32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr uint32_t capacity() { return SIZE; } - __host__ static size_t wsSize() { -#ifdef __CUDACC__ - uint32_t *v = nullptr; - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()); - return temp_storage_bytes; -#else - return 0; -#endif - } + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } - static constexpr UT bin(T t) { - constexpr uint32_t shift = sizeT() - nbits(); - constexpr uint32_t mask = (1 << nbits()) - 1; - return (t >> shift) & mask; - } + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } - __host__ __device__ void zero() { - for (auto &i : off) - i = 0; - } + __host__ __device__ void zero() { + for (auto &i : off) + i = 0; + } - __host__ __device__ void add(CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { + __host__ __device__ __forceinline__ void add(CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { #ifdef __CUDA_ARCH__ - atomicAdd(off + i, co.off[i]); + atomicAdd(off + i, co.off[i]); #else - auto &a = (std::atomic &)(off[i]); - a += co.off[i]; + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; #endif - } - } + } + } - static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { #ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); + return atomicAdd(&x, 1); #else - auto &a = (std::atomic &)(x); - return a++; + auto &a = (std::atomic &)(x); + return a++; #endif - } + } - static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { #ifdef __CUDA_ARCH__ - return atomicSub(&x, 1); + return atomicSub(&x, 1); #else - auto &a = (std::atomic &)(x); - return a--; + auto &a = (std::atomic &)(x); + return a--; #endif - } - - __host__ __device__ __forceinline__ void countDirect(T b) { - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __device__ __host__ __forceinline__ int32_t 
bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } - - __device__ __host__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } - - __device__ __host__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; - if (m >= nbins()) { // overflow! - off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } - auto first = m + blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { - off[i] = n; - } - } - - __host__ __device__ __forceinline__ void count(T t) { - uint32_t b = bin(t); - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fill(T t, index_type j) { - uint32_t b = bin(t); - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __device__ __host__ __forceinline__ void finalize(Counter *ws = nullptr) { - assert(off[totbins() - 1] == 0); - blockPrefixScan(off, totbins(), ws); - assert(off[totbins() - 1] == off[totbins() - 2]); - } - - constexpr auto size() const { return uint32_t(off[totbins() - 1]); } - constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } - - constexpr index_type const *begin() const { return bins; } - constexpr index_type const *end() const { return begin() + size(); } - - constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } - constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } - - Counter off[totbins()]; - index_type bins[capacity()]; -}; - -template -using OneToManyAssoc = HistoContainer; + } + + __host__ __device__ __forceinline__ void countDirect(T b) { + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! 
+ off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void count(T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + int32_t psws; // prefix-scan working space + index_type bins[capacity()]; + }; + + template + using OneToManyAssoc = HistoContainer; + + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/src/cuda/CUDACore/HostAllocator.h b/src/cuda/CUDACore/HostAllocator.h new file mode 100644 index 000000000..19c86e31f --- /dev/null +++ b/src/cuda/CUDACore/HostAllocator.h @@ -0,0 +1,55 @@ +#ifndef HeterogeneousCore_CUDAUtilities_HostAllocator_h +#define HeterogeneousCore_CUDAUtilities_HostAllocator_h + +#include +#include +#include + +namespace cms { + namespace cuda { + + class bad_alloc : public std::bad_alloc { + public: + bad_alloc(cudaError_t error) noexcept : error_(error) {} + + const char* what() const noexcept override { return cudaGetErrorString(error_); } + + private: + cudaError_t error_; + }; + + template + class HostAllocator { + public: + using value_type = T; + + template + struct rebind { + using other = HostAllocator; + }; + + T* allocate(std::size_t n) const __attribute__((warn_unused_result)) __attribute__((malloc)) + __attribute__((returns_nonnull)) { + void* ptr = nullptr; + cudaError_t status = cudaMallocHost(&ptr, n * sizeof(T), FLAGS); + if (status != cudaSuccess) { + throw bad_alloc(status); + } + if (ptr == nullptr) { + throw std::bad_alloc(); + } + return static_cast(ptr); + } + + void deallocate(T* p, std::size_t n) const { + cudaError_t status = cudaFreeHost(p); + if (status != cudaSuccess) { + throw bad_alloc(status); + } + } + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_HostAllocator_h diff --git a/src/cuda/CUDACore/HostProduct.h b/src/cuda/CUDACore/HostProduct.h index 542ea2e30..6aaa38e3e 100644 --- a/src/cuda/CUDACore/HostProduct.h +++ b/src/cuda/CUDACore/HostProduct.h @@ -19,7 +19,7 @@ class HostProduct { auto const& 
operator*() const { return *get(); } - auto const* operator-> () const { return get(); } + auto const* operator->() const { return get(); } private: cms::cuda::host::unique_ptr hm_ptr; //! diff --git a/src/cuda/CUDACore/ScopedContext.cc b/src/cuda/CUDACore/ScopedContext.cc index b2dc58f32..14bff04eb 100644 --- a/src/cuda/CUDACore/ScopedContext.cc +++ b/src/cuda/CUDACore/ScopedContext.cc @@ -91,7 +91,7 @@ namespace cms::cuda { ScopedContextAcquire::~ScopedContextAcquire() { holderHelper_.enqueueCallback(device(), stream()); if (contextState_) { - contextState_->set(device(), std::move(streamPtr())); + contextState_->set(device(), streamPtr()); } } diff --git a/src/cuda/CUDACore/SimpleVector.h b/src/cuda/CUDACore/SimpleVector.h new file mode 100644 index 000000000..f21f51cf8 --- /dev/null +++ b/src/cuda/CUDACore/SimpleVector.h @@ -0,0 +1,141 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h +#define HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 + +#include +#include + +#include "CUDACore/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + __device__ inline T &back() { return m_data[m_size - 1]; } + + __device__ inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... 
args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + // thread safe version of resize + __device__ int extend(int size = 1) { + auto previousSize = atomicAdd(&m_size, size); + if (previousSize < m_capacity) { + return previousSize; + } else { + atomicSub(&m_size, size); + return -1; + } + } + + __device__ int shrink(int size = 1) { + auto previousSize = atomicSub(&m_size, size); + if (previousSize >= size) { + return previousSize - size; + } else { + atomicAdd(&m_size, size); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h diff --git a/src/cuda/CUDACore/VecArray.h b/src/cuda/CUDACore/VecArray.h new file mode 100644 index 000000000..595238ecd --- /dev/null +++ b/src/cuda/CUDACore/VecArray.h @@ -0,0 +1,106 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_VecArray_h +#define HeterogeneousCore_CUDAUtilities_interface_VecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include "CUDACore/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... 
args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_VecArray_h diff --git a/src/cuda/CUDACore/copyAsync.h b/src/cuda/CUDACore/copyAsync.h index b850b9ff4..47e55c74a 100644 --- a/src/cuda/CUDACore/copyAsync.h +++ b/src/cuda/CUDACore/copyAsync.h @@ -3,13 +3,16 @@ #include "CUDACore/cudaCheck.h" #include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_noncached_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" #include namespace cms { namespace cuda { + // Single element + template inline void copyAsync(device::unique_ptr& dst, const host::unique_ptr& src, cudaStream_t stream) { // Shouldn't compile for array types because of sizeof(T), but @@ -19,6 +22,15 @@ namespace cms { cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); } + template + inline void copyAsync(device::unique_ptr& dst, const host::noncached::unique_ptr& src, cudaStream_t stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array::value == false, + "For array types, use the other overload with the size parameter"); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); + } + template inline void copyAsync(host::unique_ptr& dst, const device::unique_ptr& src, cudaStream_t stream) { static_assert(std::is_array::value == false, @@ -27,6 +39,7 @@ namespace cms { } // Multiple elements + template inline void copyAsync(device::unique_ptr& dst, const host::unique_ptr& src, @@ -35,6 +48,14 @@ namespace cms { cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); } + template + inline void copyAsync(device::unique_ptr& dst, + const host::noncached::unique_ptr& src, + size_t nelements, + cudaStream_t stream) { + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); + } + template inline void copyAsync(host::unique_ptr& dst, const device::unique_ptr& src, diff --git a/src/cuda/CUDACore/cudaCompat.cc b/src/cuda/CUDACore/cudaCompat.cc index 7a93eb2e9..e6bb8069d 100644 --- a/src/cuda/CUDACore/cudaCompat.cc +++ b/src/cuda/CUDACore/cudaCompat.cc @@ -1,13 +1,15 @@ #include "CUDACore/cudaCompat.h" -namespace cudaCompat { - 
thread_local dim3 blockIdx; - thread_local dim3 gridDim; -} // namespace cudaCompat +namespace cms { + namespace cudacompat { + thread_local dim3 blockIdx; + thread_local dim3 gridDim; + } // namespace cudacompat +} // namespace cms namespace { struct InitGrid { - InitGrid() { cudaCompat::resetGrid(); } + InitGrid() { cms::cudacompat::resetGrid(); } }; const InitGrid initGrid; diff --git a/src/cuda/CUDACore/cudaCompat.h b/src/cuda/CUDACore/cudaCompat.h index 32c7d7b48..f9b4b2f8a 100644 --- a/src/cuda/CUDACore/cudaCompat.h +++ b/src/cuda/CUDACore/cudaCompat.h @@ -13,69 +13,78 @@ #include -namespace cudaCompat { +namespace cms { + namespace cudacompat { #ifndef __CUDA_RUNTIME_H__ - struct dim3 { - uint32_t x, y, z; - }; + struct dim3 { + uint32_t x, y, z; + }; #endif - const dim3 threadIdx = {0, 0, 0}; - const dim3 blockDim = {1, 1, 1}; - - extern thread_local dim3 blockIdx; - extern thread_local dim3 gridDim; - - template - T1 atomicInc(T1* a, T2 b) { - auto ret = *a; - if ((*a) < T1(b)) - (*a)++; - return ret; - } - - template - T1 atomicAdd(T1* a, T2 b) { - auto ret = *a; - (*a) += b; - return ret; - } - - template - T1 atomicSub(T1* a, T2 b) { - auto ret = *a; - (*a) -= b; - return ret; - } - - template - T1 atomicMin(T1* a, T2 b) { - auto ret = *a; - *a = std::min(*a, T1(b)); - return ret; - } - template - T1 atomicMax(T1* a, T2 b) { - auto ret = *a; - *a = std::max(*a, T1(b)); - return ret; - } - - inline void __syncthreads() {} - inline void __threadfence() {} - inline bool __syncthreads_or(bool x) { return x; } - inline bool __syncthreads_and(bool x) { return x; } - template - inline T __ldg(T const* x) { - return *x; - } - - inline void resetGrid() { - blockIdx = {0, 0, 0}; - gridDim = {1, 1, 1}; - } - -} // namespace cudaCompat + const dim3 threadIdx = {0, 0, 0}; + const dim3 blockDim = {1, 1, 1}; + + extern thread_local dim3 blockIdx; + extern thread_local dim3 gridDim; + + template + T1 atomicCAS(T1* address, T1 compare, T2 val) { + T1 old = *address; + *address = old == compare ? val : old; + return old; + } + + template + T1 atomicInc(T1* a, T2 b) { + auto ret = *a; + if ((*a) < T1(b)) + (*a)++; + return ret; + } + + template + T1 atomicAdd(T1* a, T2 b) { + auto ret = *a; + (*a) += b; + return ret; + } + + template + T1 atomicSub(T1* a, T2 b) { + auto ret = *a; + (*a) -= b; + return ret; + } + + template + T1 atomicMin(T1* a, T2 b) { + auto ret = *a; + *a = std::min(*a, T1(b)); + return ret; + } + template + T1 atomicMax(T1* a, T2 b) { + auto ret = *a; + *a = std::max(*a, T1(b)); + return ret; + } + + inline void __syncthreads() {} + inline void __threadfence() {} + inline bool __syncthreads_or(bool x) { return x; } + inline bool __syncthreads_and(bool x) { return x; } + template + inline T __ldg(T const* x) { + return *x; + } + + inline void resetGrid() { + blockIdx = {0, 0, 0}; + gridDim = {1, 1, 1}; + } + + } // namespace cudacompat +} // namespace cms // some not needed as done by cuda runtime... 
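// Editor's sketch (not part of the patch): what the re-namespaced cms::cudacompat
// layer buys. When compiled without nvcc, threadIdx stays {0,0,0}, blockDim
// {1,1,1}, and the atomic emulations above are plain read-modify-writes, so a
// grid-stride "kernel" degenerates to a serial loop and can be exercised on the
// host unchanged. The kernel body below is illustrative, not taken from the
// patch; linking needs cudaCompat.cc for the thread_local definitions.
#include <cstdint>
#include <cstdio>

#include "CUDACore/cudaCompat.h"  // on the host this pulls in 'using namespace cms::cudacompat;'

void histo4(uint32_t const* v, uint32_t n, uint32_t* hist) {
  // the same source would run as a __global__ kernel under __CUDA_ARCH__
  auto first = blockIdx.x * blockDim.x + threadIdx.x;       // == 0 on the host
  for (auto i = first; i < n; i += gridDim.x * blockDim.x)  // stride == 1 on the host
    atomicAdd(&hist[v[i] & 3], 1u);                         // ordinary increment on the host
}

int main() {
  uint32_t v[] = {0, 1, 1, 2, 3, 3, 3};
  uint32_t hist[4] = {};
  histo4(v, sizeof(v) / sizeof(v[0]), hist);
  std::printf("%u %u %u %u\n", hist[0], hist[1], hist[2], hist[3]);  // 1 2 1 3
}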
#ifndef __CUDA_RUNTIME_H__ @@ -95,7 +104,7 @@ namespace cudaCompat { #endif #ifndef __CUDA_ARCH__ -using namespace cudaCompat; +using namespace cms::cudacompat; #endif #endif diff --git a/src/cuda/CUDACore/deviceAllocatorStatus.cc b/src/cuda/CUDACore/deviceAllocatorStatus.cc new file mode 100644 index 000000000..5d4a0ca09 --- /dev/null +++ b/src/cuda/CUDACore/deviceAllocatorStatus.cc @@ -0,0 +1,7 @@ +#include "CUDACore/deviceAllocatorStatus.h" + +#include "getCachingDeviceAllocator.h" + +namespace cms::cuda { + allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); } +} // namespace cms::cuda diff --git a/src/cuda/CUDACore/deviceAllocatorStatus.h b/src/cuda/CUDACore/deviceAllocatorStatus.h new file mode 100644 index 000000000..92f9f87e8 --- /dev/null +++ b/src/cuda/CUDACore/deviceAllocatorStatus.h @@ -0,0 +1,23 @@ +#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h +#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h + +#include + +namespace cms { + namespace cuda { + namespace allocator { + struct TotalBytes { + size_t free; + size_t live; + size_t liveRequested; // CMS: monitor also requested amount + TotalBytes() { free = live = liveRequested = 0; } + }; + /// Map type of device ordinals to the number of cached bytes cached by each device + using GpuCachedBytes = std::map; + } // namespace allocator + + allocator::GpuCachedBytes deviceAllocatorStatus(); + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cuda/CUDACore/getCachingDeviceAllocator.h b/src/cuda/CUDACore/getCachingDeviceAllocator.h index f959a50f5..00cc0af54 100644 --- a/src/cuda/CUDACore/getCachingDeviceAllocator.h +++ b/src/cuda/CUDACore/getCachingDeviceAllocator.h @@ -1,6 +1,9 @@ #ifndef HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator #define HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator +#include +#include + #include "CUDACore/cudaCheck.h" #include "CUDACore/deviceCount.h" #include "CachingDeviceAllocator.h" @@ -9,11 +12,11 @@ namespace cms::cuda::allocator { // Use caching or not constexpr bool useCaching = true; // Growth factor (bin_growth in cub::CachingDeviceAllocator - constexpr unsigned int binGrowth = 8; + constexpr unsigned int binGrowth = 2; // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator - constexpr unsigned int minBin = 1; + constexpr unsigned int minBin = 8; // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. - constexpr unsigned int maxBin = 10; + constexpr unsigned int maxBin = 30; // Total storage for the allocator. 0 means no limit. constexpr size_t maxCachedBytes = 0; // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. 
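// Editor's sketch (not part of the patch): what the new caching-allocator bin
// settings mean in practice. A request is rounded up to the next bin boundary
// binGrowth^k, so switching from binGrowth=8, minBin=1, maxBin=10
// (8 B ... 8^10 = 1 GiB) to binGrowth=2, minBin=8, maxBin=30
// (256 B ... 2^30 = 1 GiB) keeps the same maximum size while cutting the
// worst-case over-allocation per request from 8x to 2x.
#include <cstddef>
#include <cstdio>

// round a request up to the smallest bin growth^k with minBin <= k <= maxBin;
// returns 0 when the request exceeds the largest bin (such allocations fail)
std::size_t binnedSize(std::size_t bytes, unsigned growth, unsigned minBin, unsigned maxBin) {
  std::size_t bin = 1;
  for (unsigned k = 0; k < minBin; ++k)
    bin *= growth;
  for (unsigned k = minBin; k <= maxBin; ++k, bin *= growth)
    if (bytes <= bin)
      return bin;
  return 0;
}

int main() {
  const std::size_t request = 5 * 1024 * 1024;  // a 5 MiB device allocation
  std::printf("old bins: %zu -> %zu bytes\n", request, binnedSize(request, 8, 1, 10));  // 16 MiB
  std::printf("new bins: %zu -> %zu bytes\n", request, binnedSize(request, 2, 8, 30));  // 8 MiB
}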
@@ -39,6 +42,27 @@ namespace cms::cuda::allocator { } inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() { + if (debug) { + std::cout << "cub::CachingDeviceAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + // the public interface is thread safe static notcub::CachingDeviceAllocator allocator{binGrowth, minBin, diff --git a/src/cuda/CUDACore/getCachingHostAllocator.h b/src/cuda/CUDACore/getCachingHostAllocator.h index 601374685..d6dba9a4a 100644 --- a/src/cuda/CUDACore/getCachingHostAllocator.h +++ b/src/cuda/CUDACore/getCachingHostAllocator.h @@ -1,6 +1,9 @@ #ifndef HeterogeneousCore_CUDACore_src_getCachingHostAllocator #define HeterogeneousCore_CUDACore_src_getCachingHostAllocator +#include +#include + #include "CUDACore/cudaCheck.h" #include "CachingHostAllocator.h" @@ -8,13 +11,34 @@ namespace cms::cuda::allocator { inline notcub::CachingHostAllocator& getCachingHostAllocator() { + if (debug) { + std::cout << "cub::CachingHostAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + // the public interface is thread safe static notcub::CachingHostAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; + minBin, + maxBin, + minCachedBytes(), + false, // do not skip cleanup + debug}; return allocator; } } // namespace cms::cuda::allocator diff --git a/src/cuda/CUDACore/prefixScan.h b/src/cuda/CUDACore/prefixScan.h index f74154a17..5624af03f 100644 --- a/src/cuda/CUDACore/prefixScan.h +++ b/src/cuda/CUDACore/prefixScan.h @@ -7,6 +7,7 @@ #include "CUDACore/cuda_assert.h" #ifdef __CUDA_ARCH__ + template __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { // ci and co may be the same @@ -20,10 +21,7 @@ __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __re } co[i] = x; } -#endif -//same as above may remove -#ifdef __CUDA_ARCH__ template __device__ void __forceinline__ 
warpPrefixScan(T* c, uint32_t i, uint32_t mask) { auto x = c[i]; @@ -36,134 +34,155 @@ __device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) } c[i] = x; } + #endif -// limited to 32*32 elements.... -template -__device__ __host__ void __forceinline__ blockPrefixScan(T const* __restrict__ ci, - T* __restrict__ co, - uint32_t size, - T* ws +namespace cms { + namespace cuda { + + // limited to 32*32 elements.... + template + __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci, + VT* co, + uint32_t size, + T* ws #ifndef __CUDA_ARCH__ - = nullptr + = nullptr #endif -) { + ) { #ifdef __CUDA_ARCH__ - assert(ws); - assert(size <= 1024); - assert(0 == blockDim.x % 32); - auto first = threadIdx.x; - auto mask = __ballot_sync(0xffffffff, first < size); - - for (auto i = first; i < size; i += blockDim.x) { - warpPrefixScan(ci, co, i, mask); - auto laneId = threadIdx.x & 0x1f; - auto warpId = i / 32; - assert(warpId < 32); - if (31 == laneId) - ws[warpId] = co[i]; - mask = __ballot_sync(mask, i + blockDim.x < size); - } - __syncthreads(); - if (size <= 32) - return; - if (threadIdx.x < 32) - warpPrefixScan(ws, threadIdx.x, 0xffffffff); - __syncthreads(); - for (auto i = first + 32; i < size; i += blockDim.x) { - auto warpId = i / 32; - co[i] += ws[warpId - 1]; - } - __syncthreads(); + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(ci, co, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = co[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + co[i] += ws[warpId - 1]; + } + __syncthreads(); #else - co[0] = ci[0]; - for (uint32_t i = 1; i < size; ++i) - co[i] = ci[i] + co[i - 1]; + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; #endif -} - -// same as above, may remove -// limited to 32*32 elements.... -template -__device__ __host__ void __forceinline__ blockPrefixScan(T* c, - uint32_t size, - T* ws + } + + // same as above, may remove + // limited to 32*32 elements.... 
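// Editor's note (not part of the patch): the blockPrefixScan overloads in this
// header compute an INCLUSIVE scan, co[i] = ci[0] + ... + ci[i]. On the device
// the per-warp scans are stitched together through the 32-entry workspace 'ws'
// (one partial sum per warp), which is where the 32*32 = 1024 element limit
// comes from; blockDim.x must also be a multiple of 32. The host fallback is
// the serial recurrence below, useful as a reference in unit tests.
#include <cassert>
#include <cstdint>

template <typename T>
void blockPrefixScanHostRef(T const* ci, T* co, uint32_t size) {
  assert(size > 0);
  co[0] = ci[0];
  for (uint32_t i = 1; i < size; ++i)
    co[i] = ci[i] + co[i - 1];  // inclusive scan
}

int main() {
  int in[5] = {1, 2, 3, 4, 5};
  int out[5];
  blockPrefixScanHostRef(in, out, 5);
  assert(out[0] == 1 && out[4] == 15);  // 1 3 6 10 15
}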
+ template + __host__ __device__ __forceinline__ void blockPrefixScan(T* c, + uint32_t size, + T* ws #ifndef __CUDA_ARCH__ - = nullptr + = nullptr #endif -) { + ) { #ifdef __CUDA_ARCH__ - assert(ws); - assert(size <= 1024); - assert(0 == blockDim.x % 32); - auto first = threadIdx.x; - auto mask = __ballot_sync(0xffffffff, first < size); - - for (auto i = first; i < size; i += blockDim.x) { - warpPrefixScan(c, i, mask); - auto laneId = threadIdx.x & 0x1f; - auto warpId = i / 32; - assert(warpId < 32); - if (31 == laneId) - ws[warpId] = c[i]; - mask = __ballot_sync(mask, i + blockDim.x < size); - } - __syncthreads(); - if (size <= 32) - return; - if (threadIdx.x < 32) - warpPrefixScan(ws, threadIdx.x, 0xffffffff); - __syncthreads(); - for (auto i = first + 32; i < size; i += blockDim.x) { - auto warpId = i / 32; - c[i] += ws[warpId - 1]; - } - __syncthreads(); + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(c, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + __syncthreads(); #else - for (uint32_t i = 1; i < size; ++i) - c[i] += c[i - 1]; + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; #endif -} - -// limited to 1024*1024 elements.... -template -__global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { - __shared__ T ws[32]; - // first each block does a scan of size 1024; (better be enough blocks....) - assert(1024 * gridDim.x >= size); - int off = 1024 * blockIdx.x; - if (size - off > 0) - blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); - - // count blocks that finished - __shared__ bool isLastBlockDone; - if (0 == threadIdx.x) { - auto value = atomicAdd(pc, 1); // block counter - isLastBlockDone = (value == (int(gridDim.x) - 1)); - } + } - __syncthreads(); - - if (!isLastBlockDone) - return; - - // good each block has done its work and now we are left in last block +#ifdef __CUDA_ARCH__ + // see https://stackoverflow.com/questions/40021086/can-i-obtain-the-amount-of-allocated-dynamic-shared-memory-from-within-a-kernel/40021087#40021087 + __device__ __forceinline__ unsigned dynamic_smem_size() { + unsigned ret; + asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret)); + return ret; + } +#endif - // let's get the partial sums from each block - __shared__ T psum[1024]; - for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { - auto j = 1024 * i + 1023; - psum[i] = (j < size) ? co[j] : T(0); - } - __syncthreads(); - blockPrefixScan(psum, psum, gridDim.x, ws); - - // now it would have been handy to have the other blocks around... - int first = threadIdx.x; // + blockDim.x * blockIdx.x - for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { - auto k = i / 1024; // block - co[i] += psum[k - 1]; - } -} + // in principle not limited.... 
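// Editor's sketch (not part of the patch): how the reworked multiBlockPrefixScan
// below is meant to be launched. The per-block partial sums (psum) now live in
// dynamic shared memory, one T per block, so the launch must pass
// sizeof(T) * blocks as the third <<<>>> parameter (that is what the
// dynamic_smem_size() assert checks), and the block counter *pc must be a
// zero-initialized device int. The block count is still bounded by
// blockPrefixScan's 1024-element limit, since the last block scans psum with it.
// Buffer names and sizes here are made up for the example.
#include <cuda_runtime.h>

#include "CUDACore/cudaCheck.h"
#include "CUDACore/prefixScan.h"

void inclusiveScan(int32_t const* d_in, int32_t* d_out, int32_t size, cudaStream_t stream) {
  const int threads = 256;
  const int blocks = (size + threads - 1) / threads;  // one element per thread; keep blocks <= 1024
  int32_t* d_counter;
  cudaCheck(cudaMalloc(&d_counter, sizeof(int32_t)));
  cudaCheck(cudaMemsetAsync(d_counter, 0, sizeof(int32_t), stream));
  // third launch parameter: dynamic shared memory for the per-block partial sums
  cms::cuda::multiBlockPrefixScan<int32_t>
      <<<blocks, threads, sizeof(int32_t) * blocks, stream>>>(d_in, d_out, size, d_counter);
  cudaCheck(cudaGetLastError());
  cudaCheck(cudaStreamSynchronize(stream));
  cudaCheck(cudaFree(d_counter));
}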
+ template + __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) { + volatile T const* ci = ici; + volatile T* co = ico; + __shared__ T ws[32]; +#ifdef __CUDA_ARCH__ + assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below +#endif + assert(blockDim.x * gridDim.x >= size); + // first each block does a scan + int off = blockDim.x * blockIdx.x; + if (size - off > 0) + blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws); + + // count blocks that finished + __shared__ bool isLastBlockDone; + if (0 == threadIdx.x) { + __threadfence(); + auto value = atomicAdd(pc, 1); // block counter + isLastBlockDone = (value == (int(gridDim.x) - 1)); + } + + __syncthreads(); + + if (!isLastBlockDone) + return; + + assert(int(gridDim.x) == *pc); + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block + extern __shared__ T psum[]; + for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { + auto j = blockDim.x * i + blockDim.x - 1; + psum[i] = (j < size) ? co[j] : T(0); + } + __syncthreads(); + blockPrefixScan(psum, psum, gridDim.x, ws); + + // now it would have been handy to have the other blocks around... + for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) { + co[i] += psum[k]; + } + } + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h diff --git a/src/cuda/CUDACore/radixSort.h b/src/cuda/CUDACore/radixSort.h index 859ab42f2..ff1da2d46 100644 --- a/src/cuda/CUDACore/radixSort.h +++ b/src/cuda/CUDACore/radixSort.h @@ -1,7 +1,10 @@ #ifndef HeterogeneousCoreCUDAUtilities_radixSort_H #define HeterogeneousCoreCUDAUtilities_radixSort_H +#ifdef __CUDACC__ + #include +#include #include "CUDACore/cuda_assert.h" @@ -79,8 +82,8 @@ __device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, u template -__device__ void __forceinline__ -radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { +__device__ __forceinline__ void radixSortImpl( + T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { constexpr int d = 8, w = 8 * sizeof(T); constexpr int sb = 1 << d; constexpr int ps = int(sizeof(T)) - NS; @@ -217,28 +220,30 @@ radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t s template ::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { radixSortImpl(a, ind, ind2, size, dummyReorder); } template ::value && std::is_signed::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { radixSortImpl(a, ind, ind2, size, reorderSigned); } template ::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { using I = int; radixSortImpl((I const*)(a), ind, ind2, size, reorderFloat); } template -__device__ void __forceinline__ -radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { +__device__ __forceinline__ void 
radixSortMulti(T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace) { extern __shared__ uint16_t ws[]; auto a = v + offsets[blockIdx.x]; @@ -250,17 +255,23 @@ radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* w radixSort(a, ind, ind2, size); } -template -__global__ void __launch_bounds__(256, 4) - radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { - radixSortMulti(v, index, offsets, workspace); -} +namespace cms { + namespace cuda { -template -__global__ void -// __launch_bounds__(256, 4) -radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { - radixSortMulti(v, index, offsets, workspace); -} + template + __global__ void __launch_bounds__(256, 4) + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + template + __global__ void radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + } // namespace cuda +} // namespace cms + +#endif // __CUDACC__ #endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/src/cuda/CUDACore/requireDevices.cc b/src/cuda/CUDACore/requireDevices.cc new file mode 100644 index 000000000..86eb408ff --- /dev/null +++ b/src/cuda/CUDACore/requireDevices.cc @@ -0,0 +1,30 @@ +#include +#include + +#include + +#include "CUDACore/requireDevices.h" + +namespace cms::cudatest { + bool testDevices() { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess) { + std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." + << "\n"; + return false; + } + if (devices == 0) { + std::cerr << "No CUDA devices available, the test will be skipped." + << "\n"; + return false; + } + return true; + } + + void requireDevices() { + if (not testDevices()) { + exit(EXIT_SUCCESS); + } + } +} // namespace cms::cudatest diff --git a/src/cuda/CUDACore/requireDevices.h b/src/cuda/CUDACore/requireDevices.h new file mode 100644 index 000000000..0795175b3 --- /dev/null +++ b/src/cuda/CUDACore/requireDevices.h @@ -0,0 +1,17 @@ +#ifndef HeterogeneousCore_CUDAUtilities_requireDevices_h +#define HeterogeneousCore_CUDAUtilities_requireDevices_h + +/** + * These functions are meant to be called only from unit tests. 
+ */ +namespace cms { + namespace cudatest { + /// In presence of CUDA devices, return true; otherwise print message and return false + bool testDevices(); + + /// Print message and exit if there are no CUDA devices + void requireDevices(); + } // namespace cudatest +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_requireDevices_h diff --git a/src/cuda/CUDADataFormats/BeamSpotCUDA.cc b/src/cuda/CUDADataFormats/BeamSpotCUDA.cc deleted file mode 100644 index 3d146947c..000000000 --- a/src/cuda/CUDADataFormats/BeamSpotCUDA.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include "CUDADataFormats/BeamSpotCUDA.h" - -#include "CUDACore/cudaCheck.h" -#include "CUDACore/device_unique_ptr.h" - -BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) { - data_d_ = cms::cuda::make_device_unique(stream); - cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream)); -} diff --git a/src/cuda/CUDADataFormats/BeamSpotCUDA.h b/src/cuda/CUDADataFormats/BeamSpotCUDA.h index e529f6eeb..a090ef347 100644 --- a/src/cuda/CUDADataFormats/BeamSpotCUDA.h +++ b/src/cuda/CUDADataFormats/BeamSpotCUDA.h @@ -1,32 +1,33 @@ #ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h #define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h -#include "CUDACore/device_unique_ptr.h" - #include +#include "DataFormats/BeamSpotPOD.h" +#include "CUDACore/device_unique_ptr.h" + class BeamSpotCUDA { public: - // alignas(128) doesn't really make sense as there is only one - // beamspot per event? - struct Data { - float x, y, z; // position - // TODO: add covariance matrix - - float sigmaZ; - float beamWidthX, beamWidthY; - float dxdz, dydz; - float emittanceX, emittanceY; - float betaStar; - }; - + // default constructor, required by cms::cuda::Product BeamSpotCUDA() = default; - BeamSpotCUDA(Data const* data_h, cudaStream_t stream); - Data const* data() const { return data_d_.get(); } + // constructor that allocates cached device memory on the given CUDA stream + BeamSpotCUDA(cudaStream_t stream) { data_d_ = cms::cuda::make_device_unique(stream); } + + // movable, non-copiable + BeamSpotCUDA(BeamSpotCUDA const&) = delete; + BeamSpotCUDA(BeamSpotCUDA&&) = default; + BeamSpotCUDA& operator=(BeamSpotCUDA const&) = delete; + BeamSpotCUDA& operator=(BeamSpotCUDA&&) = default; + + BeamSpotPOD* data() { return data_d_.get(); } + BeamSpotPOD const* data() const { return data_d_.get(); } + + cms::cuda::device::unique_ptr& ptr() { return data_d_; } + cms::cuda::device::unique_ptr const& ptr() const { return data_d_; } private: - cms::cuda::device::unique_ptr data_d_; + cms::cuda::device::unique_ptr data_d_; }; -#endif +#endif // CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h diff --git a/src/cuda/CUDADataFormats/HeterogeneousSoA.h b/src/cuda/CUDADataFormats/HeterogeneousSoA.h index 1134cd5ff..cfaad449c 100644 --- a/src/cuda/CUDADataFormats/HeterogeneousSoA.h +++ b/src/cuda/CUDADataFormats/HeterogeneousSoA.h @@ -27,13 +27,13 @@ class HeterogeneousSoA { auto const &operator*() const { return *get(); } - auto const *operator-> () const { return get(); } + auto const *operator->() const { return get(); } auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } auto &operator*() { return *get(); } - auto *operator-> () { return get(); } + auto *operator->() { return get(); } // in reality valid only for GPU version... cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const { @@ -50,94 +50,96 @@ class HeterogeneousSoA { std::unique_ptr std_ptr; //! 
}; -namespace cudaCompat { - - struct GPUTraits { - template - using unique_ptr = cms::cuda::device::unique_ptr; - - template - static auto make_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); - } - - template - static auto make_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); - } - - template - static auto make_host_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); - } - - template - static auto make_device_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); - } - - template - static auto make_device_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); - } - }; - - struct HostTraits { - template - using unique_ptr = cms::cuda::host::unique_ptr; - - template - static auto make_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); - } - - template - static auto make_host_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); - } - - template - static auto make_device_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); - } - - template - static auto make_device_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); - } - }; - - struct CPUTraits { - template - using unique_ptr = std::unique_ptr; - - template - static auto make_unique(cudaStream_t) { - return std::make_unique(); - } - - template - static auto make_unique(size_t size, cudaStream_t) { - return std::make_unique(size); - } - - template - static auto make_host_unique(cudaStream_t) { - return std::make_unique(); - } - - template - static auto make_device_unique(cudaStream_t) { - return std::make_unique(); - } - - template - static auto make_device_unique(size_t size, cudaStream_t) { - return std::make_unique(size); - } - }; - -} // namespace cudaCompat +namespace cms { + namespace cudacompat { + + struct GPUTraits { + template + using unique_ptr = cms::cuda::device::unique_ptr; + + template + static auto make_unique(cudaStream_t stream) { + return cms::cuda::make_device_unique(stream); + } + + template + static auto make_unique(size_t size, cudaStream_t stream) { + return cms::cuda::make_device_unique(size, stream); + } + + template + static auto make_host_unique(cudaStream_t stream) { + return cms::cuda::make_host_unique(stream); + } + + template + static auto make_device_unique(cudaStream_t stream) { + return cms::cuda::make_device_unique(stream); + } + + template + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cms::cuda::make_device_unique(size, stream); + } + }; + + struct HostTraits { + template + using unique_ptr = cms::cuda::host::unique_ptr; + + template + static auto make_unique(cudaStream_t stream) { + return cms::cuda::make_host_unique(stream); + } + + template + static auto make_host_unique(cudaStream_t stream) { + return cms::cuda::make_host_unique(stream); + } + + template + static auto make_device_unique(cudaStream_t stream) { + return cms::cuda::make_device_unique(stream); + } + + template + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cms::cuda::make_device_unique(size, stream); + } + }; + + struct CPUTraits { + template + using unique_ptr = std::unique_ptr; + + template + static auto make_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_unique(size_t size, cudaStream_t) { + return std::make_unique(size); + } + + template + static auto 
make_host_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_device_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_device_unique(size_t size, cudaStream_t) { + return std::make_unique(size); + } + }; + + } // namespace cudacompat +} // namespace cms // a heterogeneous unique pointer (of a different sort) ... template @@ -178,10 +180,10 @@ cms::cuda::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cuda } template -using HeterogeneousSoAGPU = HeterogeneousSoAImpl; +using HeterogeneousSoAGPU = HeterogeneousSoAImpl; template -using HeterogeneousSoACPU = HeterogeneousSoAImpl; +using HeterogeneousSoACPU = HeterogeneousSoAImpl; template -using HeterogeneousSoAHost = HeterogeneousSoAImpl; +using HeterogeneousSoAHost = HeterogeneousSoAImpl; #endif diff --git a/src/cuda/CUDADataFormats/PixelTrackHeterogeneous.h b/src/cuda/CUDADataFormats/PixelTrackHeterogeneous.h index 157035577..579c67092 100644 --- a/src/cuda/CUDADataFormats/PixelTrackHeterogeneous.h +++ b/src/cuda/CUDADataFormats/PixelTrackHeterogeneous.h @@ -17,7 +17,7 @@ class TrackSoAT { using Quality = trackQuality::Quality; using hindex_type = uint16_t; - using HitContainer = OneToManyAssoc; + using HitContainer = cms::cuda::OneToManyAssoc; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... diff --git a/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.cc b/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.cc index c667a90ea..b19664874 100644 --- a/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.cc +++ b/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.cc @@ -9,13 +9,13 @@ SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream) : formatterErrors_h(std::move(errors)) { - error_d = cms::cuda::make_device_unique>(stream); + error_d = cms::cuda::make_device_unique>(stream); data_d = cms::cuda::make_device_unique(maxFedWords, stream); cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream); - error_h = cms::cuda::make_host_unique>(stream); - GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); + error_h = cms::cuda::make_host_unique>(stream); + cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); assert(error_h->empty()); assert(error_h->capacity() == static_cast(maxFedWords)); @@ -38,5 +38,5 @@ SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync } auto err = *error_h; err.set_data(data.get()); - return HostDataError(std::move(err), std::move(data)); + return HostDataError(err, std::move(data)); } diff --git a/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.h b/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.h index 46fec5688..9c7c874ee 100644 --- a/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.h +++ b/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.h @@ -1,12 +1,12 @@ #ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h -#include "DataFormats/PixelErrors.h" +#include + +#include "CUDACore/SimpleVector.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -#include "CUDACore/GPUSimpleVector.h" - -#include +#include "DataFormats/PixelErrors.h" class SiPixelDigiErrorsCUDA { public: @@ -21,20 +21,20 @@ class SiPixelDigiErrorsCUDA { const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } - GPU::SimpleVector* error() { return error_d.get(); } - GPU::SimpleVector const* error() const { 
return error_d.get(); } - GPU::SimpleVector const* c_error() const { return error_d.get(); } + cms::cuda::SimpleVector* error() { return error_d.get(); } + cms::cuda::SimpleVector const* error() const { return error_d.get(); } + cms::cuda::SimpleVector const* c_error() const { return error_d.get(); } using HostDataError = - std::pair, cms::cuda::host::unique_ptr>; + std::pair, cms::cuda::host::unique_ptr>; HostDataError dataErrorToHostAsync(cudaStream_t stream) const; void copyErrorToHostAsync(cudaStream_t stream); private: cms::cuda::device::unique_ptr data_d; - cms::cuda::device::unique_ptr> error_d; - cms::cuda::host::unique_ptr> error_h; + cms::cuda::device::unique_ptr> error_d; + cms::cuda::host::unique_ptr> error_h; PixelFormatterErrors formatterErrors_h; }; diff --git a/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h b/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h index 0eeea4eea..2320fa6d6 100644 --- a/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h +++ b/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h @@ -93,7 +93,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH #ifndef __CUDACC__ constexpr #endif - (std::is_same::value) { + (std::is_same::value) { cms::cuda::copyAsync(m_view, view, stream); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version @@ -140,16 +140,16 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH #ifndef __CUDACC__ constexpr #endif - (std::is_same::value) { + (std::is_same::value) { cms::cuda::copyAsync(m_view, view, stream); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version } } -using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; #endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h diff --git a/src/cuda/CUDADataFormats/TrackingRecHit2DSOAView.h b/src/cuda/CUDADataFormats/TrackingRecHit2DSOAView.h index 166374e41..faaa4378c 100644 --- a/src/cuda/CUDADataFormats/TrackingRecHit2DSOAView.h +++ b/src/cuda/CUDADataFormats/TrackingRecHit2DSOAView.h @@ -17,7 +17,8 @@ class TrackingRecHit2DSOAView { static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; } using hindex_type = uint16_t; // if above is <=2^16 - using Hist = HistoContainer; + using Hist = + cms::cuda::HistoContainer; using AverageGeometry = phase1PixelTopology::AverageGeometry; diff --git a/src/cuda/CUDADataFormats/ZVertexSoA.h b/src/cuda/CUDADataFormats/ZVertexSoA.h index ae4cba4e7..ecdf76d8e 100644 --- a/src/cuda/CUDADataFormats/ZVertexSoA.h +++ b/src/cuda/CUDADataFormats/ZVertexSoA.h @@ -8,7 +8,7 @@ // These vertices are clusterized and fitted only along the beam line (z) // to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well) struct ZVertexSoA { - static constexpr uint32_t MAXTRACKS = 16 * 1024; + static constexpr uint32_t MAXTRACKS = 32 * 1024; static constexpr uint32_t MAXVTX = 1024; int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) diff --git 
a/src/cuda/CondFormats/PixelCPEFast.h b/src/cuda/CondFormats/PixelCPEFast.h index 16e8f2b68..eb0f21c28 100644 --- a/src/cuda/CondFormats/PixelCPEFast.h +++ b/src/cuda/CondFormats/PixelCPEFast.h @@ -4,7 +4,7 @@ #include #include "CUDACore/ESProduct.h" -#include "CUDACore/CUDAHostAllocator.h" +#include "CUDACore/HostAllocator.h" #include "CondFormats/pixelCPEforGPU.h" class PixelCPEFast { @@ -22,7 +22,7 @@ class PixelCPEFast { private: // allocate it with posix malloc to be ocmpatible with cpu wf std::vector m_detParamsGPU; - // std::vector> m_detParamsGPU; + // std::vector> m_detParamsGPU; pixelCPEforGPU::CommonParams m_commonParamsGPU; pixelCPEforGPU::LayerGeometry m_layerGeometry; pixelCPEforGPU::AverageGeometry m_averageGeometry; diff --git a/src/cuda/CondFormats/SiPixelFedCablingMapGPUWrapper.h b/src/cuda/CondFormats/SiPixelFedCablingMapGPUWrapper.h index 85477e87a..027e7d25c 100644 --- a/src/cuda/CondFormats/SiPixelFedCablingMapGPUWrapper.h +++ b/src/cuda/CondFormats/SiPixelFedCablingMapGPUWrapper.h @@ -2,7 +2,7 @@ #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #include "CUDACore/ESProduct.h" -#include "CUDACore/CUDAHostAllocator.h" +#include "CUDACore/HostAllocator.h" #include "CUDACore/device_unique_ptr.h" #include "CondFormats/SiPixelFedCablingMapGPU.h" @@ -25,7 +25,7 @@ class SiPixelFedCablingMapGPUWrapper { const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const; private: - std::vector> modToUnpDefault; + std::vector> modToUnpDefault; bool hasQuality_; SiPixelFedCablingMapGPU *cablingMapHost = nullptr; // pointer to struct in CPU diff --git a/src/cuda/CondFormats/SiPixelGainForHLTonGPU.h b/src/cuda/CondFormats/SiPixelGainForHLTonGPU.h index ee7cfd53a..5bcdc7a66 100644 --- a/src/cuda/CondFormats/SiPixelGainForHLTonGPU.h +++ b/src/cuda/CondFormats/SiPixelGainForHLTonGPU.h @@ -5,6 +5,17 @@ #include #include +// including would pull in the dependency on all of CUDA; +// instead, just define away the CUDA specific attributes to keep GCC happy. +#ifndef __CUDACC__ +#ifndef __host__ +#define __host__ +#endif // __host__ +#ifndef __device__ +#define __device__ +#endif // __device__ +#endif // __CUDACC__ + #include "CUDACore/cuda_assert.h" struct SiPixelGainForHLTonGPU_DecodingStructure { diff --git a/src/cuda/CondFormats/pixelCPEforGPU.h b/src/cuda/CondFormats/pixelCPEforGPU.h index 936ff3e1c..c48c05750 100644 --- a/src/cuda/CondFormats/pixelCPEforGPU.h +++ b/src/cuda/CondFormats/pixelCPEforGPU.h @@ -10,7 +10,6 @@ #include "DataFormats/SOARotation.h" #include "Geometry/phase1PixelTopology.h" #include "CUDACore/cudaCompat.h" -#include "CUDACore/cuda_cxx17.h" namespace pixelCPEforGPU { @@ -36,6 +35,9 @@ namespace pixelCPEforGPU { float shiftY; float chargeWidthX; float chargeWidthY; + // CMSSW 11.2.x adds + //uint16_t pixmx; // max pix charge + // which would break reading the binary dumps float x0, y0, z0; // the vertex in the local coord of the detector diff --git a/src/cuda/DataFormats/BeamSpotPOD.h b/src/cuda/DataFormats/BeamSpotPOD.h new file mode 100644 index 000000000..7b4c7c241 --- /dev/null +++ b/src/cuda/DataFormats/BeamSpotPOD.h @@ -0,0 +1,21 @@ +#ifndef DataFormats_BeamSpot_interface_BeamSpotPOD_h +#define DataFormats_BeamSpot_interface_BeamSpotPOD_h + +// This struct is a transient-only, simplified representation of the beamspot +// data used as the underlying type for data transfers and operations in +// heterogeneous code (e.g. in CUDA code). + +// The covariance matrix is not used in that code, so is left out here. 
+ +// CMSSW 11.2.x uses alignas(128) to align to the CUDA L1 cache line size +// here it would break reading the beamspot data from the binary dumps +struct BeamSpotPOD { + float x, y, z; // position + float sigmaZ; + float beamWidthX, beamWidthY; + float dxdz, dydz; + float emittanceX, emittanceY; + float betaStar; +}; + +#endif // DataFormats_BeamSpot_interface_BeamSpotPOD_h diff --git a/src/cuda/DataFormats/approx_atan2.h b/src/cuda/DataFormats/approx_atan2.h index 729aa4536..4508a35cc 100644 --- a/src/cuda/DataFormats/approx_atan2.h +++ b/src/cuda/DataFormats/approx_atan2.h @@ -121,7 +121,7 @@ constexpr float unsafe_atan2f(float y, float x) { template constexpr float safe_atan2f(float y, float x) { - return unsafe_atan2f_impl(y, (y == 0.f) & (x == 0.f) ? 0.2f : x); + return unsafe_atan2f_impl(y, ((y == 0.f) & (x == 0.f)) ? 0.2f : x); // return (y==0.f)&(x==0.f) ? 0.f : unsafe_atan2f_impl( y, x); } diff --git a/src/cuda/Makefile.deps b/src/cuda/Makefile.deps index 2acf6dffd..a8e6e5798 100644 --- a/src/cuda/Makefile.deps +++ b/src/cuda/Makefile.deps @@ -1,4 +1,4 @@ -cuda_EXTERNAL_DEPENDS := TBB CUDA CUB EIGEN +cuda_EXTERNAL_DEPENDS := TBB CUDA EIGEN BeamSpotProducer_DEPENDS := Framework CUDACore CUDADataFormats CUDACore_DEPENDS := Framework CUDADataFormats_DEPENDS := CUDACore DataFormats diff --git a/src/cuda/plugin-BeamSpotProducer/BeamSpotESProducer.cc b/src/cuda/plugin-BeamSpotProducer/BeamSpotESProducer.cc index 50208dc5d..3c7f1a1f0 100644 --- a/src/cuda/plugin-BeamSpotProducer/BeamSpotESProducer.cc +++ b/src/cuda/plugin-BeamSpotProducer/BeamSpotESProducer.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/BeamSpotCUDA.h" +#include "DataFormats/BeamSpotPOD.h" #include "Framework/ESProducer.h" #include "Framework/EventSetup.h" #include "Framework/ESPluginFactory.h" @@ -17,11 +17,11 @@ class BeamSpotESProducer : public edm::ESProducer { }; void BeamSpotESProducer::produce(edm::EventSetup& eventSetup) { - auto bs = std::make_unique(); + auto bs = std::make_unique(); std::ifstream in(data_ / "beamspot.bin", std::ios::binary); in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); - in.read(reinterpret_cast(bs.get()), sizeof(BeamSpotCUDA::Data)); + in.read(reinterpret_cast(bs.get()), sizeof(BeamSpotPOD)); eventSetup.put(std::move(bs)); } diff --git a/src/cuda/plugin-BeamSpotProducer/BeamSpotToCUDA.cc b/src/cuda/plugin-BeamSpotProducer/BeamSpotToCUDA.cc index df4331be6..48badcabf 100644 --- a/src/cuda/plugin-BeamSpotProducer/BeamSpotToCUDA.cc +++ b/src/cuda/plugin-BeamSpotProducer/BeamSpotToCUDA.cc @@ -1,15 +1,17 @@ +#include + +#include + #include "CUDACore/Product.h" +#include "CUDACore/ScopedContext.h" +#include "CUDACore/copyAsync.h" +#include "CUDACore/host_noncached_unique_ptr.h" #include "CUDADataFormats/BeamSpotCUDA.h" -#include "Framework/EventSetup.h" +#include "DataFormats/BeamSpotPOD.h" +#include "Framework/EDProducer.h" #include "Framework/Event.h" +#include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" -#include "CUDACore/ScopedContext.h" -#include "CUDACore/host_noncached_unique_ptr.h" - -#include - -#include class BeamSpotToCUDA : public edm::EDProducer { public: @@ -19,21 +21,24 @@ class BeamSpotToCUDA : public edm::EDProducer { void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; private: - edm::EDPutTokenT> bsPutToken_; + const edm::EDPutTokenT> bsPutToken_; - cms::cuda::host::noncached::unique_ptr bsHost; + cms::cuda::host::noncached::unique_ptr bsHost; }; 
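// Editor's note (not part of the patch): why the staging buffer above is a host
// *noncached* pointer. cudaHostAllocWriteCombined returns page-locked memory
// that bypasses the CPU caches: CPU writes and device DMA reads are fast, but
// CPU reads are very slow, so it only suits write-once host-to-device staging
// like this per-module beamspot buffer. A minimal raw-CUDA equivalent of the
// pattern (the Payload type and names are hypothetical):
#include <cuda_runtime.h>

struct Payload {
  float x, y, z;
};

void stageToDevice(Payload const& src, Payload* d_dst, cudaStream_t stream) {
  Payload* h_staging = nullptr;  // in the plugin this buffer lives for the module's lifetime
  cudaHostAlloc(reinterpret_cast<void**>(&h_staging), sizeof(Payload), cudaHostAllocWriteCombined);
  *h_staging = src;  // write-only CPU access: the fast path for write-combined memory
  cudaMemcpyAsync(d_dst, h_staging, sizeof(Payload), cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);  // the plugin stays asynchronous; synchronizing keeps this sketch safe
  cudaFreeHost(h_staging);
}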
BeamSpotToCUDA::BeamSpotToCUDA(edm::ProductRegistry& reg) : bsPutToken_{reg.produces>()}, - bsHost{cms::cuda::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} + bsHost{cms::cuda::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - *bsHost = iSetup.get(); + *bsHost = iSetup.get(); cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; - ctx.emplace(iEvent, bsPutToken_, bsHost.get(), ctx.stream()); + BeamSpotCUDA bsDevice(ctx.stream()); + cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream()); + + ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice)); } DEFINE_FWK_MODULE(BeamSpotToCUDA); diff --git a/src/cuda/plugin-PixelTriplets/BrokenLine.h b/src/cuda/plugin-PixelTriplets/BrokenLine.h index c3177f021..0a4b5f28f 100644 --- a/src/cuda/plugin-PixelTriplets/BrokenLine.h +++ b/src/cuda/plugin-PixelTriplets/BrokenLine.h @@ -7,8 +7,6 @@ namespace BrokenLine { - using namespace Rfit; - //!< Karimäki's parameters: (phi, d, k=1/R) /*!< covariance matrix: \n |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n @@ -22,12 +20,13 @@ namespace BrokenLine { */ template struct PreparedBrokenLineData { - int q; //!< particle charge - Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin - VectorNd s; //!< total distance traveled in the transverse plane starting from the pre-fitted closest approach - VectorNd S; //!< total distance traveled (three-dimensional) - VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane - VectorNd VarBeta; //!< kink angles in the SZ plane + int q; //!< particle charge + Rfit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + Rfit::VectorNd s; //!< total distance traveled in the transverse plane + // starting from the pre-fitted closest approach + Rfit::VectorNd S; //!< total distance traveled (three-dimensional) + Rfit::VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane + Rfit::VectorNd VarBeta; //!< kink angles in the SZ plane }; /*! @@ -55,9 +54,9 @@ namespace BrokenLine { //XX_0*=1; constexpr double geometry_factor = 0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned - constexpr double fact = geometry_factor * sqr(13.6 / 1000.); - return fact / (pt2 * (1. + sqr(slope))) * (std::abs(length) * XXI_0) * - sqr(1. + 0.038 * log(std::abs(length) * XXI_0)); + constexpr double fact = geometry_factor * Rfit::sqr(13.6 / 1000.); + return fact / (pt2 * (1. + Rfit::sqr(slope))) * (std::abs(length) * XXI_0) * + Rfit::sqr(1. + 0.038 * log(std::abs(length) * XXI_0)); } /*! @@ -67,9 +66,9 @@ namespace BrokenLine { \return 2D rotation matrix. */ - __host__ __device__ inline Matrix2d RotationMatrix(double slope) { - Matrix2d Rot; - Rot(0, 0) = 1. / sqrt(1. + sqr(slope)); + __host__ __device__ inline Rfit::Matrix2d RotationMatrix(double slope) { + Rfit::Matrix2d Rot; + Rot(0, 0) = 1. / sqrt(1. + Rfit::sqr(slope)); Rot(0, 1) = slope * Rot(0, 0); Rot(1, 0) = -Rot(0, 1); Rot(1, 1) = Rot(0, 0); @@ -82,31 +81,34 @@ namespace BrokenLine { \param circle circle fit in the old coordinate system. \param x0 x coordinate of the translation vector. \param y0 y coordinate of the translation vector. - \param Jacob passed by reference in order to save stack. + \param jacobian passed by reference in order to save stack. 
*/ - __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, double x0, double y0, Matrix3d& Jacob) { + __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, + double x0, + double y0, + Rfit::Matrix3d& jacobian) { double A, U, BB, C, DO, DP, uu, xi, v, mu, lambda, zeta; DP = x0 * cos(circle.par(0)) + y0 * sin(circle.par(0)); DO = x0 * sin(circle.par(0)) - y0 * cos(circle.par(0)) + circle.par(1); uu = 1 + circle.par(2) * circle.par(1); C = -circle.par(2) * y0 + uu * cos(circle.par(0)); BB = circle.par(2) * x0 + uu * sin(circle.par(0)); - A = 2. * DO + circle.par(2) * (sqr(DO) + sqr(DP)); + A = 2. * DO + circle.par(2) * (Rfit::sqr(DO) + Rfit::sqr(DP)); U = sqrt(1. + circle.par(2) * A); - xi = 1. / (sqr(BB) + sqr(C)); + xi = 1. / (Rfit::sqr(BB) + Rfit::sqr(C)); v = 1. + circle.par(2) * DO; - lambda = (0.5 * A) / (U * sqr(1. + U)); + lambda = (0.5 * A) / (U * Rfit::sqr(1. + U)); mu = 1. / (U * (1. + U)) + circle.par(2) * lambda; - zeta = sqr(DO) + sqr(DP); + zeta = Rfit::sqr(DO) + Rfit::sqr(DP); - Jacob << xi * uu * v, -xi * sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, + jacobian << xi * uu * v, -xi * Rfit::sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, mu * zeta - lambda * A, 0, 0, 1.; circle.par(0) = atan2(BB, C); circle.par(1) = A / (1 + U); // circle.par(2)=circle.par(2); - circle.cov = Jacob * circle.cov * Jacob.transpose(); + circle.cov = jacobian * circle.cov * jacobian.transpose(); } /*! @@ -125,28 +127,28 @@ namespace BrokenLine { PreparedBrokenLineData& results) { constexpr auto n = N; u_int i; - Vector2d d; - Vector2d e; + Rfit::Vector2d d; + Rfit::Vector2d e; d = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); e = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); - results.q = cross2D(d, e) > 0 ? -1 : 1; + results.q = Rfit::cross2D(d, e) > 0 ? -1 : 1; const double slope = -results.q / fast_fit(3); - Matrix2d R = RotationMatrix(slope); + Rfit::Matrix2d R = RotationMatrix(slope); // calculate radii and s - results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * MatrixXd::Constant(1, n, 1); + results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * Rfit::MatrixXd::Constant(1, n, 1); e = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); for (i = 0; i < n; i++) { d = results.radii.block(0, i, 2, 1); - results.s(i) = results.q * fast_fit(2) * atan2(cross2D(d, e), d.dot(e)); // calculates the arc length + results.s(i) = results.q * fast_fit(2) * atan2(Rfit::cross2D(d, e), d.dot(e)); // calculates the arc length } - VectorNd z = hits.block(2, 0, 1, n).transpose(); + Rfit::VectorNd z = hits.block(2, 0, 1, n).transpose(); //calculate S and Z - Matrix2xNd pointsSZ = Matrix2xNd::Zero(); + Rfit::Matrix2xNd pointsSZ = Rfit::Matrix2xNd::Zero(); for (i = 0; i < n; i++) { pointsSZ(0, i) = results.s(i); pointsSZ(1, i) = z(i); @@ -173,21 +175,21 @@ namespace BrokenLine { \return the n-by-n matrix of the linear system */ template - __host__ __device__ inline MatrixNd MatrixC_u(const VectorNd& w, - const VectorNd& S, - const VectorNd& VarBeta) { + __host__ __device__ inline Rfit::MatrixNd MatrixC_u(const Rfit::VectorNd& w, + const Rfit::VectorNd& S, + const Rfit::VectorNd& VarBeta) { constexpr u_int n = N; u_int i; - MatrixNd C_U = MatrixNd::Zero(); + Rfit::MatrixNd C_U = Rfit::MatrixNd::Zero(); for (i = 0; i < n; i++) { C_U(i, i) = w(i); if (i > 1) - C_U(i, i) += 1. / (VarBeta(i - 1) * sqr(S(i) - S(i - 1))); + C_U(i, i) += 1. 
/ (VarBeta(i - 1) * Rfit::sqr(S(i) - S(i - 1))); if (i > 0 && i < n - 1) - C_U(i, i) += (1. / VarBeta(i)) * sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + C_U(i, i) += (1. / VarBeta(i)) * Rfit::sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); if (i < n - 2) - C_U(i, i) += 1. / (VarBeta(i + 1) * sqr(S(i + 1) - S(i))); + C_U(i, i) += 1. / (VarBeta(i + 1) * Rfit::sqr(S(i + 1) - S(i))); if (i > 0 && i < n - 1) C_U(i, i + 1) = @@ -219,22 +221,22 @@ namespace BrokenLine { constexpr uint32_t N = M3xN::ColsAtCompileTime; constexpr auto n = N; // get the number of hits - const Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); - const Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); - const Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); + const Rfit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Rfit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); + const Rfit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); - auto tmp = 0.5 / cross2D(c, a); + auto tmp = 0.5 / Rfit::cross2D(c, a); result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; // check Wikipedia for these formulas - result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(cross2D(b, a))); + result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(Rfit::cross2D(b, a))); // Using Math Olympiad's formula R=abc/(4A) - const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); - const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + const Rfit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const Rfit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - result(3) = result(2) * atan2(cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); + result(3) = result(2) * atan2(Rfit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); // ds/dz slope between last and first point } @@ -272,15 +274,15 @@ namespace BrokenLine { auto& Z = data.Z; auto& VarBeta = data.VarBeta; const double slope = -circle_results.q / fast_fit(3); - VarBeta *= 1. + sqr(slope); // the kink angles are projected! + VarBeta *= 1. + Rfit::sqr(slope); // the kink angles are projected! for (i = 0; i < n; i++) { Z(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); } - Matrix2d V; // covariance matrix - VectorNd w; // weights - Matrix2d RR; // rotation matrix point by point + Rfit::Matrix2d V; // covariance matrix + Rfit::VectorNd w; // weights + Rfit::Matrix2d RR; // rotation matrix point by point //double Slope; // slope of the circle point by point for (i = 0; i < n; i++) { V(0, 0) = hits_ge.col(i)[0]; // x errors @@ -291,13 +293,13 @@ namespace BrokenLine { w(i) = 1. / ((RR * V * RR.transpose())(1, 1)); // compute the orthogonal weight point by point } - VectorNplusONEd r_u; + Rfit::VectorNplusONEd r_u; r_u(n) = 0; for (i = 0; i < n; i++) { r_u(i) = w(i) * Z(i); } - MatrixNplusONEd C_U; + Rfit::MatrixNplusONEd C_U; C_U.block(0, 0, n, n) = MatrixC_u(w, s, VarBeta); C_U(n, n) = 0; //add the border to the C_u matrix @@ -315,69 +317,69 @@ namespace BrokenLine { } C_U(n, i) = C_U(i, n); if (i > 0 && i < n - 1) - C_U(n, n) += sqr(s(i + 1) - s(i - 1)) / (4. * VarBeta(i)); + C_U(n, n) += Rfit::sqr(s(i + 1) - s(i - 1)) / (4. 
* VarBeta(i)); } #ifdef CPP_DUMP std::cout << "CU5\n" << C_U << std::endl; #endif - MatrixNplusONEd I; - choleskyInversion::invert(C_U, I); - // MatrixNplusONEd I = C_U.inverse(); + Rfit::MatrixNplusONEd I; + math::cholesky::invert(C_U, I); + // Rfit::MatrixNplusONEd I = C_U.inverse(); #ifdef CPP_DUMP std::cout << "I5\n" << I << std::endl; #endif - VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system + Rfit::VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); - Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); - Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); + Rfit::Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); + Rfit::Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); circle_results.par << atan2((e - d)(1), (e - d)(0)), - -circle_results.q * (fast_fit(2) - sqrt(sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), + -circle_results.q * (fast_fit(2) - sqrt(Rfit::sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), circle_results.q * (1. / fast_fit(2) + u(n)); assert(circle_results.q * circle_results.par(1) <= 0); - Vector2d eMinusd = e - d; + Rfit::Vector2d eMinusd = e - d; double tmp1 = eMinusd.squaredNorm(); - Matrix3d Jacob; - Jacob << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, + Rfit::Matrix3d jacobian; + jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) / tmp1, 0, (circle_results.q / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / - sqrt(sqr(2 * fast_fit(2)) - tmp1), + sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), (circle_results.q / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / - sqrt(sqr(2 * fast_fit(2)) - tmp1), + sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), 0, 0, 0, circle_results.q; circle_results.cov << I(0, 0), I(0, 1), I(0, n), I(1, 0), I(1, 1), I(1, n), I(n, 0), I(n, 1), I(n, n); - circle_results.cov = Jacob * circle_results.cov * Jacob.transpose(); + circle_results.cov = jacobian * circle_results.cov * jacobian.transpose(); //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... 
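// --- Illustrative sketch, not part of the patch -----------------------------
// The recurring idiom above (and in TranslateKarimaki below) is first-order
// error propagation: for a reparametrization p' = f(p) with Jacobian
// J = df/dp, the covariance transforms as cov' = J * cov * J^T. A minimal
// standalone example with Eigen; the 3x3 Jacobian mirrors the
// k -> pT = B/|k| change of variables applied later in BL_Helix_fit, while
// the helper function itself is hypothetical:

#include <Eigen/Core>
#include <cmath>

Eigen::Matrix3d propagateCov(const Eigen::Matrix3d& cov, double k, double B) {
  Eigen::Matrix3d jacobian;
  // identity on (phi, d); d(B/|k|)/dk = -B / (k * |k|) on the third parameter
  jacobian << 1., 0., 0.,
              0., 1., 0.,
              0., 0., -B / (k * std::abs(k));
  return jacobian * cov * jacobian.transpose();
}
// -----------------------------------------------------------------------------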
- TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), Jacob); - circle_results.cov(0, 0) += (1 + sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); + TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), jacobian); + circle_results.cov(0, 0) += (1 + Rfit::sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); //...And translate back to the original system - TranslateKarimaki(circle_results, d(0), d(1), Jacob); + TranslateKarimaki(circle_results, d(0), d(1), jacobian); // compute chi2 circle_results.chi2 = 0; for (i = 0; i < n; i++) { - circle_results.chi2 += w(i) * sqr(Z(i) - u(i)); + circle_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); if (i > 0 && i < n - 1) - circle_results.chi2 += - sqr(u(i - 1) / (s(i) - s(i - 1)) - u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + - u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / - VarBeta(i); + circle_results.chi2 += Rfit::sqr(u(i - 1) / (s(i) - s(i - 1)) - + u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + + u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / + VarBeta(i); } // assert(circle_results.chi2>=0); @@ -405,7 +407,7 @@ namespace BrokenLine { const V4& fast_fit, const double B, const PreparedBrokenLineData& data, - line_fit& line_results) { + Rfit::line_fit& line_results) { constexpr u_int n = N; u_int i; @@ -415,11 +417,11 @@ namespace BrokenLine { const auto& VarBeta = data.VarBeta; const double slope = -data.q / fast_fit(3); - Matrix2d R = RotationMatrix(slope); + Rfit::Matrix2d R = RotationMatrix(slope); - Matrix3d V = Matrix3d::Zero(); // covariance matrix XYZ - Matrix2x3d JacobXYZtosZ = Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) - VectorNd w = VectorNd::Zero(); + Rfit::Matrix3d V = Rfit::Matrix3d::Zero(); // covariance matrix XYZ + Rfit::Matrix2x3d JacobXYZtosZ = Rfit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + Rfit::VectorNd w = Rfit::VectorNd::Zero(); for (i = 0; i < n; i++) { V(0, 0) = hits_ge.col(i)[0]; // x errors V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy @@ -435,57 +437,57 @@ namespace BrokenLine { 1, 1)); // compute the orthogonal weight point by point } - VectorNd r_u; + Rfit::VectorNd r_u; for (i = 0; i < n; i++) { r_u(i) = w(i) * Z(i); } #ifdef CPP_DUMP std::cout << "CU4\n" << MatrixC_u(w, S, VarBeta) << std::endl; #endif - MatrixNd I; - choleskyInversion::invert(MatrixC_u(w, S, VarBeta), I); - // MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); + Rfit::MatrixNd I; + math::cholesky::invert(MatrixC_u(w, S, VarBeta), I); + // Rfit::MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); #ifdef CPP_DUMP std::cout << "I4\n" << I << std::endl; #endif - VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system + Rfit::VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system // line parameters in the system in which the first hit is the origin and with axis along SZ line_results.par << (u(1) - u(0)) / (S(1) - S(0)), u(0); auto idiff = 1. 
/ (S(1) - S(0)); - line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * sqr(idiff) + + line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * Rfit::sqr(idiff) + MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope), (I(0, 1) - I(0, 0)) * idiff, (I(0, 1) - I(0, 0)) * idiff, I(0, 0); // translate to the original SZ system - Matrix2d Jacob; - Jacob(0, 0) = 1.; - Jacob(0, 1) = 0; - Jacob(1, 0) = -S(0); - Jacob(1, 1) = 1.; + Rfit::Matrix2d jacobian; + jacobian(0, 0) = 1.; + jacobian(0, 1) = 0; + jacobian(1, 0) = -S(0); + jacobian(1, 1) = 1.; line_results.par(1) += -line_results.par(0) * S(0); - line_results.cov = Jacob * line_results.cov * Jacob.transpose(); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // rotate to the original sz system auto tmp = R(0, 0) - line_results.par(0) * R(0, 1); - Jacob(1, 1) = 1. / tmp; - Jacob(0, 0) = Jacob(1, 1) * Jacob(1, 1); - Jacob(0, 1) = 0; - Jacob(1, 0) = line_results.par(1) * R(0, 1) * Jacob(0, 0); - line_results.par(1) = line_results.par(1) * Jacob(1, 1); - line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * Jacob(1, 1); - line_results.cov = Jacob * line_results.cov * Jacob.transpose(); + jacobian(1, 1) = 1. / tmp; + jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); + jacobian(0, 1) = 0; + jacobian(1, 0) = line_results.par(1) * R(0, 1) * jacobian(0, 0); + line_results.par(1) = line_results.par(1) * jacobian(1, 1); + line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * jacobian(1, 1); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // compute chi2 line_results.chi2 = 0; for (i = 0; i < n; i++) { - line_results.chi2 += w(i) * sqr(Z(i) - u(i)); + line_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); if (i > 0 && i < n - 1) - line_results.chi2 += - sqr(u(i - 1) / (S(i) - S(i - 1)) - u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + - u(i + 1) / (S(i + 1) - S(i))) / - VarBeta(i); + line_results.chi2 += Rfit::sqr(u(i - 1) / (S(i) - S(i - 1)) - + u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + + u(i + 1) / (S(i + 1) - S(i))) / + VarBeta(i); } // assert(line_results.chi2>=0); @@ -526,27 +528,29 @@ namespace BrokenLine { \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. 
*/ template - inline helix_fit BL_Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B) { - helix_fit helix; - Vector4d fast_fit; + inline Rfit::helix_fit BL_Helix_fit(const Rfit::Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double B) { + Rfit::helix_fit helix; + Rfit::Vector4d fast_fit; BL_Fast_fit(hits, fast_fit); PreparedBrokenLineData data; karimaki_circle_fit circle; - line_fit line; - Matrix3d Jacob; + Rfit::line_fit line; + Rfit::Matrix3d jacobian; prepareBrokenLineData(hits, fast_fit, B, data); BL_Line_fit(hits_ge, fast_fit, B, data, line); BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix - Jacob << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (sqr(circle.par(2)) * circle.par(2)); + jacobian << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (Rfit::sqr(circle.par(2)) * circle.par(2)); circle.par(2) = B / std::abs(circle.par(2)); - circle.cov = Jacob * circle.cov * Jacob.transpose(); + circle.cov = jacobian * circle.cov * jacobian.transpose(); helix.par << circle.par, line.par; - helix.cov = MatrixXd::Zero(5, 5); + helix.cov = Rfit::MatrixXd::Zero(5, 5); helix.cov.block(0, 0, 3, 3) = circle.cov; helix.cov.block(3, 3, 2, 2) = line.cov; helix.q = circle.q; diff --git a/src/cuda/plugin-PixelTriplets/BrokenLineFitOnGPU.h b/src/cuda/plugin-PixelTriplets/BrokenLineFitOnGPU.h index d696bbadb..a30c251b7 100644 --- a/src/cuda/plugin-PixelTriplets/BrokenLineFitOnGPU.h +++ b/src/cuda/plugin-PixelTriplets/BrokenLineFitOnGPU.h @@ -20,8 +20,6 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; -using namespace Eigen; - // #define BL_DUMP_HITS template diff --git a/src/cuda/plugin-PixelTriplets/CAConstants.h b/src/cuda/plugin-PixelTriplets/CAConstants.h index 40be667a3..b063d0f6e 100644 --- a/src/cuda/plugin-PixelTriplets/CAConstants.h +++ b/src/cuda/plugin-PixelTriplets/CAConstants.h @@ -5,10 +5,10 @@ #include -#include "CUDADataFormats/gpuClusteringConstants.h" -#include "CUDACore/GPUSimpleVector.h" -#include "CUDACore/GPUVecArray.h" #include "CUDACore/HistoContainer.h" +#include "CUDACore/SimpleVector.h" +#include "CUDACore/VecArray.h" +#include "CUDADataFormats/gpuClusteringConstants.h" // #define ONLY_PHICUT @@ -27,7 +27,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT #ifndef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; } + constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; } constexpr uint32_t maxCellsPerHit() { return 128; } #else constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } @@ -37,7 +37,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } constexpr uint32_t maxCellsPerHit() { return 8 * 128; } #endif - constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; } constexpr uint32_t maxNumberOfLayerPairs() { return 20; } constexpr uint32_t maxNumberOfLayers() { return 10; } @@ -48,21 +48,21 @@ namespace CAConstants { using tindex_type = uint16_t; // for tuples #ifndef ONLY_PHICUT - using CellNeighbors = GPU::VecArray; - using CellTracks = GPU::VecArray; + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #else - using 
CellNeighbors = GPU::VecArray; - using CellTracks = GPU::VecArray; + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #endif - using CellNeighborsVector = GPU::SimpleVector; - using CellTracksVector = GPU::SimpleVector; + using CellNeighborsVector = cms::cuda::SimpleVector; + using CellTracksVector = cms::cuda::SimpleVector; - using OuterHitOfCell = GPU::VecArray; - using TuplesContainer = OneToManyAssoc; + using OuterHitOfCell = cms::cuda::VecArray; + using TuplesContainer = cms::cuda::OneToManyAssoc; using HitToTuple = - OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = OneToManyAssoc; + cms::cuda::OneToManyAssoc; // 3.5 should be enough + using TupleMultiplicity = cms::cuda::OneToManyAssoc; } // namespace CAConstants diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc index 4b893260c..4ea687af3 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc @@ -24,12 +24,20 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_isOuterHitOfCell_.reset( (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); assert(device_isOuterHitOfCell_.get()); + + cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); @@ -47,8 +55,8 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr assert(nActualPairs <= gpuPixelDoublets::nPairs); gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, @@ -84,7 +92,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -94,13 +102,14 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * m_params.dcaCutOuterTriplet_); if (nhits > 1 && m_params.earlyFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); } kernel_find_ntuplets(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + 
device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -114,11 +123,12 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); if (nhits > 1 && m_params.lateFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); } if (m_params.doStats_) { @@ -127,8 +137,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, @@ -154,7 +164,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); // remove duplicates (tracks that share a hit) diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu index 0e4a9aec5..08474c1af 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu @@ -51,7 +51,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -68,7 +68,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + gpuPixelDoublets::fishbone<<>>( hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); cudaCheck(cudaGetLastError()); } @@ -78,7 +78,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_find_ntuplets<<>>(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -108,7 +108,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); @@ -120,7 +120,7 @@ void 
CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + gpuPixelDoublets::fishbone<<>>( hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); cudaCheck(cudaGetLastError()); } @@ -132,8 +132,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, @@ -144,6 +144,9 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif + + // free space asap + // device_isOuterHitOfCell_.reset(); } template <> @@ -162,16 +165,26 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr // in principle we can use "nhits" to heuristically dimension the workspace... device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); assert(device_isOuterHitOfCell_.get()); + + cellStorage_ = cms::cuda::make_device_unique( + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks), + stream); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + { int threadsPerBlock = 128; // at least one block! 
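// --- Illustrative sketch, not part of the patch -----------------------------
// Every launch configuration in this file sizes the grid with the same
// round-up division, so that blocks * threadsPerBlock >= number of work items.
// A tiny helper capturing that arithmetic (the helper is hypothetical; the
// kernels above inline the expression):

constexpr unsigned int gridSize(unsigned int n, unsigned int threadsPerBlock) {
  // ceil(n / threadsPerBlock) in integer arithmetic; yields 0 for n == 0,
  // which is why callers clamp with std::max(1U, nhits) to get at least one block
  return (n + threadsPerBlock - 1) / threadsPerBlock;
}

// e.g. gridSize(1000, 128) == 8, and 8 * 128 = 1024 >= 1000
// -----------------------------------------------------------------------------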
int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); cudaCheck(cudaGetLastError()); } @@ -201,8 +214,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr dim3 thrs(stride, threadsPerBlock, 1); gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, @@ -252,7 +265,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h index efe960bcf..3c3e3d447 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h @@ -179,30 +179,29 @@ class CAHitNtupletGeneratorKernels { private: // workspace - CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr; - unique_ptr device_theCellNeighborsContainer_; - CAConstants::CellTracksVector* device_theCellTracks_ = nullptr; - unique_ptr device_theCellTracksContainer_; + unique_ptr cellStorage_; + unique_ptr device_theCellNeighbors_; + CAConstants::CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + CAConstants::CellTracks* device_theCellTracksContainer_; unique_ptr device_theCells_; unique_ptr device_isOuterHitOfCell_; uint32_t* device_nCells_ = nullptr; unique_ptr device_hitToTuple_; - AtomicPairCounter* device_hitToTuple_apc_ = nullptr; + cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; - AtomicPairCounter* device_hitTuple_apc_ = nullptr; + cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr; unique_ptr device_tupleMultiplicity_; - uint8_t* device_tmws_; - - unique_ptr device_storage_; + unique_ptr device_storage_; // params Params const& m_params; }; -using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; -using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h index d16e6a15f..fb505b126 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h @@ -12,36 +12,24 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { // ALLOCATIONS FOR THE INTERMEDIATE RESULTS 
(STAYS ON WORKER) ////////////////////////////////////////////////////////// - /* not used at the moment - cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); - cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); - */ + device_theCellNeighbors_ = Traits::template make_unique(stream); + device_theCellTracks_ = Traits::template make_unique(stream); device_hitToTuple_ = Traits::template make_unique(stream); device_tupleMultiplicity_ = Traits::template make_unique(stream); - auto storageSize = - 3 + (std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) + sizeof(AtomicPairCounter::c_type)) / - sizeof(AtomicPairCounter::c_type); + device_storage_ = Traits::template make_unique(3, stream); - device_storage_ = Traits::template make_unique(storageSize, stream); - - device_hitTuple_apc_ = (AtomicPairCounter*)device_storage_.get(); - device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get() + 1; + device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); + device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); - device_tmws_ = (uint8_t*)(device_storage_.get() + 3); - - assert(device_tmws_ + std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= - (uint8_t*)(device_storage_.get() + storageSize)); if #ifndef __CUDACC__ constexpr #endif - (std::is_same::value) { + (std::is_same::value) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h index fda3e508d..e35e20be9 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h @@ -19,8 +19,6 @@ #include "gpuFishbone.h" #include "gpuPixelDoublets.h" -using namespace gpuPixelDoublets; - using HitsOnGPU = TrackingRecHit2DSOAView; using HitsOnCPU = TrackingRecHit2DCUDA; @@ -33,11 +31,11 @@ using HitContainer = pixelTrack::HitContainer; __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, CAConstants::TupleMultiplicity *tupleMultiplicity, - AtomicPairCounter *apc, + cms::cuda::AtomicPairCounter *apc, GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, - CellNeighborsVector const *cellNeighbors, - CellTracksVector const *cellTracks, + gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, + gpuPixelDoublets::CellTracksVector const *cellTracks, GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, uint32_t nHits, uint32_t maxNumberOfDoublets, @@ -81,6 +79,10 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, printf("Tuples overflow\n"); if (*nCells >= maxNumberOfDoublets) printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow\n"); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); } for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -190,12 +192,12 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, } } -__global__ void kernel_connect(AtomicPairCounter *apc1, - AtomicPairCounter 
*apc2, // just to zero them, +__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, + cms::cuda::AtomicPairCounter *apc2, // just to zero them, GPUCACell::Hits const *__restrict__ hhp, GPUCACell *cells, uint32_t const *__restrict__ nCells, - CellNeighborsVector *cellNeighbors, + gpuPixelDoublets::CellNeighborsVector *cellNeighbors, GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, float hardCurvCut, float ptmin, @@ -266,9 +268,9 @@ __global__ void kernel_connect(AtomicPairCounter *apc1, __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, GPUCACell *__restrict__ cells, uint32_t const *nCells, - CellTracksVector *cellTracks, + gpuPixelDoublets::CellTracksVector *cellTracks, HitContainer *foundNtuplets, - AtomicPairCounter *apc, + cms::cuda::AtomicPairCounter *apc, Quality *__restrict__ quality, unsigned int minHitsPerNtuplet) { // recursive: not obvious to widen diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc index 22566d057..d0e428da6 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc @@ -92,17 +92,16 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(edm::ProductRegistry& reg } CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU() { - if (m_params.doStats_) { - // crash on multi-gpu processes - if (m_params.onGPU_) { + if (m_params.onGPU_) { + if (m_params.doStats_) { + // crash on multi-gpu processes CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); - } else { - CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); } - } - if (m_params.onGPU_) { cudaFree(m_counters); } else { + if (m_params.doStats_) { + CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); + } delete m_counters; } } @@ -116,14 +115,15 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); kernels.allocateOnGPU(stream); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } else { diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h index 17299c1bd..823987658 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h @@ -2,16 +2,14 @@ #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h #include -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" -#include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDACore/GPUSimpleVector.h" +#include "CUDACore/SimpleVector.h" +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" #include "CAHitNtupletGeneratorKernels.h" -#include "HelixFitOnGPU.h" - -// FIXME (split header???) 
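// --- Illustrative sketch, not part of the patch -----------------------------
// kernel_connect above receives cms::cuda::AtomicPairCounter pointers "just to
// zero them". The point of such a pair counter is to pack two 32-bit counters
// into one 64-bit word, so that reserving space (bump the object count by one
// and the storage count by n) is a single atomicAdd. A simplified stand-in for
// the idea; the real class lives in CUDACore, and the name and layout here are
// illustrative only:

#include <cstdint>

struct PairCounterSketch {
  unsigned long long counter = 0;  // low 32 bits: words used; high 32 bits: objects

  __device__ void zero() { counter = 0; }

  __device__ uint32_t add(uint32_t n) {
    // one atomic op reserves n words for one new object
    unsigned long long inc = (unsigned long long)(n) | (1ull << 32);
    unsigned long long old = atomicAdd(&counter, inc);
    return uint32_t(old);  // previous "words used" = offset of this reservation
  }
};
// -----------------------------------------------------------------------------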
#include "GPUCACell.h" +#include "HelixFitOnGPU.h" namespace edm { class Event; diff --git a/src/cuda/plugin-PixelTriplets/GPUCACell.h b/src/cuda/plugin-PixelTriplets/GPUCACell.h index 918c13f64..df4354e59 100644 --- a/src/cuda/plugin-PixelTriplets/GPUCACell.h +++ b/src/cuda/plugin-PixelTriplets/GPUCACell.h @@ -9,11 +9,12 @@ #include -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" -#include "CUDACore/GPUSimpleVector.h" -#include "CUDACore/GPUVecArray.h" +#include "CUDACore/SimpleVector.h" +#include "CUDACore/VecArray.h" #include "CUDACore/cuda_assert.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" + #include "CAConstants.h" #include "CircleEq.h" @@ -31,7 +32,7 @@ class GPUCACell { using Hits = TrackingRecHit2DSOAView; using hindex_type = Hits::hindex_type; - using TmpTuple = GPU::VecArray; + using TmpTuple = cms::cuda::VecArray; using HitContainer = pixelTrack::HitContainer; using Quality = trackQuality::Quality; @@ -56,24 +57,56 @@ class GPUCACell { theInnerZ = hh.zGlobal(innerHitId); theInnerR = hh.rGlobal(innerHitId); - outerNeighbors().reset(); - tracks().reset(); + // link to default empty + theOuterNeighbors = &cellNeighbors[0]; + theTracks = &cellTracks[0]; assert(outerNeighbors().empty()); assert(tracks().empty()); } __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + // use smart cache + if (outerNeighbors().empty()) { + auto i = cellNeighbors.extend(); // maybe waisted.... + if (i > 0) { + cellNeighbors[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellNeighbors[0]); + atomicCAS((ptrAsInt*)(&theOuterNeighbors), + zero, + (ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... +#else + theOuterNeighbors = &cellNeighbors[i]; +#endif + } else + return -1; + } + __threadfence(); return outerNeighbors().push_back(t); } __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + if (tracks().empty()) { + auto i = cellTracks.extend(); // maybe waisted.... + if (i > 0) { + cellTracks[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellTracks[0]); + atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... 
+#else + theTracks = &cellTracks[i]; +#endif + } else + return -1; + } + __threadfence(); return tracks().push_back(t); } - __device__ __forceinline__ CellTracks& tracks() { return theTracks; } - __device__ __forceinline__ CellTracks const& tracks() const { return theTracks; } - __device__ __forceinline__ CellNeighbors& outerNeighbors() { return theOuterNeighbors; } - __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return theOuterNeighbors; } + __device__ __forceinline__ CellTracks& tracks() { return *theTracks; } + __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } + __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } + __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } __device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } @@ -246,7 +279,7 @@ class GPUCACell { GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, - AtomicPairCounter& apc, + cms::cuda::AtomicPairCounter& apc, Quality* __restrict__ quality, TmpTuple& tmpNtuplet, const unsigned int minHitsPerNtuplet, @@ -297,8 +330,8 @@ class GPUCACell { } private: - CellNeighbors theOuterNeighbors; - CellTracks theTracks; + CellNeighbors* theOuterNeighbors; + CellTracks* theTracks; public: int32_t theDoubletId; diff --git a/src/cuda/plugin-PixelTriplets/RiemannFit.h b/src/cuda/plugin-PixelTriplets/RiemannFit.h index c68aa00bd..994b1dcf9 100644 --- a/src/cuda/plugin-PixelTriplets/RiemannFit.h +++ b/src/cuda/plugin-PixelTriplets/RiemannFit.h @@ -486,7 +486,7 @@ namespace Rfit { printIt(&V, "circle_fit - V:"); cov_rad += scatter_cov_rad; printIt(&cov_rad, "circle_fit - cov_rad:"); - choleskyInversion::invert(cov_rad, G); + math::cholesky::invert(cov_rad, G); // G = cov_rad.inverse(); renorm = G.sum(); G *= 1. / renorm; @@ -889,11 +889,11 @@ namespace Rfit { // Build A^T V-1 A, where V-1 is the covariance of only the Y components. MatrixNd Vy_inv; - choleskyInversion::invert(cov_with_ms, Vy_inv); + math::cholesky::invert(cov_with_ms, Vy_inv); // MatrixNd Vy_inv = cov_with_ms.inverse(); Eigen::Matrix Cov_params = A * Vy_inv * A.transpose(); // Compute the Covariance Matrix of the fit parameters - choleskyInversion::invert(Cov_params, Cov_params); + math::cholesky::invert(Cov_params, Cov_params); // Now Compute the Parameters in the form [2,1] // The first component is q. 
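// --- Illustrative sketch, not part of the patch -----------------------------
// The addOuterNeighbor/addTrack changes in GPUCACell.h above implement lazy
// container attachment: every cell starts out pointing at a shared empty
// container (slot 0), and the first thread that needs to append extends the
// backing vector and atomically swaps the cell's pointer to the fresh slot.
// A simplified version of the idiom; Container/Vector are stand-ins for
// CellTracks/CellTracksVector and friends:

using ptrAsIntSketch = unsigned long long;

template <typename Container, typename Vector>
__device__ int lazyPushBack(Container*& slot, Vector& storage,
                            typename Container::value_t value) {
  if (slot->empty()) {           // still attached to the shared default?
    auto i = storage.extend();   // reserve a fresh container; slot 0 is the
    if (i <= 0)                  // default, so only an index > 0 is usable
      return -1;                 // backing storage exhausted
    storage[i].reset();
    auto expected = (ptrAsIntSketch)(&storage[0]);
    // attach the fresh slot unless another thread already attached one;
    // if the CAS fails, slot i is simply wasted (it cannot be handed back)
    atomicCAS((ptrAsIntSketch*)(&slot), expected, (ptrAsIntSketch)(&storage[i]));
  }
  __threadfence();               // make the attached container visible
  return slot->push_back(value);
}
// -----------------------------------------------------------------------------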
diff --git a/src/cuda/plugin-PixelTriplets/RiemannFitOnGPU.h b/src/cuda/plugin-PixelTriplets/RiemannFitOnGPU.h index 0e11bb3f3..02766b557 100644 --- a/src/cuda/plugin-PixelTriplets/RiemannFitOnGPU.h +++ b/src/cuda/plugin-PixelTriplets/RiemannFitOnGPU.h @@ -18,8 +18,6 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; -using namespace Eigen; - template __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, diff --git a/src/cuda/plugin-PixelTriplets/choleskyInversion.h b/src/cuda/plugin-PixelTriplets/choleskyInversion.h index eba2a1764..2cb4105f8 100644 --- a/src/cuda/plugin-PixelTriplets/choleskyInversion.h +++ b/src/cuda/plugin-PixelTriplets/choleskyInversion.h @@ -9,325 +9,341 @@ * fully inlined specialized code to perform the inversion of a * positive defined matrix of rank up to 6. * + * adapted from ROOT::Math::CholeskyDecomp * originally by * @author Manuel Schiller * @date Aug 29 2008 * * */ -namespace choleskyInversion { - - template - inline constexpr void invert11(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - dst(0, 0) = F(1.0) / src(0, 0); - } - - template - inline constexpr void invert22(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0) * src(1, 0) * luc0; - auto luc2 = F(1.0) / (src(1, 1) - luc1); - - auto li21 = luc1 * luc0 * luc2; - - dst(0, 0) = li21 + luc0; - dst(1, 0) = -src(1, 0) * luc0 * luc2; - dst(1, 1) = luc2; - } - - template - inline constexpr void invert33(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); - luc5 = F(1.0) / luc5; - - auto li21 = -luc0 * luc1; - auto li32 = -(luc2 * luc4); - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - - dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; - dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; - dst(1, 1) = luc5 * li32 * li32 + luc2; - dst(2, 0) = luc5 * li31; - dst(2, 1) = luc5 * li32; - dst(2, 2) = luc5; - } - - template - inline constexpr void invert44(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - - dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc9 * li41 * 
li43 + luc5 * li31; - dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc9 * li43 * li43 + luc5; - dst(3, 0) = luc9 * li41; - dst(3, 1) = luc9 * li42; - dst(3, 2) = luc9 * li43; - dst(3, 3) = luc9; - } - - template - inline constexpr void invert55(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - auto luc10 = src(4, 0); - auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); - auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); - auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); - auto luc14 = - src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); - luc14 = F(1.0) / luc14; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - auto li54 = -luc13 * luc9; - auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; - auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; - auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - - luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + - luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * - luc0; - - dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; - dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; - dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; - dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; - dst(3, 3) = luc14 * li54 * li54 + luc9; - dst(4, 0) = luc14 * li51; - dst(4, 1) = luc14 * li52; - dst(4, 2) = luc14 * li53; - dst(4, 3) = luc14 * li54; - dst(4, 4) = luc14; - } - - template - inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - auto luc10 = src(4, 0); - auto 
luc11 = (src(4, 1) - luc0 * luc1 * luc10); - auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); - auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); - auto luc14 = - src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); - luc14 = F(1.0) / luc14; - auto luc15 = src(5, 0); - auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); - auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); - auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); - auto luc19 = - (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); - auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + - luc9 * luc18 * luc18 + luc14 * luc19 * luc19); - luc20 = F(1.0) / luc20; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - auto li54 = -luc13 * luc9; - auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; - auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; - auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - - luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + - luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * - luc0; - - auto li65 = -luc19 * luc14; - auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; - auto li63 = (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; - auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - - luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + - luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * - luc2; - auto li61 = - (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + - luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + - luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - - luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - - luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - luc19 * luc13 * luc6 * luc9 * luc14 + - luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + luc19 * luc10 * luc14 - luc15) * - luc0; - - dst(0, 0) = - luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; - dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; - dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; - dst(3, 2) = luc20 * li63 * li64 + 
luc14 * li53 * li54 + luc9 * li43; - dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; - dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; - dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; - dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; - dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; - dst(4, 4) = luc20 * li65 * li65 + luc14; - dst(5, 0) = luc20 * li61; - dst(5, 1) = luc20 * li62; - dst(5, 2) = luc20 * li63; - dst(5, 3) = luc20 * li64; - dst(5, 4) = luc20 * li65; - dst(5, 5) = luc20; - } - - template - inline constexpr void symmetrize11(M& dst) {} - template - inline constexpr void symmetrize22(M& dst) { - dst(0, 1) = dst(1, 0); - } - template - inline constexpr void symmetrize33(M& dst) { - symmetrize22(dst); - dst(0, 2) = dst(2, 0); - dst(1, 2) = dst(2, 1); - } - template - inline constexpr void symmetrize44(M& dst) { - symmetrize33(dst); - dst(0, 3) = dst(3, 0); - dst(1, 3) = dst(3, 1); - dst(2, 3) = dst(3, 2); - } - template - inline constexpr void symmetrize55(M& dst) { - symmetrize44(dst); - dst(0, 4) = dst(4, 0); - dst(1, 4) = dst(4, 1); - dst(2, 4) = dst(4, 2); - dst(3, 4) = dst(4, 3); - } - template - inline constexpr void symmetrize66(M& dst) { - symmetrize55(dst); - dst(0, 5) = dst(5, 0); - dst(1, 5) = dst(5, 1); - dst(2, 5) = dst(5, 2); - dst(3, 5) = dst(5, 3); - dst(4, 5) = dst(5, 4); - } - - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert22(src, dst); +namespace math { + namespace cholesky { + + template + inline constexpr void invert11(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + dst(0, 0) = F(1.0) / src(0, 0); + } + + template + inline constexpr void invert22(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0) * src(1, 0) * luc0; + auto luc2 = F(1.0) / (src(1, 1) - luc1); + + auto li21 = luc1 * luc0 * luc2; + + dst(0, 0) = li21 + luc0; + dst(1, 0) = -src(1, 0) * luc0 * luc2; + dst(1, 1) = luc2; + } + + template + inline constexpr void invert33(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); + luc5 = F(1.0) / luc5; + + auto li21 = -luc0 * luc1; + auto li32 = -(luc2 * luc4); + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + + dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; + dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; + dst(1, 1) = luc5 * li32 * li32 + luc2; + dst(2, 0) = luc5 * li31; + dst(2, 1) = luc5 * li32; + dst(2, 2) = luc5; + } + + template + inline constexpr void invert44(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * 
luc7 * luc7 + luc8 * (luc8 * luc5));
+      luc9 = F(1.0) / luc9;
+
+      auto li21 = -luc1 * luc0;
+      auto li32 = -luc2 * luc4;
+      auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+      auto li43 = -(luc8 * luc5);
+      auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2;
+      auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0;
+
+      dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0;
+      dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21;
+      dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2;
+      dst(2, 0) = luc9 * li41 * li43 + luc5 * li31;
+      dst(2, 1) = luc9 * li42 * li43 + luc5 * li32;
+      dst(2, 2) = luc9 * li43 * li43 + luc5;
+      dst(3, 0) = luc9 * li41;
+      dst(3, 1) = luc9 * li42;
+      dst(3, 2) = luc9 * li43;
+      dst(3, 3) = luc9;
+    }
+
+    template <typename M1, typename M2>
+    inline constexpr void invert55(M1 const& src, M2& dst) {
+      using F = decltype(src(0, 0));
+      auto luc0 = F(1.0) / src(0, 0);
+      auto luc1 = src(1, 0);
+      auto luc2 = src(1, 1) - luc0 * luc1 * luc1;
+      luc2 = F(1.0) / luc2;
+      auto luc3 = src(2, 0);
+      auto luc4 = (src(2, 1) - luc0 * luc1 * luc3);
+      auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4);
+      luc5 = F(1.0) / luc5;
+      auto luc6 = src(3, 0);
+      auto luc7 = (src(3, 1) - luc0 * luc1 * luc6);
+      auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7);
+      auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5));
+      luc9 = F(1.0) / luc9;
+      auto luc10 = src(4, 0);
+      auto luc11 = (src(4, 1) - luc0 * luc1 * luc10);
+      auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11);
+      auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12);
+      auto luc14 =
+          src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13);
+      luc14 = F(1.0) / luc14;
+
+      auto li21 = -luc1 * luc0;
+      auto li32 = -luc2 * luc4;
+      auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+      auto li43 = -(luc8 * luc5);
+      auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2;
+      auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0;
+      auto li54 = -luc13 * luc9;
+      auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5;
+      auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2;
+      auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 -
+                   luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 +
+                   luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) *
+                  luc0;
+
+      dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0;
+      dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21;
+      dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2;
+      dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31;
+      dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32;
+      dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5;
+      dst(3, 0) = luc14 * li51 * li54 + luc9 * li41;
+      dst(3, 1) = luc14 * li52 * li54 + luc9 * li42;
+      dst(3, 2) = luc14 * li53 * li54 + luc9 * li43;
+      dst(3, 3) = luc14 * li54 * li54 + luc9;
+      dst(4, 0) = luc14 * li51;
+      dst(4, 1) = luc14 * li52;
+      dst(4, 2) = luc14 * li53;
+      dst(4, 3) = luc14 * li54;
+      dst(4, 4) = luc14;
+    }
+
+    template <typename M1, typename M2>
+    inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) {
+      using F = decltype(src(0, 0));
+      auto luc0 = F(1.0) / src(0, 0);
+      auto luc1 = src(1, 0);
+      auto luc2 = src(1, 1) - luc0 * luc1 * luc1;
+      luc2 = F(1.0) / luc2;
+      auto luc3 = src(2, 0);
+      auto luc4 = (src(2, 1) - luc0 * luc1 * luc3);
+      auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4);
+      luc5 = F(1.0) / luc5;
+      auto luc6 = src(3, 0);
+      auto luc7 = (src(3, 1) - luc0 * luc1 * luc6);
+      auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7);
+      auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5));
+      luc9 = F(1.0) / luc9;
+      auto luc10 = src(4, 0);
+      auto luc11 = (src(4, 1) - luc0 * luc1 * luc10);
+      auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11);
+      auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12);
+      auto luc14 =
+          src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13);
+      luc14 = F(1.0) / luc14;
+      auto luc15 = src(5, 0);
+      auto luc16 = (src(5, 1) - luc0 * luc1 * luc15);
+      auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16);
+      auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17);
+      auto luc19 =
+          (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18);
+      auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 +
+                                luc9 * luc18 * luc18 + luc14 * luc19 * luc19);
+      luc20 = F(1.0) / luc20;
+
+      auto li21 = -luc1 * luc0;
+      auto li32 = -luc2 * luc4;
+      auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+      auto li43 = -(luc8 * luc5);
+      auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2;
+      auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0;
+      auto li54 = -luc13 * luc9;
+      auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5;
+      auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2;
+      auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 -
+                   luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 +
+                   luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) *
+                  luc0;
+
+      auto li65 = -luc19 * luc14;
+      auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9;
+      auto li63 =
+          (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5;
+      auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 -
+                   luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 +
+                   luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) *
+                  luc2;
+      auto li61 =
+          (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 +
+           luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 +
+           luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 -
+           luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 -
+           luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 -
+           luc19 * luc13 * luc6 * luc9 * luc14 + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 +
+           luc19 * luc10 * luc14 - luc15) *
+          luc0;
+
+      dst(0, 0) = luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 +
+                  luc2 * li21 * li21 + luc0;
+      dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21;
+      dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2;
+      dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31;
+      dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32;
+      dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5;
+      dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41;
+      dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42;
+      dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43;
+      dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9;
+      dst(4, 0) = luc20 * li61 * li65 + luc14 * li51;
+      dst(4, 1) = luc20 * li62 * li65 + luc14 * li52;
+      dst(4, 2) = luc20 * li63 * li65 + luc14 * li53;
+      dst(4, 3) = luc20 * li64 * li65 + luc14 * li54;
+      dst(4, 4) = luc20 * li65 * li65 + luc14;
+      dst(5, 0) = luc20 * li61;
+      dst(5, 1) = luc20 * li62;
+      dst(5, 2) = luc20 * li63;
+      dst(5, 3) = luc20 * li64;
+      dst(5, 4) = luc20 * li65;
+      dst(5, 5) = luc20;
+    }
+
+    template <typename M>
+    inline constexpr void symmetrize11(M& dst) {}
+
+    template <typename M>
+    inline constexpr void symmetrize22(M& dst) {
+      dst(0, 1) = dst(1, 0);
+    }
+
+    template <typename M>
+    inline constexpr void symmetrize33(M& dst) {
       symmetrize22(dst);
+      dst(0, 2) = dst(2, 0);
+      dst(1, 2) = dst(2, 1);
     }
-  };
-  template <typename M1, typename M2>
-  struct Inverter<M1, M2, 3> {
-    static constexpr void eval(M1 const& src, M2& dst) {
-      invert33(src, dst);
+
+    template <typename M>
+    inline constexpr void symmetrize44(M& dst) {
       symmetrize33(dst);
+      dst(0, 3) = dst(3, 0);
+      dst(1, 3) = dst(3, 1);
+      dst(2, 3) = dst(3, 2);
     }
-  };
-  template <typename M1, typename M2>
-  struct Inverter<M1, M2, 4> {
-    static constexpr void eval(M1 const& src, M2& dst) {
-      invert44(src, dst);
+
+    template <typename M>
+    inline constexpr void symmetrize55(M& dst) {
       symmetrize44(dst);
+      dst(0, 4) = dst(4, 0);
+      dst(1, 4) = dst(4, 1);
+      dst(2, 4) = dst(4, 2);
+      dst(3, 4) = dst(4, 3);
     }
-  };
-  template <typename M1, typename M2>
-  struct Inverter<M1, M2, 5> {
-    static constexpr void eval(M1 const& src, M2& dst) {
-      invert55(src, dst);
+
+    template <typename M>
+    inline constexpr void symmetrize66(M& dst) {
       symmetrize55(dst);
+      dst(0, 5) = dst(5, 0);
+      dst(1, 5) = dst(5, 1);
+      dst(2, 5) = dst(5, 2);
+      dst(3, 5) = dst(5, 3);
+      dst(4, 5) = dst(5, 4);
     }
-  };
-  template <typename M1, typename M2>
-  struct Inverter<M1, M2, 6> {
-    static constexpr void eval(M1 const& src, M2& dst) {
-      invert66(src, dst);
-      symmetrize66(dst);
-    }
-  };
-  // Eigen interface
-  template <typename D1, typename D2>
-  inline constexpr void invert(Eigen::DenseBase<D1> const& src, Eigen::DenseBase<D2>& dst) {
-    using M1 = Eigen::DenseBase<D1>;
-    using M2 = Eigen::DenseBase<D2>;
-    Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst);
-  }
+
+    template <typename M1, typename M2, int N>
+    struct Inverter {
+      static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 1> {
+      static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 2> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert22(src, dst);
+        symmetrize22(dst);
+      }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 3> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert33(src, dst);
+        symmetrize33(dst);
+      }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 4> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert44(src, dst);
+        symmetrize44(dst);
+      }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 5> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert55(src, dst);
+        symmetrize55(dst);
+      }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 6> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert66(src, dst);
+        symmetrize66(dst);
+      }
+    };
+
+    // Eigen interface
+    template <typename D1, typename D2>
+    inline constexpr void invert(Eigen::DenseBase<D1> const& src, Eigen::DenseBase<D2>& dst) {
+      using M1 = Eigen::DenseBase<D1>;
+      using M2 = Eigen::DenseBase<D2>;
+      Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst);
+    }
 
-}  // namespace choleskyInversion
+  }  // namespace cholesky
+}  // namespace math
 
 #endif  // DataFormat_Math_choleskyInversion_h
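Note: the header above unrolls an LDL^T (Cholesky-style) factorization for symmetric positive-definite matrices up to 6x6, fills only the lower triangle, and mirrors it with the symmetrizeNN() helpers; any other size falls back to Eigen's generic inverse(). A minimal usage sketch of the relocated `math::cholesky::invert` entry point follows; the include path and test values are assumptions for illustration, not taken from the diff.

```cuda
// Illustrative only: invoking the new math::cholesky::invert Eigen interface.
#include <Eigen/Core>

#include "DataFormats/Math/choleskyInversion.h"  // assumed include path

int main() {
  Eigen::Matrix3d m;
  // a symmetric positive-definite input matrix (made-up values)
  m << 4., 1., 2.,
       1., 5., 3.,
       2., 3., 6.;
  Eigen::Matrix3d inv;
  // dispatches to Inverter<..., 3>, i.e. invert33() followed by symmetrize33()
  math::cholesky::invert(m, inv);
  return (m * inv).isIdentity(1e-9) ? 0 : 1;
}
```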
diff --git a/src/cuda/plugin-PixelTriplets/gpuFishbone.h b/src/cuda/plugin-PixelTriplets/gpuFishbone.h
index b6feaae30..2e2446ea3 100644
--- a/src/cuda/plugin-PixelTriplets/gpuFishbone.h
+++ b/src/cuda/plugin-PixelTriplets/gpuFishbone.h
@@ -9,7 +9,7 @@
 #include "DataFormats/approx_atan2.h"
 #include "Geometry/phase1PixelTopology.h"
 
-#include "CUDACore/GPUVecArray.h"
+#include "CUDACore/VecArray.h"
 #include "CUDACore/cuda_assert.h"
 
 #include "GPUCACell.h"
diff --git a/src/cuda/plugin-PixelTriplets/gpuPixelDoublets.h b/src/cuda/plugin-PixelTriplets/gpuPixelDoublets.h
index d19ab8930..e906f85f1 100644
--- a/src/cuda/plugin-PixelTriplets/gpuPixelDoublets.h
+++ b/src/cuda/plugin-PixelTriplets/gpuPixelDoublets.h
@@ -7,8 +7,6 @@
 
 namespace gpuPixelDoublets {
 
-  using namespace gpuPixelDoubletsAlgos;
-
   constexpr int nPairs = 13 + 2 + 4;
   static_assert(nPairs <= CAConstants::maxNumberOfLayerPairs());
@@ -75,6 +73,17 @@ namespace gpuPixelDoublets {
     int first = blockIdx.x * blockDim.x + threadIdx.x;
     for (int i = first; i < nHits; i += gridDim.x * blockDim.x)
       isOuterHitOfCell[i].reset();
+
+    if (0 == first) {
+      cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer);
+      cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer);
+      auto i = cellNeighbors->extend();
+      assert(0 == i);
+      (*cellNeighbors)[0].reset();
+      i = cellTracks->extend();
+      assert(0 == i);
+      (*cellTracks)[0].reset();
+    }
   }
 
   constexpr auto getDoubletsFromHistoMaxBlockSize = 64;  // for both x and y
diff --git a/src/cuda/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h b/src/cuda/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h
index 3accacf63..6d6a62c88 100644
--- a/src/cuda/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h
+++ b/src/cuda/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h
@@ -9,13 +9,13 @@
 #include "CUDADataFormats/TrackingRecHit2DCUDA.h"
 #include "DataFormats/approx_atan2.h"
 
-#include "CUDACore/GPUVecArray.h"
+#include "CUDACore/VecArray.h"
 #include "CUDACore/cuda_assert.h"
 
 #include "CAConstants.h"
 #include "GPUCACell.h"
 
-namespace gpuPixelDoubletsAlgos {
+namespace gpuPixelDoublets {
 
   using CellNeighbors = CAConstants::CellNeighbors;
   using CellTracks = CAConstants::CellTracks;
@@ -239,6 +239,6 @@ namespace gpuPixelDoubletsAlgos {
     }  // loop in block...
   }
 
-}  // namespace gpuPixelDoubletsAlgos
+}  // namespace gpuPixelDoublets
 
 #endif  // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h
diff --git a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h
index f98cbf959..201971770 100644
--- a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h
+++ b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h
@@ -48,7 +48,7 @@ namespace gpuVertexFinder {
     assert(pdata);
     assert(zt);
 
-    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
@@ -101,7 +101,7 @@ namespace gpuVertexFinder {
        nn[i]++;
      };
 
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __syncthreads();
@@ -122,7 +122,7 @@ namespace gpuVertexFinder {
        mdist = dist;
        iv[i] = j;  // assign to cluster (better be unique??)
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __syncthreads();
@@ -171,7 +171,7 @@ namespace gpuVertexFinder {
        mdist = dist;
        minJ = j;
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
      // should belong to the same cluster...
      assert(iv[i] == iv[minJ]);
      assert(nn[i] <= nn[iv[i]]);
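Note: the recurring change in the vertex-finding kernels is purely a namespace move, the histogram helpers now live in `cms::cuda`. For readers unfamiliar with `forEachInBins`: it applies a functor to every entry in the bin of a value and in its n neighbouring bins on each side. A simplified sketch of that idea follows (the real implementation lives in `CUDACore/HistoContainer.h`; this is an approximation under assumed member names, not a copy):

```cuda
// Simplified sketch: visit all elements whose bin index is within +-n of
// bin(value), clamping the range to the histogram bounds.
#include <algorithm>

template <typename Hist, typename V, typename Func>
__host__ __device__ void forEachInBinsSketch(Hist const& hist, V value, int n, Func func) {
  int bs = Hist::bin(value);                       // central bin of the value
  int be = std::min(int(Hist::nbins()) - 1, bs + n);
  bs = std::max(0, bs - n);
  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj)
    func(*pj);                                     // user code sees each entry
}
```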
diff --git a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h
index 8ac205b00..504ee4cf2 100644
--- a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h
+++ b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h
@@ -44,7 +44,7 @@ namespace gpuVertexFinder {
     assert(pdata);
     assert(zt);
 
-    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
@@ -96,7 +96,7 @@ namespace gpuVertexFinder {
        nn[i]++;
      };
 
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __syncthreads();
@@ -118,7 +118,7 @@ namespace gpuVertexFinder {
        mz = zt[j];
        iv[i] = j;  // assign to cluster (better be unique??)
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __syncthreads();
@@ -172,7 +172,7 @@ namespace gpuVertexFinder {
        }
        assert(iv[i] == iv[j]);
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
    __syncthreads();
 #endif
@@ -194,7 +194,7 @@ namespace gpuVertexFinder {
        mdist = dist;
        iv[i] = iv[j];  // assign to cluster (better be unique??)
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __shared__ unsigned int foundClusters;
diff --git a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksIterative.h b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksIterative.h
index dd6f5fef7..6e7da0efd 100644
--- a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksIterative.h
+++ b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksIterative.h
@@ -44,7 +44,7 @@ namespace gpuVertexFinder {
     assert(pdata);
     assert(zt);
 
-    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
@@ -97,7 +97,7 @@ namespace gpuVertexFinder {
        nn[i]++;
      };
 
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __shared__ int nloops;
@@ -165,7 +165,7 @@ namespace gpuVertexFinder {
        mdist = dist;
        iv[i] = iv[j];  // assign to cluster (better be unique??)
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __shared__ unsigned int foundClusters;
diff --git a/src/cuda/plugin-PixelVertexFinding/gpuVertexFinderImpl.h b/src/cuda/plugin-PixelVertexFinding/gpuVertexFinderImpl.h
index a1f1f96bf..f3260cad7 100644
--- a/src/cuda/plugin-PixelVertexFinding/gpuVertexFinderImpl.h
+++ b/src/cuda/plugin-PixelVertexFinding/gpuVertexFinderImpl.h
@@ -109,7 +109,7 @@ namespace gpuVertexFinder {
     loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
     cudaCheck(cudaGetLastError());
 #else
-    cudaCompat::resetGrid();
+    cms::cudacompat::resetGrid();
     init(soa, ws_d.get());
     loadTracks(tksoa, soa, ws_d.get(), ptMin);
 #endif
diff --git a/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
index 207195bda..06624744e 100644
--- a/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
+++ b/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
@@ -44,6 +44,7 @@ class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork {
   std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
   PixelFormatterErrors errors_;
 
+  const bool isRun2_;
   const bool includeErrors_;
   const bool useQuality_;
 };
@@ -52,6 +53,7 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg)
     : rawGetToken_(reg.consumes<FEDRawDataCollection>()),
       digiPutToken_(reg.produces<cms::cuda::Product<SiPixelDigisCUDA>>()),
       clusterPutToken_(reg.produces<cms::cuda::Product<SiPixelClustersCUDA>>()),
+      isRun2_(true),
       includeErrors_(true),
       useQuality_(true) {
   if (includeErrors_) {
@@ -145,7 +147,8 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
   }  // end of for loop
 
-  gpuAlgo_.makeClustersAsync(gpuMap,
+  gpuAlgo_.makeClustersAsync(isRun2_,
+                             gpuMap,
                              gpuModulesToUnpack,
                              gpuGains,
                              *wordFedAppender_,
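Note on the `cudaCompat` to `cms::cudacompat` rename in gpuVertexFinderImpl.h above: the compat layer is what lets the same kernel source run serially in the `#else` branch. A rough sketch of the idea under assumptions (this is not the contents of the real `CUDACore/cudaCompat.h`, only an illustration of why a `resetGrid()` call is needed before invoking "kernels" as plain functions):

```cuda
// Assumed sketch: on the CPU the grid/block builtins become ordinary globals,
// and resetGrid() returns them to a 1x1 configuration before each "launch".
namespace cms {
  namespace cudacompat {
    struct DimSketch {
      unsigned x = 1, y = 1, z = 1;
    };
    inline DimSketch blockIdx;  // shared state read by the serial "kernels"
    inline DimSketch gridDim;
    inline void resetGrid() {
      blockIdx = DimSketch{};
      gridDim = DimSketch{};
    }
  }  // namespace cudacompat
}  // namespace cms
```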
-#include "SiPixelRawToClusterGPUKernel.h" - namespace pixelgpudetails { // number of words for all the FEDs @@ -363,7 +358,7 @@ namespace pixelgpudetails { uint32_t *pdigi, uint32_t *rawIdArr, uint16_t *moduleId, - GPU::SimpleVector *err, + cms::cuda::SimpleVector *err, bool useQualityInfo, bool includeErrors, bool debug) { @@ -489,8 +484,8 @@ namespace pixelgpudetails { } __shared__ uint32_t ws[32]; - blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws); - blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClustering::MaxNumModules - 1024, ws); + cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws); + cms::cuda::blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClustering::MaxNumModules - 1024, ws); for (int i = first + 1025, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) { moduleStart[i] += moduleStart[1024]; @@ -524,7 +519,8 @@ namespace pixelgpudetails { } // Interface to outside - void SiPixelRawToClusterGPUKernel::makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, + void SiPixelRawToClusterGPUKernel::makeClustersAsync(bool isRun2, + const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender &wordFed, @@ -600,7 +596,8 @@ namespace pixelgpudetails { int blocks = (std::max(int(wordCounter), int(gpuClustering::MaxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; - gpuCalibPixel::calibDigis<<>>(digis_d.moduleInd(), + gpuCalibPixel::calibDigis<<>>(isRun2, + digis_d.moduleInd(), digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(), diff --git a/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h b/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h index a9e7a13cc..3cbce9e71 100644 --- a/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h +++ b/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h @@ -7,7 +7,7 @@ #include "CUDADataFormats/SiPixelDigisCUDA.h" #include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDACore/GPUSimpleVector.h" +#include "CUDACore/SimpleVector.h" #include "CUDACore/host_unique_ptr.h" #include "CUDACore/host_noncached_unique_ptr.h" #include "DataFormats/PixelErrors.h" @@ -167,7 +167,8 @@ namespace pixelgpudetails { SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; - void makeClustersAsync(const SiPixelFedCablingMapGPU* cablingMap, + void makeClustersAsync(bool isRun2, + const SiPixelFedCablingMapGPU* cablingMap, const unsigned char* modToUnp, const SiPixelGainForHLTonGPU* gains, const WordFedAppender& wordFed, diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuCalibPixel.h b/src/cuda/plugin-SiPixelClusterizer/gpuCalibPixel.h index 029b0912d..da36be6c4 100644 --- a/src/cuda/plugin-SiPixelClusterizer/gpuCalibPixel.h +++ b/src/cuda/plugin-SiPixelClusterizer/gpuCalibPixel.h @@ -13,12 +13,14 @@ namespace gpuCalibPixel { constexpr uint16_t InvId = 9999; // must be > MaxNumModules + // valid for run2 constexpr float VCaltoElectronGain = 47; // L2-4: 47 +- 4.7 constexpr float VCaltoElectronGain_L1 = 50; // L1: 49.6 +- 2.6 constexpr float VCaltoElectronOffset = -60; // L2-4: -60 +- 130 constexpr float VCaltoElectronOffset_L1 = -670; // L1: -670 +- 220 - __global__ void calibDigis(uint16_t* id, + __global__ void calibDigis(bool isRun2, + uint16_t* id, uint16_t const* __restrict__ x, uint16_t const* 
__restrict__ y, uint16_t* adc, @@ -41,8 +43,8 @@ namespace gpuCalibPixel { if (InvId == id[i]) continue; - float conversionFactor = id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain; - float offset = id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; + float conversionFactor = (isRun2) ? (id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f; + float offset = (isRun2) ? (id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset) : 0; bool isDeadColumn = false, isNoisyColumn = false; diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h index ef21acc36..d0dd93044 100644 --- a/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h +++ b/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h @@ -89,7 +89,7 @@ namespace gpuClustering { // renumber __shared__ uint16_t ws[32]; - blockPrefixScan(newclusId, nclus, ws); + cms::cuda::blockPrefixScan(newclusId, nclus, ws); assert(nclus >= newclusId[nclus - 1]); diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h b/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h index cb3a0244f..84609bd10 100644 --- a/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h +++ b/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h @@ -80,7 +80,7 @@ namespace gpuClustering { //init hist (ymax=416 < 512 : 9bits) constexpr uint32_t maxPixInModule = 4000; constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; - using Hist = HistoContainer; + using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; __shared__ typename Hist::Counter ws[32]; for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { diff --git a/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu b/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu index 7b93508e5..4cd3fc152 100644 --- a/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu +++ b/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu @@ -63,9 +63,7 @@ namespace pixelgpudetails { } if (nHits) { - auto hws = cms::cuda::make_device_unique(TrackingRecHit2DSOAView::Hist::wsSize(), stream); - cms::cuda::fillManyFromVector( - hits_d.phiBinner(), hws.get(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); + cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); cudaCheck(cudaGetLastError()); } diff --git a/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h index f70cab66c..433d3b012 100644 --- a/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h +++ b/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h @@ -7,14 +7,14 @@ #include "CUDADataFormats/BeamSpotCUDA.h" #include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "DataFormats/approx_atan2.h" #include "CUDACore/cuda_assert.h" #include "CondFormats/pixelCPEforGPU.h" -#include "DataFormats/approx_atan2.h" namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, - BeamSpotCUDA::Data const* __restrict__ bs, + BeamSpotPOD const* __restrict__ bs, SiPixelDigisCUDA::DeviceConstView const* __restrict__ pdigis, int numElements, SiPixelClustersCUDA::DeviceConstView const* __restrict__ pclusters, @@ -134,6 +134,9 @@ namespace gpuPixelRecHits { __syncthreads(); + // pixmx is not available in the binary dumps + //auto pixmx = cpeParams->detParams(me).pixmx; + auto pixmx = std::numeric_limits::max(); for (int i = first; i < numElements; i += blockDim.x) { auto id = digis.moduleInd(i); if (id == InvId) @@ -148,7 +151,7 
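Note: the new `isRun2` flag threaded from `SiPixelRawToClusterCUDA` through `makeClustersAsync` ends in the selection above. A standalone approximation of what the flag controls follows, using the constants from gpuCalibPixel.h; this is only the gain/offset selection, the real kernel also applies the per-pixel gain and pedestal correction, and the function name here is hypothetical.

```cuda
// Standalone approximation of the Run-2 VCal -> electrons conversion; with
// isRun2 == false the digis are taken as already calibrated (gain 1, offset 0).
#include <cstdint>

float vcalToElectronsSketch(float vcal, uint16_t moduleId, bool isRun2) {
  constexpr float VCaltoElectronGain = 47;         // L2-4
  constexpr float VCaltoElectronGain_L1 = 50;      // L1
  constexpr float VCaltoElectronOffset = -60;      // L2-4
  constexpr float VCaltoElectronOffset_L1 = -670;  // L1
  float conversionFactor = isRun2 ? (moduleId < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f;
  float offset = isRun2 ? (moduleId < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset) : 0.f;
  return vcal * conversionFactor + offset;
}
```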
diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h
index ef21acc36..d0dd93044 100644
--- a/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h
+++ b/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h
@@ -89,7 +89,7 @@ namespace gpuClustering {
 
     // renumber
     __shared__ uint16_t ws[32];
-    blockPrefixScan(newclusId, nclus, ws);
+    cms::cuda::blockPrefixScan(newclusId, nclus, ws);
 
     assert(nclus >= newclusId[nclus - 1]);
diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h b/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h
index cb3a0244f..84609bd10 100644
--- a/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h
+++ b/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h
@@ -80,7 +80,7 @@ namespace gpuClustering {
     //init hist  (ymax=416 < 512 : 9bits)
     constexpr uint32_t maxPixInModule = 4000;
     constexpr auto nbins = phase1PixelTopology::numColsInModule + 2;  //2+2;
-    using Hist = HistoContainer<uint16_t, nbins, maxPixInModule, 9, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, 9, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter ws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
diff --git a/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu b/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu
index 7b93508e5..4cd3fc152 100644
--- a/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu
+++ b/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu
@@ -63,9 +63,7 @@ namespace pixelgpudetails {
     }
 
     if (nHits) {
-      auto hws = cms::cuda::make_device_unique<uint8_t[]>(TrackingRecHit2DSOAView::Hist::wsSize(), stream);
-      cms::cuda::fillManyFromVector(
-          hits_d.phiBinner(), hws.get(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream);
+      cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream);
       cudaCheck(cudaGetLastError());
     }
diff --git a/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h
index f70cab66c..433d3b012 100644
--- a/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h
+++ b/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h
@@ -7,14 +7,14 @@
 #include "CUDADataFormats/BeamSpotCUDA.h"
 #include "CUDADataFormats/TrackingRecHit2DCUDA.h"
+#include "DataFormats/approx_atan2.h"
 #include "CUDACore/cuda_assert.h"
 #include "CondFormats/pixelCPEforGPU.h"
-#include "DataFormats/approx_atan2.h"
 
 namespace gpuPixelRecHits {
 
   __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams,
-                          BeamSpotCUDA::Data const* __restrict__ bs,
+                          BeamSpotPOD const* __restrict__ bs,
                           SiPixelDigisCUDA::DeviceConstView const* __restrict__ pdigis,
                           int numElements,
                          SiPixelClustersCUDA::DeviceConstView const* __restrict__ pclusters,
@@ -134,6 +134,9 @@ namespace gpuPixelRecHits {
 
       __syncthreads();
 
+      // pixmx is not available in the binary dumps
+      //auto pixmx = cpeParams->detParams(me).pixmx;
+      auto pixmx = std::numeric_limits<uint16_t>::max();
       for (int i = first; i < numElements; i += blockDim.x) {
         auto id = digis.moduleInd(i);
         if (id == InvId)
@@ -148,7 +151,7 @@ namespace gpuPixelRecHits {
         assert(cl < MaxHitsInIter);
         auto x = digis.xx(i);
         auto y = digis.yy(i);
-        auto ch = digis.adc(i);
+        auto ch = std::min(digis.adc(i), pixmx);
         atomicAdd(&clusParams.charge[cl], ch);
         if (clusParams.minRow[cl] == x)
           atomicAdd(&clusParams.Q_f_X[cl], ch);
diff --git a/src/cuda/test/AtomicPairCounter_t.cu b/src/cuda/test/AtomicPairCounter_t.cu
index 39041b4b4..8e737e9cb 100644
--- a/src/cuda/test/AtomicPairCounter_t.cu
+++ b/src/cuda/test/AtomicPairCounter_t.cu
@@ -1,9 +1,12 @@
-#include "CUDACore/cudaCheck.h"
-#include "CUDACore/AtomicPairCounter.h"
+#include <iostream>
+
+#include
+#include "CUDACore/AtomicPairCounter.h"
+#include "CUDACore/cudaCheck.h"
 #include "CUDACore/cuda_assert.h"
 
-__global__ void update(AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) {
+__global__ void update(cms::cuda::AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i >= n)
     return;
@@ -17,12 +20,12 @@ __global__ void update(cms::cuda::AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uin
     cont[j] = i;
 };
 
-__global__ void finalize(AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) {
+__global__ void finalize(cms::cuda::AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) {
   assert(dc->get().m == n);
   ind[n] = dc->get().n;
 }
 
-__global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_t const *cont, uint32_t n) {
+__global__ void verify(cms::cuda::AtomicPairCounter const *dc, uint32_t const *ind, uint32_t const *cont, uint32_t n) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i >= n)
     return;
@@ -37,13 +40,12 @@ __global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_
   assert(cont[ib] == k);
 }
 
-#include <iostream>
 int main() {
-  AtomicPairCounter *dc_d;
-  cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter)));
-  cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
+  cms::cuda::AtomicPairCounter *dc_d;
+  cudaCheck(cudaMalloc(&dc_d, sizeof(cms::cuda::AtomicPairCounter)));
+  cudaCheck(cudaMemset(dc_d, 0, sizeof(cms::cuda::AtomicPairCounter)));
 
-  std::cout << "size " << sizeof(AtomicPairCounter) << std::endl;
+  std::cout << "size " << sizeof(cms::cuda::AtomicPairCounter) << std::endl;
 
   constexpr uint32_t N = 20000;
   constexpr uint32_t M = N * 6;
@@ -56,8 +58,8 @@ int main() {
   finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000);
   verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000);
 
-  AtomicPairCounter dc;
-  cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost));
+  cms::cuda::AtomicPairCounter dc;
+  cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(cms::cuda::AtomicPairCounter), cudaMemcpyDeviceToHost));
 
   std::cout << dc.get().n << ' ' << dc.get().m << std::endl;
diff --git a/src/cuda/test/HistoContainer_t.cu b/src/cuda/test/HistoContainer_t.cu
index 0cae01f56..15aafe0d3 100644
--- a/src/cuda/test/HistoContainer_t.cu
+++ b/src/cuda/test/HistoContainer_t.cu
@@ -4,9 +4,12 @@
 #include
 #include
 
-#include "CUDACore/device_unique_ptr.h"
-#include "CUDACore/cudaCheck.h"
 #include "CUDACore/HistoContainer.h"
+#include "CUDACore/cudaCheck.h"
+#include "CUDACore/device_unique_ptr.h"
+#include "CUDACore/requireDevices.h"
+
+using namespace cms::cuda;
 
 template <typename T>
 void go() {
@@ -15,7 +18,7 @@ void go() {
   constexpr int N = 12000;
   T v[N];
 
-  auto v_d = cms::cuda::make_device_unique<T[]>(N, nullptr);
+  auto v_d = make_device_unique<T[]>(N, nullptr);
 
   cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
@@ -25,14 +28,13 @@ void go() {
   using Hist = HistoContainer;
   std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' '
-            << Hist::capacity() << ' ' << Hist::wsSize() << ' '
+            << Hist::capacity() << ' '
             << offsetof(Hist, bins) - offsetof(Hist, off) << ' '
             << (std::numeric_limits<T>::max() - std::numeric_limits<T>::min()) / Hist::nbins() << std::endl;
 
   Hist h;
-  auto h_d = cms::cuda::make_device_unique<Hist[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique<uint8_t[]>(Hist::wsSize(), nullptr);
+  auto h_d = make_device_unique<Hist[]>(1, nullptr);
 
-  auto off_d = cms::cuda::make_device_unique<uint32_t[]>(nParts + 1, nullptr);
+  auto off_d = make_device_unique<uint32_t[]>(nParts + 1, nullptr);
 
   for (int it = 0; it < 5; ++it) {
     offsets[0] = 0;
@@ -67,7 +69,7 @@ void go() {
 
     cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
-    cms::cuda::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
+    fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
     cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost));
     assert(0 == h.off[0]);
     assert(offsets[10] == h.size());
@@ -144,6 +146,8 @@ void go() {
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   go();
   go();
diff --git a/src/cuda/test/HistoContainer_t_cpu.cc b/src/cuda/test/HistoContainer_t_cpu.cc
index 475a5484f..ad1121ef1 100644
--- a/src/cuda/test/HistoContainer_t_cpu.cc
+++ b/src/cuda/test/HistoContainer_t_cpu.cc
@@ -5,6 +5,9 @@
 #include
 
 #include "CUDACore/HistoContainer.h"
+#include "CUDACore/requireDevices.h"
+
+using namespace cms::cuda;
 
 template <typename T>
 void go() {
diff --git a/src/cuda/test/OneHistoContainer_t.cu b/src/cuda/test/OneHistoContainer_t.cu
index a33b9d032..960f77eca 100644
--- a/src/cuda/test/OneHistoContainer_t.cu
+++ b/src/cuda/test/OneHistoContainer_t.cu
@@ -4,10 +4,13 @@
 #include
 #include
 
-#include "CUDACore/device_unique_ptr.h"
-#include "CUDACore/cudaCheck.h"
 #include "CUDACore/HistoContainer.h"
+#include "CUDACore/cudaCheck.h"
+#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
+#include "CUDACore/requireDevices.h"
+
+using namespace cms::cuda;
 
 template <typename T, int NBINS, int S, int DELTA>
 __global__ void mykernel(T const* __restrict__ v, uint32_t N) {
@@ -105,7 +108,7 @@ void go() {
   constexpr int N = 12000;
   T v[N];
 
-  auto v_d = cms::cuda::make_device_unique<T[]>(N, nullptr);
+  auto v_d = make_device_unique<T[]>(N, nullptr);
   assert(v_d.get());
 
   using Hist = HistoContainer;
@@ -124,11 +127,13 @@ void go() {
     assert(v);
     cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
     assert(v_d.get());
-    cms::cuda::launch(mykernel<T, NBINS, S, DELTA>, {1, 256}, v_d.get(), N);
+    launch(mykernel<T, NBINS, S, DELTA>, {1, 256}, v_d.get(), N);
   }
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   go();
   go();
   go();
diff --git a/src/cuda/test/OneToManyAssoc_t.h b/src/cuda/test/OneToManyAssoc_t.h
index b7506aa91..69c3ade3d 100644
--- a/src/cuda/test/OneToManyAssoc_t.h
+++ b/src/cuda/test/OneToManyAssoc_t.h
@@ -9,20 +9,20 @@
 #ifdef __CUDACC__
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/currentDevice.h"
 #endif
 
 #include "CUDACore/HistoContainer.h"
 
+using cms::cuda::AtomicPairCounter;
 
 constexpr uint32_t MaxElem = 64000;
 constexpr uint32_t MaxTk = 8000;
 constexpr uint32_t MaxAssocs = 4 * MaxTk;
 
-using Assoc = OneToManyAssoc<uint16_t, MaxElem, MaxAssocs>;
-
-using SmallAssoc = OneToManyAssoc<uint16_t, 128, MaxAssocs>;
-
-using Multiplicity = OneToManyAssoc<uint16_t, 8, MaxTk>;
+using Assoc = cms::cuda::OneToManyAssoc<uint16_t, MaxElem, MaxAssocs>;
+using SmallAssoc = cms::cuda::OneToManyAssoc<uint16_t, 128, MaxAssocs>;
+using Multiplicity = cms::cuda::OneToManyAssoc<uint16_t, 8, MaxTk>;
 
 using TK = std::array<uint16_t, 4>;
 
 __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) {
@@ -97,6 +97,7 @@ __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter co
 
 int main() {
 #ifdef __CUDACC__
+  cms::cudatest::requireDevices();
   auto current_device = cms::cuda::currentDevice();
 #else
   // make sure cuda emulation is working
@@ -117,9 +118,9 @@ int main() {
   assert(gridDim.z == 1);
 #endif
 
-  std::cout << "OneToManyAssoc " << Assoc::nbins() << ' ' << Assoc::capacity() << ' ' << Assoc::wsSize() << std::endl;
-  std::cout << "OneToManyAssoc (small) " << SmallAssoc::nbins() << ' ' << SmallAssoc::capacity() << ' '
-            << SmallAssoc::wsSize() << std::endl;
+  std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl;
+  std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' '
+            << SmallAssoc::capacity() << std::endl;
 
   std::mt19937 eng;
@@ -168,8 +169,6 @@ int main() {
   assert(v_d.get());
   auto a_d = cms::cuda::make_device_unique<Assoc[]>(1, nullptr);
   auto sa_d = cms::cuda::make_device_unique<SmallAssoc[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique<uint8_t[]>(Assoc::wsSize(), nullptr);
-
   cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
   auto a_d = std::make_unique<Assoc>();
@@ -177,7 +176,7 @@ int main() {
   auto v_d = tr.data();
 #endif
 
-  cms::cuda::launchZero(a_d.get(), 0);
+  launchZero(a_d.get(), 0);
 
 #ifdef __CUDACC__
   auto nThreads = 256;
@@ -185,12 +184,12 @@ int main() {
 
   count<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
 
-  cms::cuda::launchFinalize(a_d.get(), ws_d.get(), 0);
+  launchFinalize(a_d.get(), 0);
   verify<<<1, 1>>>(a_d.get());
   fill<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
 #else
   count(v_d, a_d.get(), N);
-  cms::cuda::launchFinalize(a_d.get());
+  launchFinalize(a_d.get());
   verify(a_d.get());
   fill(v_d, a_d.get(), N);
 #endif
@@ -228,7 +227,7 @@ int main() {
   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   nBlocks = (N + nThreads - 1) / nThreads;
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), a_d.get(), N);
-  cms::cuda::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
+  finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
   verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
 
   cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost));
@@ -236,19 +235,19 @@ int main() {
   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), sa_d.get(), N);
-  cms::cuda::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
+  finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
   verifyBulk<<<1, 1>>>(sa_d.get(), dc_d);
 
 #else
   dc_d = &dc;
   fillBulk(dc_d, v_d, a_d.get(), N);
-  cms::cuda::finalizeBulk(dc_d, a_d.get());
+  finalizeBulk(dc_d, a_d.get());
   verifyBulk(a_d.get(), dc_d);
 
   memcpy(&la, a_d.get(), sizeof(Assoc));
 
   AtomicPairCounter sdc(0);
   fillBulk(&sdc, v_d, sa_d.get(), N);
-  cms::cuda::finalizeBulk(&sdc, sa_d.get());
+  finalizeBulk(&sdc, sa_d.get());
   verifyBulk(sa_d.get(), &sdc);
 #endif
@@ -277,8 +276,8 @@ int main() {
   auto m1_d = std::make_unique<Multiplicity>();
   auto m2_d = std::make_unique<Multiplicity>();
 #endif
-  cms::cuda::launchZero(m1_d.get(), 0);
-  cms::cuda::launchZero(m2_d.get(), 0);
+  launchZero(m1_d.get(), 0);
+  launchZero(m2_d.get(), 0);
 
 #ifdef __CUDACC__
   nBlocks = (4 * N + nThreads - 1) / nThreads;
   countMulti<<<nBlocks, nThreads>>>(v_d.get(), m1_d.get(), N);
   countMultiLocal<<<nBlocks, nThreads>>>(v_d.get(), m2_d.get(), N);
   verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
 
-  cms::cuda::launchFinalize(m1_d.get(), ws_d.get(), 0);
-  cms::cuda::launchFinalize(m2_d.get(), ws_d.get(), 0);
+  launchFinalize(m1_d.get(), 0);
+  launchFinalize(m2_d.get(), 0);
   verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
 
   cudaCheck(cudaGetLastError());
@@ -297,8 +296,8 @@ int main() {
   countMultiLocal(v_d, m2_d.get(), N);
   verifyMulti(m1_d.get(), m2_d.get());
 
-  cms::cuda::launchFinalize(m1_d.get());
-  cms::cuda::launchFinalize(m2_d.get());
+  launchFinalize(m1_d.get());
+  launchFinalize(m2_d.get());
   verifyMulti(m1_d.get(), m2_d.get());
 #endif
   return 0;
 }
diff --git a/src/cuda/test/TrajectoryStateSOA_t.h b/src/cuda/test/TrajectoryStateSOA_t.h
index c0892fe1e..2fcf9fc09 100644
--- a/src/cuda/test/TrajectoryStateSOA_t.h
+++ b/src/cuda/test/TrajectoryStateSOA_t.h
@@ -51,10 +51,15 @@ __global__ void testTSSoA(TS* pts, int n) {
 }
 
 #ifdef __CUDACC__
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/cudaCheck.h"
 #endif
 
 int main() {
+#ifdef __CUDACC__
+  cms::cudatest::requireDevices();
+#endif
+
   TS ts;
 
 #ifdef __CUDACC__
diff --git a/src/cuda/test/VertexFinder_t.h b/src/cuda/test/VertexFinder_t.h
index 1f944aa71..53f26d2de 100644
--- a/src/cuda/test/VertexFinder_t.h
+++ b/src/cuda/test/VertexFinder_t.h
@@ -5,16 +5,17 @@
 #include <vector>
 
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/launch.h"
 #ifdef USE_DBSCAN
 #include "plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h"
-#define CLUSTERIZE clusterTracksDBSCAN
+#define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
 #elif USE_ITERATIVE
 #include "plugin-PixelVertexFinding/gpuClusterTracksIterative.h"
-#define CLUSTERIZE clusterTracksIterative
+#define CLUSTERIZE gpuVertexFinder::clusterTracksIterative
 #else
 #include "plugin-PixelVertexFinding/gpuClusterTracksByDensity.h"
-#define CLUSTERIZE clusterTracksByDensityKernel
+#define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel
 #endif
 #include "plugin-PixelVertexFinding/gpuFitVertices.h"
 #include "plugin-PixelVertexFinding/gpuSortByPt2.h"
@@ -42,8 +43,6 @@ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
 #endif
 #endif
 
-using namespace gpuVertexFinder;
-
 struct Event {
   std::vector<float> zvert;
   std::vector<uint16_t> itrack;
@@ -102,10 +101,10 @@ struct ClusterGenerator {
 };
 
 // a macro SORRY
-#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(ZVertices, M))
-#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(WorkSpace, M))
+#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
+#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
 
-__global__ void print(ZVertices const* pdata, WorkSpace const* pws) {
+__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
   auto const& __restrict__ data = *pdata;
   auto const& __restrict__ ws = *pws;
   printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
@@ -113,11 +112,13 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) {
 
 int main() {
 #ifdef __CUDACC__
-  auto onGPU_d = cms::cuda::make_device_unique<ZVertices[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique<WorkSpace[]>(1, nullptr);
+  cms::cudatest::requireDevices();
+
+  auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
+  auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
 #else
-  auto onGPU_d = std::make_unique<ZVertices>();
-  auto ws_d = std::make_unique<WorkSpace>();
+  auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
+  auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
 #endif
 
   Event ev;
@@ -180,7 +181,7 @@ int main() {
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
-      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
 
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
@@ -242,7 +243,7 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
@@ -262,7 +263,7 @@ int main() {
 
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cms::cuda::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
      cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
      gridDim.x = 1;
@@ -274,10 +275,10 @@ int main() {
      std::cout << "after split " << nv << std::endl;
 
 #ifdef __CUDACC__
-      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
      cudaCheck(cudaGetLastError());
 
-      cms::cuda::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
      cudaCheck(cudaGetLastError());
      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
diff --git a/src/cuda/test/cudastdAlgorithm_t.cu b/src/cuda/test/cudastdAlgorithm_t.cu
index b638624d1..4ecc93cf5 100644
--- a/src/cuda/test/cudastdAlgorithm_t.cu
+++ b/src/cuda/test/cudastdAlgorithm_t.cu
@@ -2,6 +2,7 @@
 #include
 
 #include "CUDACore/cudastdAlgorithm.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/launch.h"
 
 __global__ void testBinaryFind() {
@@ -22,4 +23,8 @@ __global__ void testBinaryFind() {
 
 void wrapper() { cms::cuda::launch(testBinaryFind, {32, 64}); }
 
-int main() { wrapper(); }
+int main() {
+  cms::cudatest::requireDevices();
+
+  wrapper();
+}
diff --git a/src/cuda/test/eigenSoA_t.h b/src/cuda/test/eigenSoA_t.h
index afc17dd5c..5409bce10 100644
--- a/src/cuda/test/eigenSoA_t.h
+++ b/src/cuda/test/eigenSoA_t.h
@@ -22,7 +22,7 @@ __global__ void testBasicSoA(float* p) {
   assert(!isPowerOf2(1026));
 
   using M3 = Eigen::Matrix<float, 3, 3>;
-  ;
+
   __shared__ eigenSoA::MatrixSoA<M3, 64> m;
 
   int first = threadIdx.x + blockIdx.x * blockDim.x;
@@ -59,10 +59,15 @@ __global__ void testBasicSoA(float* p) {
 #include <random>
 
 #ifdef __CUDACC__
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/cudaCheck.h"
 #endif
 
 int main() {
+#ifdef __CUDACC__
+  cms::cudatest::requireDevices();
+#endif
+
   float p[1024];
 
   std::uniform_real_distribution<float> rgen(0.01, 0.99);
diff --git a/src/cuda/test/gpuClustering_t.h b/src/cuda/test/gpuClustering_t.h
index 66b8adaa8..5388e3499 100644
--- a/src/cuda/test/gpuClustering_t.h
+++ b/src/cuda/test/gpuClustering_t.h
@@ -13,6 +13,7 @@
 #ifdef __CUDACC__
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/launch.h"
 #endif
 
@@ -21,6 +22,10 @@
 #include "plugin-SiPixelClusterizer/gpuClusterChargeCut.h"
 
 int main(void) {
+#ifdef __CUDACC__
+  cms::cudatest::requireDevices();
+#endif
+
   using namespace gpuClustering;
 
   int numElements = 256 * 2000;
diff --git a/src/cuda/test/prefixScan_t.cu b/src/cuda/test/prefixScan_t.cu
index de1624db8..307c989b0 100644
--- a/src/cuda/test/prefixScan_t.cu
+++ b/src/cuda/test/prefixScan_t.cu
@@ -1,9 +1,22 @@
 #include <iostream>
 
-#include <cub/cub.cuh>
-
 #include "CUDACore/cudaCheck.h"
 #include "CUDACore/prefixScan.h"
+#include "CUDACore/requireDevices.h"
+
+using namespace cms::cuda;
+
+template <typename T>
+struct format_traits {
+public:
+  static const constexpr char *failed_msg = "failed %d %d %d: %d %d\n";
+};
+
+template <>
+struct format_traits<float> {
+public:
+  static const constexpr char *failed_msg = "failed %d %d %d: %f %f\n";
+};
 
 template <typename T>
 __global__ void testPrefixScan(uint32_t size) {
@@ -23,7 +36,7 @@ __global__ void testPrefixScan(uint32_t size) {
   assert(1 == co[0]);
   for (auto i = first + 1; i < size; i += blockDim.x) {
     if (c[i] != c[i - 1] + 1)
-      printf("failed %d %d %d: %d %d\n", size, i, blockDim.x, c[i], c[i - 1]);
+      printf(format_traits<T>::failed_msg, size, i, blockDim.x, c[i], c[i - 1]);
     assert(c[i] == c[i - 1] + 1);
     assert(c[i] == i + 1);
     assert(c[i] = co[i]);
@@ -47,7 +60,7 @@ __global__ void testWarpPrefixScan(uint32_t size) {
   assert(1 == co[0]);
   if (i != 0) {
     if (c[i] != c[i - 1] + 1)
-      printf("failed %d %d %d: %d %d\n", size, i, blockDim.x, c[i], c[i - 1]);
+      printf(format_traits<T>::failed_msg, size, i, blockDim.x, c[i], c[i - 1]);
     assert(c[i] == c[i - 1] + 1);
     assert(c[i] == i + 1);
     assert(c[i] = co[i]);
@@ -71,6 +84,8 @@ __global__ void verify(uint32_t const *v, uint32_t n) {
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   std::cout << "warp level" << std::endl;
   // std::cout << "warp 32" << std::endl;
   testWarpPrefixScan<<<1, 32>>>(32);
@@ -117,34 +132,17 @@ int main() {
     // the block counter
     int32_t *d_pc;
     cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t)));
-    cudaCheck(cudaMemset(d_pc, 0, 4));
+    cudaCheck(cudaMemset(d_pc, 0, sizeof(int32_t)));
 
     nthreads = 1024;
     nblocks = (num_items + nthreads - 1) / nthreads;
-    multiBlockPrefixScan<<<nblocks, nthreads>>>(d_in, d_out1, num_items, d_pc);
+    std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nblocks << std::endl;
+    multiBlockPrefixScan<<<nblocks, nthreads, 4 * nblocks>>>(d_in, d_out1, num_items, d_pc);
+    cudaCheck(cudaGetLastError());
     verify<<<nblocks, nthreads>>>(d_out1, num_items);
+    cudaCheck(cudaGetLastError());
     cudaDeviceSynchronize();
 
-    // test cub
-    std::cout << "cub" << std::endl;
-    // Determine temporary device storage requirements for inclusive prefix sum
-    void *d_temp_storage = nullptr;
-    size_t temp_storage_bytes = 0;
-    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items);
-
-    std::cout << "temp storage " << temp_storage_bytes << std::endl;
-
-    // Allocate temporary storage for inclusive prefix sum
-    // fake larger ws already available
-    temp_storage_bytes *= 8;
-    cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-    std::cout << "temp storage " << temp_storage_bytes << std::endl;
-    // Run inclusive prefix sum
-    CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items));
-    std::cout << "temp storage " << temp_storage_bytes << std::endl;
-
-    verify<<<nblocks, nthreads>>>(d_out2, num_items);
-    cudaDeviceSynchronize();
   }  // ksize
   return 0;
 }
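Note: prefixScan_t.cu now drives `cms::cuda::multiBlockPrefixScan` directly instead of cross-checking against CUB's `DeviceScan`, consistent with the CUB dependency being dropped. For orientation, the single-block building block is a shared-memory inclusive scan; a textbook Hillis-Steele variant is sketched below for illustration (this is not the warp-shuffle implementation in `CUDACore/prefixScan.h`, and the kernel name is invented).

```cuda
// Textbook one-block inclusive scan, for illustration only. Assumes
// blockDim.x >= n, n small enough for shared memory; launch with
// inclusiveScanToy<<<1, n, n * sizeof(uint32_t)>>>(in, out, n).
#include <cstdint>

__global__ void inclusiveScanToy(uint32_t const* in, uint32_t* out, uint32_t n) {
  extern __shared__ uint32_t buf[];
  auto i = threadIdx.x;
  if (i < n)
    buf[i] = in[i];
  __syncthreads();
  for (uint32_t d = 1; d < n; d *= 2) {
    // read the partner value before anyone overwrites it, then add
    uint32_t v = (i >= d && i < n) ? buf[i - d] : 0;
    __syncthreads();
    if (i >= d && i < n)
      buf[i] += v;
    __syncthreads();
  }
  if (i < n)
    out[i] = buf[i];
}
```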
diff --git a/src/cuda/test/radixSort_t.cu b/src/cuda/test/radixSort_t.cu
index 72134a603..e1b9bca4c 100644
--- a/src/cuda/test/radixSort_t.cu
+++ b/src/cuda/test/radixSort_t.cu
@@ -10,9 +10,12 @@
 
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/launch.h"
 #include "CUDACore/radixSort.h"
 
+using namespace cms::cuda;
+
 template <typename T>
 struct RS {
   using type = std::uniform_int_distribution<T>;
@@ -161,6 +164,8 @@ void go(bool useShared) {
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   bool useShared = false;
 
   std::cout << "using Global memory" << std::endl;
diff --git a/src/cuda/test/testEigenGPU.cu b/src/cuda/test/testEigenGPU.cu
index 3ecd7d36b..9cbcc4c57 100644
--- a/src/cuda/test/testEigenGPU.cu
+++ b/src/cuda/test/testEigenGPU.cu
@@ -4,6 +4,7 @@
 #include
 
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 
 #ifdef USE_BL
 #include "plugin-PixelTriplets/BrokenLine.h"
@@ -328,6 +329,8 @@ void testFit() {
 }
 
 int main(int argc, char* argv[]) {
+  cms::cudatest::requireDevices();
+
   testFit<4>();
   testFit<3>();
   testFit<5>();
diff --git a/src/cuda/test/testEigenGPUNoFit.cu b/src/cuda/test/testEigenGPUNoFit.cu
index 734b0fb67..8e0b28f0f 100644
--- a/src/cuda/test/testEigenGPUNoFit.cu
+++ b/src/cuda/test/testEigenGPUNoFit.cu
@@ -4,6 +4,7 @@
 #include
 
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "test_common.h"
 
 using namespace Eigen;
@@ -214,6 +215,8 @@ void testEigenvalues() {
 }
 
 int main(int argc, char *argv[]) {
+  cms::cudatest::requireDevices();
+
   testEigenvalues();
   testInverse3x3();
   testInverse4x4();
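Note: the one change repeated across every test above is the `cms::cudatest::requireDevices()` call at the top of `main`, so the binaries skip gracefully on machines without a GPU instead of failing mid-run. A sketch of what such a guard can look like follows; this is an assumed shape for illustration, the actual helper lives in `CUDACore/requireDevices.h`.

```cuda
// Assumed sketch of a requireDevices()-style guard: exit cleanly (success, not
// failure) when no CUDA device is available, keeping CPU-only CI green.
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

void requireDevicesSketch() {
  int devices = 0;
  auto status = cudaGetDeviceCount(&devices);
  if (status != cudaSuccess || devices == 0) {
    std::cerr << "No CUDA devices available, skipping test" << std::endl;
    std::exit(EXIT_SUCCESS);
  }
}
```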
diff --git a/src/cuda/test/test_GPUSimpleVector.cu b/src/cuda/test/test_GPUSimpleVector.cu
index da8719147..92974d8a2 100644
--- a/src/cuda/test/test_GPUSimpleVector.cu
+++ b/src/cuda/test/test_GPUSimpleVector.cu
@@ -6,43 +6,46 @@
 #include
 #include
 
-#include "CUDACore/GPUSimpleVector.h"
+#include "CUDACore/SimpleVector.h"
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 
-__global__ void vector_pushback(GPU::SimpleVector<int> *foo) {
+__global__ void vector_pushback(cms::cuda::SimpleVector<int> *foo) {
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
   foo->push_back(index);
 }
 
-__global__ void vector_reset(GPU::SimpleVector<int> *foo) { foo->reset(); }
+__global__ void vector_reset(cms::cuda::SimpleVector<int> *foo) { foo->reset(); }
 
-__global__ void vector_emplace_back(GPU::SimpleVector<int> *foo) {
+__global__ void vector_emplace_back(cms::cuda::SimpleVector<int> *foo) {
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
   foo->emplace_back(index);
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   auto maxN = 10000;
-  GPU::SimpleVector<int> *obj_ptr = nullptr;
-  GPU::SimpleVector<int> *d_obj_ptr = nullptr;
-  GPU::SimpleVector<int> *tmp_obj_ptr = nullptr;
+  cms::cuda::SimpleVector<int> *obj_ptr = nullptr;
+  cms::cuda::SimpleVector<int> *d_obj_ptr = nullptr;
+  cms::cuda::SimpleVector<int> *tmp_obj_ptr = nullptr;
   int *data_ptr = nullptr;
   int *d_data_ptr = nullptr;
 
-  cudaCheck(cudaMallocHost(&obj_ptr, sizeof(GPU::SimpleVector<int>)));
+  cudaCheck(cudaMallocHost(&obj_ptr, sizeof(cms::cuda::SimpleVector<int>)));
   cudaCheck(cudaMallocHost(&data_ptr, maxN * sizeof(int)));
   cudaCheck(cudaMalloc(&d_data_ptr, maxN * sizeof(int)));
 
-  auto v = GPU::make_SimpleVector(obj_ptr, maxN, data_ptr);
+  auto v = cms::cuda::make_SimpleVector(obj_ptr, maxN, data_ptr);
 
-  cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(GPU::SimpleVector<int>)));
-  GPU::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr);
+  cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(cms::cuda::SimpleVector<int>)));
+  cms::cuda::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr);
   assert(tmp_obj_ptr->size() == 0);
   assert(tmp_obj_ptr->capacity() == static_cast<int>(maxN));
 
-  cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(GPU::SimpleVector<int>)));
+  cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(cms::cuda::SimpleVector<int>)));
   // ... and copy the object to the device.
-  cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(GPU::SimpleVector<int>), cudaMemcpyDefault));
+  cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(cms::cuda::SimpleVector<int>), cudaMemcpyDefault));
 
   int numBlocks = 5;
   int numThreadsPerBlock = 256;
@@ -50,14 +53,14 @@ int main() {
   cudaCheck(cudaGetLastError());
   cudaCheck(cudaDeviceSynchronize());
 
-  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector<int>), cudaMemcpyDefault));
+  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector<int>), cudaMemcpyDefault));
 
   assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN));
 
   vector_reset<<<numBlocks, numThreadsPerBlock>>>(d_obj_ptr);
   cudaCheck(cudaGetLastError());
   cudaCheck(cudaDeviceSynchronize());
 
-  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector<int>), cudaMemcpyDefault));
+  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector<int>), cudaMemcpyDefault));
 
   assert(obj_ptr->size() == 0);
 
@@ -65,7 +68,7 @@ int main() {
   vector_emplace_back<<<numBlocks, numThreadsPerBlock>>>(d_obj_ptr);
   cudaCheck(cudaGetLastError());
   cudaCheck(cudaDeviceSynchronize());
 
-  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector<int>), cudaMemcpyDefault));
+  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector<int>), cudaMemcpyDefault));
 
   assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN));