diff --git a/Makefile b/Makefile
index f33960113..c610ba632 100644
--- a/Makefile
+++ b/Makefile
@@ -82,11 +82,6 @@ export TBB_DEPS := $(TBB_LIB)
 export TBB_CXXFLAGS := -I$(TBB_BASE)/include
 export TBB_LDFLAGS := -L$(TBB_LIBDIR) -ltbb
 
-CUB_BASE := $(EXTERNAL_BASE)/cub
-export CUB_DEPS := $(CUB_BASE)
-export CUB_CXXFLAGS := -I$(CUB_BASE)
-export CUB_LDFLAGS :=
-
 EIGEN_BASE := $(EXTERNAL_BASE)/eigen
 export EIGEN_DEPS := $(EIGEN_BASE)
 export EIGEN_CXXFLAGS := -I$(EIGEN_BASE) -DEIGEN_DONT_PARALLELIZE
@@ -421,12 +416,6 @@ $(TBB_LIB): $(TBB_BASE) $(TBB_LIBDIR)
 	+$(MAKE) -C $(TBB_BASE) stdver=c++17
 	cp $$(find $(TBB_BASE)/build -name *.so*) $(TBB_LIBDIR)
 
-# CUB
-external_cub: $(CUB_BASE)
-
-$(CUB_BASE):
-	git clone --branch 1.8.0 https://github.com/NVlabs/cub.git $@
-
 # Eigen
 external_eigen: $(EIGEN_BASE)
 
diff --git a/src/cudatest/CUDACore/CachingDeviceAllocator.h b/src/cudatest/CUDACore/CachingDeviceAllocator.h
index 075d568f2..50c1ebdb2 100644
--- a/src/cudatest/CUDACore/CachingDeviceAllocator.h
+++ b/src/cudatest/CUDACore/CachingDeviceAllocator.h
@@ -41,9 +41,10 @@
 #include <cmath>
 #include <map>
 #include <set>
+#include <mutex>
 
-#include <cub/util_debug.cuh>
-#include <cub/host/mutex.cuh>
+#include "CUDACore/cudaCheck.h"
+#include "CUDACore/deviceAllocatorStatus.h"
 
 /// CUB namespace
 namespace notcub {
@@ -122,6 +123,7 @@ namespace notcub {
     struct BlockDescriptor {
       void *d_ptr;                     // Device pointer
       size_t bytes;                    // Size of allocation in bytes
+      size_t bytesRequested;           // CMS: requested allocatoin size (for monitoring only)
       unsigned int bin;                // Bin enumeration
       int device;                      // device ordinal
       cudaStream_t associated_stream;  // Associated associated_stream
@@ -129,12 +131,19 @@ namespace notcub {
 
       // Constructor (suitable for searching maps for a specific block, given its pointer and device)
       BlockDescriptor(void *d_ptr, int device)
-          : d_ptr(d_ptr), bytes(0), bin(INVALID_BIN), device(device), associated_stream(nullptr), ready_event(nullptr) {}
+          : d_ptr(d_ptr),
+            bytes(0),
+            bytesRequested(0),  // CMS
+            bin(INVALID_BIN),
+            device(device),
+            associated_stream(nullptr),
+            ready_event(nullptr) {}
 
       // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
       BlockDescriptor(int device)
           : d_ptr(nullptr),
             bytes(0),
+            bytesRequested(0),  // CMS
             bin(INVALID_BIN),
             device(device),
             associated_stream(nullptr),
@@ -160,12 +169,7 @@ namespace notcub {
     /// BlockDescriptor comparator function interface
     typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
 
-    class TotalBytes {
-    public:
-      size_t free;
-      size_t live;
-      TotalBytes() { free = live = 0; }
-    };
+    // CMS: Moved TotalBytes to deviceAllocatorStatus.h
 
     /// Set type for cached blocks (ordered by size)
     typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
@@ -174,7 +178,8 @@ namespace notcub {
     typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
 
     /// Map type of device ordinals to the number of cached bytes cached by each device
-    typedef std::map<int, TotalBytes> GpuCachedBytes;
+    // CMS: Moved definition to deviceAllocatorStatus.h
+    using GpuCachedBytes = cms::cuda::allocator::GpuCachedBytes;
 
     //---------------------------------------------------------------------
     // Utility functions
@@ -219,7 +224,8 @@ namespace notcub {
     // Fields
     //---------------------------------------------------------------------
 
-    cub::Mutex mutex;  /// Mutex for thread-safety
+    // CMS: use std::mutex instead of cub::Mutex, declare mutable
+    mutable std::mutex mutex;  /// Mutex for thread-safety
 
     unsigned int bin_growth;  /// Geometric growth factor for bin-sizes
     unsigned int min_bin;     /// Minimum bin enumeration
@@ -298,17 +304,19 @@ namespace notcub {
      */
     cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) {
       // Lock
-      mutex.Lock();
+      // CMS: use RAII instead of (un)locking explicitly
+      std::unique_lock mutex_locker(mutex);
 
       if (debug)
-        _CubLog("Changing max_cached_bytes (%lld -> %lld)\n",
-                (long long)this->max_cached_bytes,
-                (long long)max_cached_bytes);
+        // CMS: use raw printf
+        printf("Changing max_cached_bytes (%lld -> %lld)\n",
+               (long long)this->max_cached_bytes,
+               (long long)max_cached_bytes);
 
       this->max_cached_bytes = max_cached_bytes;
 
-      // Unlock
-      mutex.Unlock();
+      // Unlock (redundant, kept for style uniformity)
+      mutex_locker.unlock();
 
       return cudaSuccess;
     }
@@ -326,19 +334,22 @@ namespace notcub {
         size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
         cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
     {
+      // CMS: use RAII instead of (un)locking explicitly
+      std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
       *d_ptr = nullptr;
       int entrypoint_device = INVALID_DEVICE_ORDINAL;
       cudaError_t error = cudaSuccess;
 
       if (device == INVALID_DEVICE_ORDINAL) {
-        if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-          return error;
+        // CMS: throw exception on error
+        cudaCheck(error = cudaGetDevice(&entrypoint_device));
         device = entrypoint_device;
       }
 
       // Create a block descriptor for the requested allocation
       bool found = false;
       BlockDescriptor search_key(device);
+      search_key.bytesRequested = bytes;  // CMS
       search_key.associated_stream = active_stream;
       NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
 
@@ -350,7 +361,7 @@ namespace notcub {
         search_key.bytes = bytes;
       } else {
         // Search for a suitable cached allocation: lock
-        mutex.Lock();
+        mutex_locker.lock();
 
         if (search_key.bin < min_bin) {
           // Bin is less than minimum bin: round up
@@ -376,10 +387,12 @@ namespace notcub {
             // Remove from free blocks
             cached_bytes[device].free -= search_key.bytes;
             cached_bytes[device].live += search_key.bytes;
+            cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS
 
             if (debug)
               // CMS: improved debug message
-              _CubLog(
+              // CMS: use raw printf
+              printf(
                   "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
                   "associated with stream %lld, event %lld).\n",
                   device,
@@ -398,24 +411,25 @@ namespace notcub {
         }
 
         // Done searching: unlock
-        mutex.Unlock();
+        mutex_locker.unlock();
       }
 
       // Allocate the block if necessary
       if (!found) {
         // Set runtime's current device to specified device (entrypoint may not be set)
         if (device != entrypoint_device) {
-          if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-            return error;
-          if (CubDebug(error = cudaSetDevice(device)))
-            return error;
+          // CMS: throw exception on error
+          cudaCheck(error = cudaGetDevice(&entrypoint_device));
+          cudaCheck(error = cudaSetDevice(device));
         }
 
         // Attempt to allocate
-        if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
+        // CMS: silently ignore errors and retry or pass them to the caller
+        if ((error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
           // The allocation attempt failed: free all cached blocks on device and retry
           if (debug)
-            _CubLog(
+            // CMS: use raw printf
+            printf(
                 "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
                 device,
                 (long long)search_key.bytes,
@@ -425,7 +439,7 @@ namespace notcub {
           cudaGetLastError();   // Reset CUDART's error
 
           // Lock
-          mutex.Lock();
+          mutex_locker.lock();
 
           // Iterate the range of free blocks on the same device
           BlockDescriptor free_key(device);
@@ -437,16 +451,18 @@ namespace notcub {
             // on the current device
 
             // Free device memory and destroy stream event.
-            if (CubDebug(error = cudaFree(block_itr->d_ptr)))
+            // CMS: silently ignore errors and pass them to the caller
+            if ((error = cudaFree(block_itr->d_ptr)))
               break;
-            if (CubDebug(error = cudaEventDestroy(block_itr->ready_event)))
+            if ((error = cudaEventDestroy(block_itr->ready_event)))
               break;
 
             // Reduce balance and erase entry
             cached_bytes[device].free -= block_itr->bytes;
 
             if (debug)
-              _CubLog(
+              // CMS: use raw printf
+              printf(
                   "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks "
                   "(%lld bytes) outstanding.\n",
                   device,
@@ -462,41 +478,42 @@ namespace notcub {
           }
 
           // Unlock
-          mutex.Unlock();
+          mutex_locker.unlock();
 
           // Return under error
           if (error)
             return error;
 
           // Try to allocate again
-          if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)))
-            return error;
+          // CMS: throw exception on error
+          cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes));
         }
 
         // Create ready event
-        if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-          return error;
+        // CMS: throw exception on error
+        cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
 
         // Insert into live blocks
-        mutex.Lock();
+        mutex_locker.lock();
         live_blocks.insert(search_key);
         cached_bytes[device].live += search_key.bytes;
-        mutex.Unlock();
+        cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS
+        mutex_locker.unlock();
 
         if (debug)
           // CMS: improved debug message
-          _CubLog(
-              "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
-              device,
-              search_key.d_ptr,
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event);
+          // CMS: use raw printf
+          printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
+                 device,
+                 search_key.d_ptr,
+                 (long long)search_key.bytes,
+                 (long long)search_key.associated_stream,
+                 (long long)search_key.ready_event);
 
         // Attempt to revert back to previous device if necessary
         if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
-          if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-            return error;
+          // CMS: throw exception on error
+          cudaCheck(error = cudaSetDevice(entrypoint_device));
         }
       }
 
@@ -504,11 +521,12 @@ namespace notcub {
       *d_ptr = search_key.d_ptr;
 
       if (debug)
-        _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
-                (long long)cached_blocks.size(),
-                (long long)cached_bytes[device].free,
-                (long long)live_blocks.size(),
-                (long long)cached_bytes[device].live);
+        // CMS: use raw printf
+        printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
+               (long long)cached_blocks.size(),
+               (long long)cached_bytes[device].free,
+               (long long)live_blocks.size(),
+               (long long)cached_bytes[device].live);
 
       return error;
     }
@@ -538,15 +556,17 @@ namespace notcub {
     cudaError_t DeviceFree(int device, void *d_ptr) {
       int entrypoint_device = INVALID_DEVICE_ORDINAL;
       cudaError_t error = cudaSuccess;
+      // CMS: use RAII instead of (un)locking explicitly
+      std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
 
       if (device == INVALID_DEVICE_ORDINAL) {
-        if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-          return error;
+        // CMS: throw exception on error
+        cudaCheck(error = cudaGetDevice(&entrypoint_device));
         device = entrypoint_device;
       }
 
       // Lock
-      mutex.Lock();
+      mutex_locker.lock();
 
       // Find corresponding block descriptor
       bool recached = false;
@@ -557,6 +577,7 @@ namespace notcub {
         search_key = *block_itr;
         live_blocks.erase(block_itr);
         cached_bytes[device].live -= search_key.bytes;
+        cached_bytes[device].liveRequested -= search_key.bytesRequested;  // CMS
 
         // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
         if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) {
@@ -567,7 +588,8 @@ namespace notcub {
 
           if (debug)
             // CMS: improved debug message
-            _CubLog(
+            // CMS: use raw printf
+            printf(
                 "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
                 "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
                 device,
@@ -584,31 +606,29 @@ namespace notcub {
 
       // First set to specified device (entrypoint may not be set)
       if (device != entrypoint_device) {
-        if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-          return error;
-        if (CubDebug(error = cudaSetDevice(device)))
-          return error;
+        // CMS: throw exception on error
+        cudaCheck(error = cudaGetDevice(&entrypoint_device));
+        cudaCheck(error = cudaSetDevice(device));
       }
 
       if (recached) {
         // Insert the ready event in the associated stream (must have current device set properly)
-        if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
-          return error;
+        // CMS: throw exception on error
+        cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
       }
 
       // Unlock
-      mutex.Unlock();
+      mutex_locker.unlock();
 
       if (!recached) {
         // Free the allocation from the runtime and cleanup the event.
-        if (CubDebug(error = cudaFree(d_ptr)))
-          return error;
-        if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
-          return error;
+        // CMS: throw exception on error
+        cudaCheck(error = cudaFree(d_ptr));
+        cudaCheck(error = cudaEventDestroy(search_key.ready_event));
 
         if (debug)
           // CMS: improved debug message
-          _CubLog(
+          printf(
               "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t  %lld available "
               "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
               device,
@@ -624,8 +644,8 @@ namespace notcub {
 
       // Reset device
       if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
+        // CMS: throw exception on error
+        cudaCheck(error = cudaSetDevice(entrypoint_device));
       }
 
       return error;
@@ -647,8 +667,8 @@ namespace notcub {
       cudaError_t error = cudaSuccess;
       int entrypoint_device = INVALID_DEVICE_ORDINAL;
       int current_device = INVALID_DEVICE_ORDINAL;
-
-      mutex.Lock();
+      // CMS: use RAII instead of (un)locking explicitly
+      std::unique_lock<std::mutex> mutex_locker(mutex);
 
       while (!cached_blocks.empty()) {
         // Get first block
@@ -656,28 +676,31 @@ namespace notcub {
 
         // Get entry-point device ordinal if necessary
         if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
-          if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
+          // CMS: silently ignore errors and pass them to the caller
+          if ((error = cudaGetDevice(&entrypoint_device)))
             break;
         }
 
         // Set current device ordinal if necessary
         if (begin->device != current_device) {
-          if (CubDebug(error = cudaSetDevice(begin->device)))
+          // CMS: silently ignore errors and pass them to the caller
+          if ((error = cudaSetDevice(begin->device)))
             break;
           current_device = begin->device;
         }
 
         // Free device memory
-        if (CubDebug(error = cudaFree(begin->d_ptr)))
+        // CMS: silently ignore errors and pass them to the caller
+        if ((error = cudaFree(begin->d_ptr)))
           break;
-        if (CubDebug(error = cudaEventDestroy(begin->ready_event)))
+        if ((error = cudaEventDestroy(begin->ready_event)))
           break;
 
         // Reduce balance and erase entry
         cached_bytes[current_device].free -= begin->bytes;
 
         if (debug)
-          _CubLog(
+          printf(
               "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
               "bytes) outstanding.\n",
               current_device,
@@ -690,21 +713,28 @@ namespace notcub {
         cached_blocks.erase(begin);
       }
 
-      mutex.Unlock();
+      mutex_locker.unlock();
 
       // Attempt to revert back to entry-point device if necessary
       if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
+        // CMS: throw exception on error
+        cudaCheck(error = cudaSetDevice(entrypoint_device));
       }
 
       return error;
     }
 
+    // CMS: give access to cache allocation status
+    GpuCachedBytes CacheStatus() const {
+      std::unique_lock mutex_locker(mutex);
+      return cached_bytes;
+    }
+
     /**
      * \brief Destructor
      */
-    virtual ~CachingDeviceAllocator() {
+    // CMS: make the destructor not virtual
+    ~CachingDeviceAllocator() {
       if (!skip_cleanup)
         FreeAllCached();
     }
diff --git a/src/cudatest/CUDACore/CachingHostAllocator.h b/src/cudatest/CUDACore/CachingHostAllocator.h
index 53901e1f1..a206b2da1 100644
--- a/src/cudatest/CUDACore/CachingHostAllocator.h
+++ b/src/cudatest/CUDACore/CachingHostAllocator.h
@@ -41,9 +41,9 @@
 #include <cmath>
 #include <map>
 #include <set>
+#include <mutex>
 
-#include <cub/util_debug.cuh>
-#include <cub/host/mutex.cuh>
+#include "CUDACore/cudaCheck.h"
 
 /// CUB namespace
 namespace notcub {
@@ -212,7 +212,7 @@ namespace notcub {
     // Fields
     //---------------------------------------------------------------------
 
-    cub::Mutex mutex;  /// Mutex for thread-safety
+    std::mutex mutex;  /// Mutex for thread-safety
 
     unsigned int bin_growth;  /// Geometric growth factor for bin-sizes
     unsigned int min_bin;     /// Minimum bin enumeration
@@ -291,17 +291,17 @@ namespace notcub {
      */
     void SetMaxCachedBytes(size_t max_cached_bytes) {
       // Lock
-      mutex.Lock();
+      std::unique_lock mutex_locker(mutex);
 
       if (debug)
-        _CubLog("Changing max_cached_bytes (%lld -> %lld)\n",
-                (long long)this->max_cached_bytes,
-                (long long)max_cached_bytes);
+        printf("Changing max_cached_bytes (%lld -> %lld)\n",
+               (long long)this->max_cached_bytes,
+               (long long)max_cached_bytes);
 
       this->max_cached_bytes = max_cached_bytes;
 
-      // Unlock
-      mutex.Unlock();
+      // Unlock (redundant, kept for style uniformity)
+      mutex_locker.unlock();
     }
 
     /**
@@ -314,12 +314,12 @@ namespace notcub {
         size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
         cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
     {
+      std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
       *d_ptr = nullptr;
       int device = INVALID_DEVICE_ORDINAL;
       cudaError_t error = cudaSuccess;
 
-      if (CubDebug(error = cudaGetDevice(&device)))
-        return error;
+      cudaCheck(error = cudaGetDevice(&device));
 
       // Create a block descriptor for the requested allocation
       bool found = false;
@@ -336,7 +336,7 @@ namespace notcub {
         search_key.bytes = bytes;
       } else {
         // Search for a suitable cached allocation: lock
-        mutex.Lock();
+        mutex_locker.lock();
 
         if (search_key.bin < min_bin) {
           // Bin is less than minimum bin: round up
@@ -356,14 +356,10 @@ namespace notcub {
             search_key.associated_stream = active_stream;
             if (search_key.device != device) {
               // If "associated" device changes, need to re-create the event on the right device
-              if (CubDebug(error = cudaSetDevice(search_key.device)))
-                return error;
-              if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
-                return error;
-              if (CubDebug(error = cudaSetDevice(device)))
-                return error;
-              if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-                return error;
+              cudaCheck(error = cudaSetDevice(search_key.device));
+              cudaCheck(error = cudaEventDestroy(search_key.ready_event));
+              cudaCheck(error = cudaSetDevice(device));
+              cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
               search_key.device = device;
             }
 
@@ -374,7 +370,7 @@ namespace notcub {
             cached_bytes.live += search_key.bytes;
 
             if (debug)
-              _CubLog(
+              printf(
                   "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
                   "(previously associated with stream %lld, event %lld).\n",
                   search_key.d_ptr,
@@ -393,18 +389,18 @@ namespace notcub {
         }
 
         // Done searching: unlock
-        mutex.Unlock();
+        mutex_locker.unlock();
       }
 
       // Allocate the block if necessary
       if (!found) {
         // Attempt to allocate
         // TODO: eventually support allocation flags
-        if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
+        if ((error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
             cudaErrorMemoryAllocation) {
           // The allocation attempt failed: free all cached blocks on device and retry
           if (debug)
-            _CubLog(
+            printf(
                 "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
                 "allocations",
                 (long long)search_key.bytes,
@@ -415,7 +411,7 @@ namespace notcub {
           cudaGetLastError();   // Reset CUDART's error
 
           // Lock
-          mutex.Lock();
+          mutex_locker.lock();
 
           // Iterate the range of free blocks
           CachedBlocks::iterator block_itr = cached_blocks.begin();
@@ -426,16 +422,16 @@ namespace notcub {
             // on the current device
 
             // Free pinned host memory.
-            if (CubDebug(error = cudaFreeHost(block_itr->d_ptr)))
+            if ((error = cudaFreeHost(block_itr->d_ptr)))
               break;
-            if (CubDebug(error = cudaEventDestroy(block_itr->ready_event)))
+            if ((error = cudaEventDestroy(block_itr->ready_event)))
               break;
 
             // Reduce balance and erase entry
             cached_bytes.free -= block_itr->bytes;
 
             if (debug)
-              _CubLog(
+              printf(
                   "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
                   "bytes) outstanding.\n",
                   (long long)block_itr->bytes,
@@ -450,29 +446,27 @@ namespace notcub {
           }
 
           // Unlock
-          mutex.Unlock();
+          mutex_locker.unlock();
 
           // Return under error
           if (error)
             return error;
 
           // Try to allocate again
-          if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)))
-            return error;
+          cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault));
         }
 
         // Create ready event
-        if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-          return error;
+        cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
 
         // Insert into live blocks
-        mutex.Lock();
+        mutex_locker.lock();
         live_blocks.insert(search_key);
         cached_bytes.live += search_key.bytes;
-        mutex.Unlock();
+        mutex_locker.unlock();
 
         if (debug)
-          _CubLog(
+          printf(
               "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
               "%lld).\n",
               search_key.d_ptr,
@@ -486,11 +480,11 @@ namespace notcub {
       *d_ptr = search_key.d_ptr;
 
       if (debug)
-        _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
-                (long long)cached_blocks.size(),
-                (long long)cached_bytes.free,
-                (long long)live_blocks.size(),
-                (long long)cached_bytes.live);
+        printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
+               (long long)cached_blocks.size(),
+               (long long)cached_bytes.free,
+               (long long)live_blocks.size(),
+               (long long)cached_bytes.live);
 
       return error;
     }
@@ -505,7 +499,7 @@ namespace notcub {
       cudaError_t error = cudaSuccess;
 
       // Lock
-      mutex.Lock();
+      std::unique_lock<std::mutex> mutex_locker(mutex);
 
       // Find corresponding block descriptor
       bool recached = false;
@@ -525,7 +519,7 @@ namespace notcub {
           cached_bytes.free += search_key.bytes;
 
           if (debug)
-            _CubLog(
+            printf(
                 "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
                 "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
                 (long long)search_key.bytes,
@@ -539,31 +533,26 @@ namespace notcub {
         }
       }
 
-      if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-        return error;
+      cudaCheck(error = cudaGetDevice(&entrypoint_device));
       if (entrypoint_device != search_key.device) {
-        if (CubDebug(error = cudaSetDevice(search_key.device)))
-          return error;
+        cudaCheck(error = cudaSetDevice(search_key.device));
       }
 
       if (recached) {
         // Insert the ready event in the associated stream (must have current device set properly)
-        if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
-          return error;
+        cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
       }
 
       // Unlock
-      mutex.Unlock();
+      mutex_locker.unlock();
 
       if (!recached) {
         // Free the allocation from the runtime and cleanup the event.
-        if (CubDebug(error = cudaFreeHost(d_ptr)))
-          return error;
-        if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
-          return error;
+        cudaCheck(error = cudaFreeHost(d_ptr));
+        cudaCheck(error = cudaEventDestroy(search_key.ready_event));
 
         if (debug)
-          _CubLog(
+          printf(
               "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t  %lld available "
               "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
               (long long)search_key.bytes,
@@ -578,8 +567,7 @@ namespace notcub {
 
       // Reset device
       if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
+        cudaCheck(error = cudaSetDevice(entrypoint_device));
       }
 
       return error;
@@ -593,7 +581,7 @@ namespace notcub {
       int entrypoint_device = INVALID_DEVICE_ORDINAL;
       int current_device = INVALID_DEVICE_ORDINAL;
 
-      mutex.Lock();
+      std::unique_lock<std::mutex> mutex_locker(mutex);
 
       while (!cached_blocks.empty()) {
         // Get first block
@@ -601,28 +589,28 @@ namespace notcub {
 
         // Get entry-point device ordinal if necessary
         if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
-          if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
+          if ((error = cudaGetDevice(&entrypoint_device)))
             break;
         }
 
         // Set current device ordinal if necessary
         if (begin->device != current_device) {
-          if (CubDebug(error = cudaSetDevice(begin->device)))
+          if ((error = cudaSetDevice(begin->device)))
             break;
           current_device = begin->device;
         }
 
         // Free host memory
-        if (CubDebug(error = cudaFreeHost(begin->d_ptr)))
+        if ((error = cudaFreeHost(begin->d_ptr)))
           break;
-        if (CubDebug(error = cudaEventDestroy(begin->ready_event)))
+        if ((error = cudaEventDestroy(begin->ready_event)))
           break;
 
         // Reduce balance and erase entry
         cached_bytes.free -= begin->bytes;
 
         if (debug)
-          _CubLog(
+          printf(
               "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
               "bytes) outstanding.\n",
               (long long)begin->bytes,
@@ -634,12 +622,11 @@ namespace notcub {
         cached_blocks.erase(begin);
       }
 
-      mutex.Unlock();
+      mutex_locker.unlock();
 
       // Attempt to revert back to entry-point device if necessary
       if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
+        cudaCheck(error = cudaSetDevice(entrypoint_device));
       }
 
       return error;
diff --git a/src/cudatest/CUDACore/ScopedContext.cc b/src/cudatest/CUDACore/ScopedContext.cc
index b2dc58f32..14bff04eb 100644
--- a/src/cudatest/CUDACore/ScopedContext.cc
+++ b/src/cudatest/CUDACore/ScopedContext.cc
@@ -91,7 +91,7 @@ namespace cms::cuda {
   ScopedContextAcquire::~ScopedContextAcquire() {
     holderHelper_.enqueueCallback(device(), stream());
     if (contextState_) {
-      contextState_->set(device(), std::move(streamPtr()));
+      contextState_->set(device(), streamPtr());
     }
   }
 
diff --git a/src/cudatest/CUDACore/deviceAllocatorStatus.cc b/src/cudatest/CUDACore/deviceAllocatorStatus.cc
new file mode 100644
index 000000000..5d4a0ca09
--- /dev/null
+++ b/src/cudatest/CUDACore/deviceAllocatorStatus.cc
@@ -0,0 +1,7 @@
+#include "CUDACore/deviceAllocatorStatus.h"
+
+#include "getCachingDeviceAllocator.h"
+
+namespace cms::cuda {
+  allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); }
+}  // namespace cms::cuda
diff --git a/src/cudatest/CUDACore/deviceAllocatorStatus.h b/src/cudatest/CUDACore/deviceAllocatorStatus.h
new file mode 100644
index 000000000..92f9f87e8
--- /dev/null
+++ b/src/cudatest/CUDACore/deviceAllocatorStatus.h
@@ -0,0 +1,23 @@
+#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
+#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
+
+#include <map>
+
+namespace cms {
+  namespace cuda {
+    namespace allocator {
+      struct TotalBytes {
+        size_t free;
+        size_t live;
+        size_t liveRequested;  // CMS: monitor also requested amount
+        TotalBytes() { free = live = liveRequested = 0; }
+      };
+      /// Map type of device ordinals to the number of cached bytes cached by each device
+      using GpuCachedBytes = std::map<int, TotalBytes>;
+    }  // namespace allocator
+
+    allocator::GpuCachedBytes deviceAllocatorStatus();
+  }  // namespace cuda
+}  // namespace cms
+
+#endif
diff --git a/src/cudatest/CUDACore/getCachingDeviceAllocator.h b/src/cudatest/CUDACore/getCachingDeviceAllocator.h
index f959a50f5..b467d4b90 100644
--- a/src/cudatest/CUDACore/getCachingDeviceAllocator.h
+++ b/src/cudatest/CUDACore/getCachingDeviceAllocator.h
@@ -1,6 +1,9 @@
 #ifndef HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator
 #define HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator
 
+#include <iomanip>
+#include <iostream>
+
 #include "CUDACore/cudaCheck.h"
 #include "CUDACore/deviceCount.h"
 #include "CachingDeviceAllocator.h"
@@ -9,11 +12,11 @@ namespace cms::cuda::allocator {
   // Use caching or not
   constexpr bool useCaching = true;
   // Growth factor (bin_growth in cub::CachingDeviceAllocator
-  constexpr unsigned int binGrowth = 8;
+  constexpr unsigned int binGrowth = 2;
   // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
-  constexpr unsigned int minBin = 1;
+  constexpr unsigned int minBin = 8;
   // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
-  constexpr unsigned int maxBin = 10;
+  constexpr unsigned int maxBin = 30;
   // Total storage for the allocator. 0 means no limit.
   constexpr size_t maxCachedBytes = 0;
   // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
@@ -39,6 +42,27 @@ namespace cms::cuda::allocator {
   }
 
   inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
+    if (debug) {
+      std::cout << "cub::CachingDeviceAllocator settings\n"
+                << "  bin growth " << binGrowth << "\n"
+                << "  min bin    " << minBin << "\n"
+                << "  max bin    " << maxBin << "\n"
+                << "  resulting bins:\n";
+      for (auto bin = minBin; bin <= maxBin; ++bin) {
+        auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
+        if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
+          std::cout << "    " << std::setw(8) << (binSize >> 30) << " GB\n";
+        } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
+          std::cout << "    " << std::setw(8) << (binSize >> 20) << " MB\n";
+        } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) {
+          std::cout << "    " << std::setw(8) << (binSize >> 10) << " kB\n";
+        } else {
+          std::cout << "    " << std::setw(9) << binSize << " B\n";
+        }
+      }
+      std::cout << "  maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n";
+    }
+
     // the public interface is thread safe
     static notcub::CachingDeviceAllocator allocator{binGrowth,
                                                     minBin,
diff --git a/src/cudatest/CUDACore/getCachingHostAllocator.h b/src/cudatest/CUDACore/getCachingHostAllocator.h
index 601374685..d29080795 100644
--- a/src/cudatest/CUDACore/getCachingHostAllocator.h
+++ b/src/cudatest/CUDACore/getCachingHostAllocator.h
@@ -1,6 +1,9 @@
 #ifndef HeterogeneousCore_CUDACore_src_getCachingHostAllocator
 #define HeterogeneousCore_CUDACore_src_getCachingHostAllocator
 
+#include <iomanip>
+#include <iostream>
+
 #include "CUDACore/cudaCheck.h"
 #include "CachingHostAllocator.h"
 
@@ -8,6 +11,27 @@
 
 namespace cms::cuda::allocator {
   inline notcub::CachingHostAllocator& getCachingHostAllocator() {
+    if (debug) {
+      std::cout << "cub::CachingHostAllocator settings\n"
+                << "  bin growth " << binGrowth << "\n"
+                << "  min bin    " << minBin << "\n"
+                << "  max bin    " << maxBin << "\n"
+                << "  resulting bins:\n";
+      for (auto bin = minBin; bin <= maxBin; ++bin) {
+        auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
+        if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
+          std::cout << "    " << std::setw(8) << (binSize >> 30) << " GB\n";
+        } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
+          std::cout << "    " << std::setw(8) << (binSize >> 20) << " MB\n";
+        } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) {
+          std::cout << "    " << std::setw(8) << (binSize >> 10) << " kB\n";
+        } else {
+          std::cout << "    " << std::setw(9) << binSize << " B\n";
+        }
+      }
+      std::cout << "  maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n";
+    }
+
     // the public interface is thread safe
     static notcub::CachingHostAllocator allocator{binGrowth,
                                                   minBin,
diff --git a/src/cudatest/Makefile b/src/cudatest/Makefile
index 6b46bf24c..6611e3367 100644
--- a/src/cudatest/Makefile
+++ b/src/cudatest/Makefile
@@ -123,7 +123,7 @@ $(OBJ_DIR)/$(TARGET_NAME)/bin/%.cc.o: $(SRC_DIR)/$(TARGET_NAME)/bin/%.cc
 # Tests
 $(OBJ_DIR)/$(TARGET_NAME)/test/%.cc.o: $(SRC_DIR)/$(TARGET_NAME)/test/%.cc
 	@[ -d $(@D) ] || mkdir -p $(@D)
-	$(CXX) $(CXXFLAGS) $(TEST_CXXFLAGS) $(MY_CXXFLAGS) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD
+	$(CXX) $(CXXFLAGS) $(CUDA_TEST_CXXFLAGS) $(MY_CXXFLAGS) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD
 	@cp $(@D)/$*.cc.d $(@D)/$*.cc.d.tmp; \
 	  sed 's#\($(TARGET_NAME)/$*\)\.o[ :]*#\1.o \1.d : #g' < $(@D)/$*.cc.d.tmp > $(@D)/$*.cc.d; \
 	  sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
diff --git a/src/cudatest/Makefile.deps b/src/cudatest/Makefile.deps
index ebf9e9b49..331540db3 100644
--- a/src/cudatest/Makefile.deps
+++ b/src/cudatest/Makefile.deps
@@ -1,4 +1,4 @@
-cudatest_EXTERNAL_DEPENDS := TBB CUDA CUB
+cudatest_EXTERNAL_DEPENDS := TBB CUDA
 CUDACore_DEPENDS := Framework
 Test1_DEPENDS := Framework CUDACore DataFormats
 Test2_DEPENDS := Framework CUDACore