diff --git a/Makefile b/Makefile index f33960113..c610ba632 100644 --- a/Makefile +++ b/Makefile @@ -82,11 +82,6 @@ export TBB_DEPS := $(TBB_LIB) export TBB_CXXFLAGS := -I$(TBB_BASE)/include export TBB_LDFLAGS := -L$(TBB_LIBDIR) -ltbb -CUB_BASE := $(EXTERNAL_BASE)/cub -export CUB_DEPS := $(CUB_BASE) -export CUB_CXXFLAGS := -I$(CUB_BASE) -export CUB_LDFLAGS := - EIGEN_BASE := $(EXTERNAL_BASE)/eigen export EIGEN_DEPS := $(EIGEN_BASE) export EIGEN_CXXFLAGS := -I$(EIGEN_BASE) -DEIGEN_DONT_PARALLELIZE @@ -421,12 +416,6 @@ $(TBB_LIB): $(TBB_BASE) $(TBB_LIBDIR) +$(MAKE) -C $(TBB_BASE) stdver=c++17 cp $$(find $(TBB_BASE)/build -name *.so*) $(TBB_LIBDIR) -# CUB -external_cub: $(CUB_BASE) - -$(CUB_BASE): - git clone --branch 1.8.0 https://github.com/NVlabs/cub.git $@ - # Eigen external_eigen: $(EIGEN_BASE) diff --git a/src/cudatest/CUDACore/CachingDeviceAllocator.h b/src/cudatest/CUDACore/CachingDeviceAllocator.h index 075d568f2..50c1ebdb2 100644 --- a/src/cudatest/CUDACore/CachingDeviceAllocator.h +++ b/src/cudatest/CUDACore/CachingDeviceAllocator.h @@ -41,9 +41,10 @@ #include #include #include +#include -#include -#include +#include "CUDACore/cudaCheck.h" +#include "CUDACore/deviceAllocatorStatus.h" /// CUB namespace namespace notcub { @@ -122,6 +123,7 @@ namespace notcub { struct BlockDescriptor { void *d_ptr; // Device pointer size_t bytes; // Size of allocation in bytes + size_t bytesRequested; // CMS: requested allocatoin size (for monitoring only) unsigned int bin; // Bin enumeration int device; // device ordinal cudaStream_t associated_stream; // Associated associated_stream @@ -129,12 +131,19 @@ namespace notcub { // Constructor (suitable for searching maps for a specific block, given its pointer and device) BlockDescriptor(void *d_ptr, int device) - : d_ptr(d_ptr), bytes(0), bin(INVALID_BIN), device(device), associated_stream(nullptr), ready_event(nullptr) {} + : d_ptr(d_ptr), + bytes(0), + bytesRequested(0), // CMS + bin(INVALID_BIN), + device(device), + associated_stream(nullptr), + ready_event(nullptr) {} // Constructor (suitable for searching maps for a range of suitable blocks, given a device) BlockDescriptor(int device) : d_ptr(nullptr), bytes(0), + bytesRequested(0), // CMS bin(INVALID_BIN), device(device), associated_stream(nullptr), @@ -160,12 +169,7 @@ namespace notcub { /// BlockDescriptor comparator function interface typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); - class TotalBytes { - public: - size_t free; - size_t live; - TotalBytes() { free = live = 0; } - }; + // CMS: Moved TotalBytes to deviceAllocatorStatus.h /// Set type for cached blocks (ordered by size) typedef std::multiset CachedBlocks; @@ -174,7 +178,8 @@ namespace notcub { typedef std::multiset BusyBlocks; /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; + // CMS: Moved definition to deviceAllocatorStatus.h + using GpuCachedBytes = cms::cuda::allocator::GpuCachedBytes; //--------------------------------------------------------------------- // Utility functions @@ -219,7 +224,8 @@ namespace notcub { // Fields //--------------------------------------------------------------------- - cub::Mutex mutex; /// Mutex for thread-safety + // CMS: use std::mutex instead of cub::Mutex, declare mutable + mutable std::mutex mutex; /// Mutex for thread-safety unsigned int bin_growth; /// Geometric growth factor for bin-sizes unsigned int min_bin; /// Minimum bin enumeration @@ -298,17 +304,19 @@ namespace notcub { */ cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) { // Lock - mutex.Lock(); + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex); if (debug) - _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", - (long long)this->max_cached_bytes, - (long long)max_cached_bytes); + // CMS: use raw printf + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); this->max_cached_bytes = max_cached_bytes; - // Unlock - mutex.Unlock(); + // Unlock (redundant, kept for style uniformity) + mutex_locker.unlock(); return cudaSuccess; } @@ -326,19 +334,22 @@ namespace notcub { size_t bytes, ///< [in] Minimum number of bytes for the allocation cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation { + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex, std::defer_lock); *d_ptr = nullptr; int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); device = entrypoint_device; } // Create a block descriptor for the requested allocation bool found = false; BlockDescriptor search_key(device); + search_key.bytesRequested = bytes; // CMS search_key.associated_stream = active_stream; NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); @@ -350,7 +361,7 @@ namespace notcub { search_key.bytes = bytes; } else { // Search for a suitable cached allocation: lock - mutex.Lock(); + mutex_locker.lock(); if (search_key.bin < min_bin) { // Bin is less than minimum bin: round up @@ -376,10 +387,12 @@ namespace notcub { // Remove from free blocks cached_bytes[device].free -= search_key.bytes; cached_bytes[device].live += search_key.bytes; + cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS if (debug) // CMS: improved debug message - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously " "associated with stream %lld, event %lld).\n", device, @@ -398,24 +411,25 @@ namespace notcub { } // Done searching: unlock - mutex.Unlock(); + mutex_locker.unlock(); } // Allocate the block if necessary if (!found) { // Set runtime's current device to specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + cudaCheck(error = cudaSetDevice(device)); } // Attempt to allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { + // CMS: silently ignore errors and retry or pass them to the caller + if ((error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry if (debug) - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", device, (long long)search_key.bytes, @@ -425,7 +439,7 @@ namespace notcub { cudaGetLastError(); // Reset CUDART's error // Lock - mutex.Lock(); + mutex_locker.lock(); // Iterate the range of free blocks on the same device BlockDescriptor free_key(device); @@ -437,16 +451,18 @@ namespace notcub { // on the current device // Free device memory and destroy stream event. - if (CubDebug(error = cudaFree(block_itr->d_ptr))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaFree(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) + if ((error = cudaEventDestroy(block_itr->ready_event))) break; // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; if (debug) - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " "(%lld bytes) outstanding.\n", device, @@ -462,41 +478,42 @@ namespace notcub { } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); // Return under error if (error) return error; // Try to allocate again - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)); } // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); // Insert into live blocks - mutex.Lock(); + mutex_locker.lock(); live_blocks.insert(search_key); cached_bytes[device].live += search_key.bytes; - mutex.Unlock(); + cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS + mutex_locker.unlock(); if (debug) // CMS: improved debug message - _CubLog( - "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n", - device, - search_key.d_ptr, - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event); + // CMS: use raw printf + printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n", + device, + search_key.d_ptr, + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.ready_event); // Attempt to revert back to previous device if necessary if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); } } @@ -504,11 +521,12 @@ namespace notcub { *d_ptr = search_key.d_ptr; if (debug) - _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long)cached_blocks.size(), - (long long)cached_bytes[device].free, - (long long)live_blocks.size(), - (long long)cached_bytes[device].live); + // CMS: use raw printf + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); return error; } @@ -538,15 +556,17 @@ namespace notcub { cudaError_t DeviceFree(int device, void *d_ptr) { int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex, std::defer_lock); if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); device = entrypoint_device; } // Lock - mutex.Lock(); + mutex_locker.lock(); // Find corresponding block descriptor bool recached = false; @@ -557,6 +577,7 @@ namespace notcub { search_key = *block_itr; live_blocks.erase(block_itr); cached_bytes[device].live -= search_key.bytes; + cached_bytes[device].liveRequested -= search_key.bytesRequested; // CMS // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { @@ -567,7 +588,8 @@ namespace notcub { if (debug) // CMS: improved debug message - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", device, @@ -584,31 +606,29 @@ namespace notcub { // First set to specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + cudaCheck(error = cudaSetDevice(device)); } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)); } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); if (!recached) { // Free the allocation from the runtime and cleanup the event. - if (CubDebug(error = cudaFree(d_ptr))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaFree(d_ptr)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); if (debug) // CMS: improved debug message - _CubLog( + printf( "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", device, @@ -624,8 +644,8 @@ namespace notcub { // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; @@ -647,8 +667,8 @@ namespace notcub { cudaError_t error = cudaSuccess; int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; - - mutex.Lock(); + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex); while (!cached_blocks.empty()) { // Get first block @@ -656,28 +676,31 @@ namespace notcub { // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaGetDevice(&entrypoint_device))) break; } // Set current device ordinal if necessary if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaSetDevice(begin->device))) break; current_device = begin->device; } // Free device memory - if (CubDebug(error = cudaFree(begin->d_ptr))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaFree(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) + if ((error = cudaEventDestroy(begin->ready_event))) break; // Reduce balance and erase entry cached_bytes[current_device].free -= begin->bytes; if (debug) - _CubLog( + printf( "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", current_device, @@ -690,21 +713,28 @@ namespace notcub { cached_blocks.erase(begin); } - mutex.Unlock(); + mutex_locker.unlock(); // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; } + // CMS: give access to cache allocation status + GpuCachedBytes CacheStatus() const { + std::unique_lock mutex_locker(mutex); + return cached_bytes; + } + /** * \brief Destructor */ - virtual ~CachingDeviceAllocator() { + // CMS: make the destructor not virtual + ~CachingDeviceAllocator() { if (!skip_cleanup) FreeAllCached(); } diff --git a/src/cudatest/CUDACore/CachingHostAllocator.h b/src/cudatest/CUDACore/CachingHostAllocator.h index 53901e1f1..a206b2da1 100644 --- a/src/cudatest/CUDACore/CachingHostAllocator.h +++ b/src/cudatest/CUDACore/CachingHostAllocator.h @@ -41,9 +41,9 @@ #include #include #include +#include -#include -#include +#include "CUDACore/cudaCheck.h" /// CUB namespace namespace notcub { @@ -212,7 +212,7 @@ namespace notcub { // Fields //--------------------------------------------------------------------- - cub::Mutex mutex; /// Mutex for thread-safety + std::mutex mutex; /// Mutex for thread-safety unsigned int bin_growth; /// Geometric growth factor for bin-sizes unsigned int min_bin; /// Minimum bin enumeration @@ -291,17 +291,17 @@ namespace notcub { */ void SetMaxCachedBytes(size_t max_cached_bytes) { // Lock - mutex.Lock(); + std::unique_lock mutex_locker(mutex); if (debug) - _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", - (long long)this->max_cached_bytes, - (long long)max_cached_bytes); + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); this->max_cached_bytes = max_cached_bytes; - // Unlock - mutex.Unlock(); + // Unlock (redundant, kept for style uniformity) + mutex_locker.unlock(); } /** @@ -314,12 +314,12 @@ namespace notcub { size_t bytes, ///< [in] Minimum number of bytes for the allocation cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation { + std::unique_lock mutex_locker(mutex, std::defer_lock); *d_ptr = nullptr; int device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; - if (CubDebug(error = cudaGetDevice(&device))) - return error; + cudaCheck(error = cudaGetDevice(&device)); // Create a block descriptor for the requested allocation bool found = false; @@ -336,7 +336,7 @@ namespace notcub { search_key.bytes = bytes; } else { // Search for a suitable cached allocation: lock - mutex.Lock(); + mutex_locker.lock(); if (search_key.bin < min_bin) { // Bin is less than minimum bin: round up @@ -356,14 +356,10 @@ namespace notcub { search_key.associated_stream = active_stream; if (search_key.device != device) { // If "associated" device changes, need to re-create the event on the right device - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; + cudaCheck(error = cudaSetDevice(search_key.device)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); + cudaCheck(error = cudaSetDevice(device)); + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); search_key.device = device; } @@ -374,7 +370,7 @@ namespace notcub { cached_bytes.live += search_key.bytes; if (debug) - _CubLog( + printf( "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld " "(previously associated with stream %lld, event %lld).\n", search_key.d_ptr, @@ -393,18 +389,18 @@ namespace notcub { } // Done searching: unlock - mutex.Unlock(); + mutex_locker.unlock(); } // Allocate the block if necessary if (!found) { // Attempt to allocate // TODO: eventually support allocation flags - if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == + if ((error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry if (debug) - _CubLog( + printf( "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached " "allocations", (long long)search_key.bytes, @@ -415,7 +411,7 @@ namespace notcub { cudaGetLastError(); // Reset CUDART's error // Lock - mutex.Lock(); + mutex_locker.lock(); // Iterate the range of free blocks CachedBlocks::iterator block_itr = cached_blocks.begin(); @@ -426,16 +422,16 @@ namespace notcub { // on the current device // Free pinned host memory. - if (CubDebug(error = cudaFreeHost(block_itr->d_ptr))) + if ((error = cudaFreeHost(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) + if ((error = cudaEventDestroy(block_itr->ready_event))) break; // Reduce balance and erase entry cached_bytes.free -= block_itr->bytes; if (debug) - _CubLog( + printf( "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", (long long)block_itr->bytes, @@ -450,29 +446,27 @@ namespace notcub { } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); // Return under error if (error) return error; // Try to allocate again - if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault))) - return error; + cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)); } // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); // Insert into live blocks - mutex.Lock(); + mutex_locker.lock(); live_blocks.insert(search_key); cached_bytes.live += search_key.bytes; - mutex.Unlock(); + mutex_locker.unlock(); if (debug) - _CubLog( + printf( "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device " "%lld).\n", search_key.d_ptr, @@ -486,11 +480,11 @@ namespace notcub { *d_ptr = search_key.d_ptr; if (debug) - _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); return error; } @@ -505,7 +499,7 @@ namespace notcub { cudaError_t error = cudaSuccess; // Lock - mutex.Lock(); + std::unique_lock mutex_locker(mutex); // Find corresponding block descriptor bool recached = false; @@ -525,7 +519,7 @@ namespace notcub { cached_bytes.free += search_key.bytes; if (debug) - _CubLog( + printf( "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld " "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", (long long)search_key.bytes, @@ -539,31 +533,26 @@ namespace notcub { } } - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; + cudaCheck(error = cudaGetDevice(&entrypoint_device)); if (entrypoint_device != search_key.device) { - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; + cudaCheck(error = cudaSetDevice(search_key.device)); } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) - return error; + cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)); } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); if (!recached) { // Free the allocation from the runtime and cleanup the event. - if (CubDebug(error = cudaFreeHost(d_ptr))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; + cudaCheck(error = cudaFreeHost(d_ptr)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); if (debug) - _CubLog( + printf( "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", (long long)search_key.bytes, @@ -578,8 +567,7 @@ namespace notcub { // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; @@ -593,7 +581,7 @@ namespace notcub { int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; - mutex.Lock(); + std::unique_lock mutex_locker(mutex); while (!cached_blocks.empty()) { // Get first block @@ -601,28 +589,28 @@ namespace notcub { // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + if ((error = cudaGetDevice(&entrypoint_device))) break; } // Set current device ordinal if necessary if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) + if ((error = cudaSetDevice(begin->device))) break; current_device = begin->device; } // Free host memory - if (CubDebug(error = cudaFreeHost(begin->d_ptr))) + if ((error = cudaFreeHost(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) + if ((error = cudaEventDestroy(begin->ready_event))) break; // Reduce balance and erase entry cached_bytes.free -= begin->bytes; if (debug) - _CubLog( + printf( "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", (long long)begin->bytes, @@ -634,12 +622,11 @@ namespace notcub { cached_blocks.erase(begin); } - mutex.Unlock(); + mutex_locker.unlock(); // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; diff --git a/src/cudatest/CUDACore/ScopedContext.cc b/src/cudatest/CUDACore/ScopedContext.cc index b2dc58f32..14bff04eb 100644 --- a/src/cudatest/CUDACore/ScopedContext.cc +++ b/src/cudatest/CUDACore/ScopedContext.cc @@ -91,7 +91,7 @@ namespace cms::cuda { ScopedContextAcquire::~ScopedContextAcquire() { holderHelper_.enqueueCallback(device(), stream()); if (contextState_) { - contextState_->set(device(), std::move(streamPtr())); + contextState_->set(device(), streamPtr()); } } diff --git a/src/cudatest/CUDACore/deviceAllocatorStatus.cc b/src/cudatest/CUDACore/deviceAllocatorStatus.cc new file mode 100644 index 000000000..5d4a0ca09 --- /dev/null +++ b/src/cudatest/CUDACore/deviceAllocatorStatus.cc @@ -0,0 +1,7 @@ +#include "CUDACore/deviceAllocatorStatus.h" + +#include "getCachingDeviceAllocator.h" + +namespace cms::cuda { + allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); } +} // namespace cms::cuda diff --git a/src/cudatest/CUDACore/deviceAllocatorStatus.h b/src/cudatest/CUDACore/deviceAllocatorStatus.h new file mode 100644 index 000000000..92f9f87e8 --- /dev/null +++ b/src/cudatest/CUDACore/deviceAllocatorStatus.h @@ -0,0 +1,23 @@ +#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h +#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h + +#include + +namespace cms { + namespace cuda { + namespace allocator { + struct TotalBytes { + size_t free; + size_t live; + size_t liveRequested; // CMS: monitor also requested amount + TotalBytes() { free = live = liveRequested = 0; } + }; + /// Map type of device ordinals to the number of cached bytes cached by each device + using GpuCachedBytes = std::map; + } // namespace allocator + + allocator::GpuCachedBytes deviceAllocatorStatus(); + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudatest/CUDACore/getCachingDeviceAllocator.h b/src/cudatest/CUDACore/getCachingDeviceAllocator.h index f959a50f5..b467d4b90 100644 --- a/src/cudatest/CUDACore/getCachingDeviceAllocator.h +++ b/src/cudatest/CUDACore/getCachingDeviceAllocator.h @@ -1,6 +1,9 @@ #ifndef HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator #define HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator +#include +#include + #include "CUDACore/cudaCheck.h" #include "CUDACore/deviceCount.h" #include "CachingDeviceAllocator.h" @@ -9,11 +12,11 @@ namespace cms::cuda::allocator { // Use caching or not constexpr bool useCaching = true; // Growth factor (bin_growth in cub::CachingDeviceAllocator - constexpr unsigned int binGrowth = 8; + constexpr unsigned int binGrowth = 2; // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator - constexpr unsigned int minBin = 1; + constexpr unsigned int minBin = 8; // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. - constexpr unsigned int maxBin = 10; + constexpr unsigned int maxBin = 30; // Total storage for the allocator. 0 means no limit. constexpr size_t maxCachedBytes = 0; // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. @@ -39,6 +42,27 @@ namespace cms::cuda::allocator { } inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() { + if (debug) { + std::cout << "cub::CachingDeviceAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + // the public interface is thread safe static notcub::CachingDeviceAllocator allocator{binGrowth, minBin, diff --git a/src/cudatest/CUDACore/getCachingHostAllocator.h b/src/cudatest/CUDACore/getCachingHostAllocator.h index 601374685..d29080795 100644 --- a/src/cudatest/CUDACore/getCachingHostAllocator.h +++ b/src/cudatest/CUDACore/getCachingHostAllocator.h @@ -1,6 +1,9 @@ #ifndef HeterogeneousCore_CUDACore_src_getCachingHostAllocator #define HeterogeneousCore_CUDACore_src_getCachingHostAllocator +#include +#include + #include "CUDACore/cudaCheck.h" #include "CachingHostAllocator.h" @@ -8,6 +11,27 @@ namespace cms::cuda::allocator { inline notcub::CachingHostAllocator& getCachingHostAllocator() { + if (debug) { + std::cout << "cub::CachingHostAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + // the public interface is thread safe static notcub::CachingHostAllocator allocator{binGrowth, minBin, diff --git a/src/cudatest/Makefile b/src/cudatest/Makefile index 6b46bf24c..6611e3367 100644 --- a/src/cudatest/Makefile +++ b/src/cudatest/Makefile @@ -123,7 +123,7 @@ $(OBJ_DIR)/$(TARGET_NAME)/bin/%.cc.o: $(SRC_DIR)/$(TARGET_NAME)/bin/%.cc # Tests $(OBJ_DIR)/$(TARGET_NAME)/test/%.cc.o: $(SRC_DIR)/$(TARGET_NAME)/test/%.cc @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(TEST_CXXFLAGS) $(MY_CXXFLAGS) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + $(CXX) $(CXXFLAGS) $(CUDA_TEST_CXXFLAGS) $(MY_CXXFLAGS) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD @cp $(@D)/$*.cc.d $(@D)/$*.cc.d.tmp; \ sed 's#\($(TARGET_NAME)/$*\)\.o[ :]*#\1.o \1.d : #g' < $(@D)/$*.cc.d.tmp > $(@D)/$*.cc.d; \ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ diff --git a/src/cudatest/Makefile.deps b/src/cudatest/Makefile.deps index ebf9e9b49..331540db3 100644 --- a/src/cudatest/Makefile.deps +++ b/src/cudatest/Makefile.deps @@ -1,4 +1,4 @@ -cudatest_EXTERNAL_DEPENDS := TBB CUDA CUB +cudatest_EXTERNAL_DEPENDS := TBB CUDA CUDACore_DEPENDS := Framework Test1_DEPENDS := Framework CUDACore DataFormats Test2_DEPENDS := Framework CUDACore