diff --git a/Makefile b/Makefile index 025942349..a0aa5b5b0 100644 --- a/Makefile +++ b/Makefile @@ -197,7 +197,7 @@ TARGETS_ALL := $(notdir $(wildcard $(SRC_DIR)/*)) # Temporarily filter out programs that do not build (yet) with CUDA 11 TARGETS_ALL := $(filter-out alpakatest alpaka,$(TARGETS_ALL)) # Temporarily filter out programs that do not run (yet) with CUDA 11 -TARGETS_ALL := $(filter-out cuda cudauvm,$(TARGETS_ALL)) +TARGETS_ALL := $(filter-out cudauvm,$(TARGETS_ALL)) # Split targets by required toolchain TARGETS_GCC := fwtest diff --git a/README.md b/README.md index df8df18d5..bd21906eb 100644 --- a/README.md +++ b/README.md @@ -22,23 +22,46 @@ The purpose of this package is to explore various performance portability solutions with the [Patatrack](https://patatrack.web.cern.ch/patatrack/wiki/) pixel tracking application. The version here corresponds to -[CMSSW_11_1_0_pre4_Patatrack](https://github.com/cms-patatrack/cmssw/tree/CMSSW_11_1_0_pre4_Patatrack). +[CMSSW_11_2_0_pre8_Patatrack](https://github.com/cms-patatrack/cmssw/tree/CMSSW_11_2_0_pre8_Patatrack). -The application is designed to require minimal dependencies on the system: +The application is designed to require minimal dependencies on the system. All programs require * GNU Make, `curl`, `md5sum`, `tar` - * CMake for `kokkostest` and `kokkos` programs -* C++17 capable compiler that works with `nvcc`, in the current setup this pretty much means GCC 8 -* CUDA 11.0 runtime and drivers (real drivers are not needed for building) -* [Intel oneAPI Base Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) - -All other external dependencies (listed below) are downloaded and built automatically. -* [TBB](https://github.com/intel/tbb) (all programs) -* [CUB](https://nvlabs.github.io/cub/) (`cudatest` and `cuda` programs) -* [Eigen](http://eigen.tuxfamily.org/) (`cuda` program) -* [Kokkos](https://github.com/kokkos/kokkos) (`kokkostest` and `kokkos` programs) -* [Boost](https://www.boost.org/) (`alpakatest` and `alpaka` programs) - * Boost libraries from the system can also be used, but they need to be newer than 1.65.1 -* [Alpaka](https://github.com/alpaka-group/alpaka) (`alpakatest` and `alpaka` programs) +* C++17 capable compiler. 
For programs using CUDA that must work with `nvcc`, in the current setup this means GCC 8 or 9, possibly 10 with CUDA 11.1 + * testing is currently done with GCC 8 + +In addition, the individual programs assume the following to be found on the system + +| Application | CMake (>= 3.10) | CUDA 11 runtime and drivers | [Intel oneAPI Base Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) | +|--------------|--------------------|-----------------------------|------------------------------------------------------------------------------------------------------------------| +| `cudatest` | | :heavy_check_mark: | | +| `cuda` | | :heavy_check_mark: | | +| `cudadev` | | :heavy_check_mark: | | +| `cudauvm` | | :heavy_check_mark: | | +| `kokkostest` | :heavy_check_mark: | :heavy_check_mark: | | +| `kokkos` | :heavy_check_mark: | :heavy_check_mark: | | +| `alpakatest` | | :heavy_check_mark: | | +| `alpaka` | | :heavy_check_mark: | | +| `sycltest` | | | :heavy_check_mark: | + + +All other dependencies (listed below) are downloaded and built automatically. + + +| Application | [TBB](https://github.com/intel/tbb) | [Eigen](http://eigen.tuxfamily.org/) | [Kokkos](https://github.com/kokkos/kokkos) | [Boost](https://www.boost.org/) (*) | [Alpaka](https://github.com/alpaka-group/alpaka) | +|--------------|-------------------------------------|--------------------------------------|--------------------------------------------|-------------------------------------|--------------------------------------------------| +| `fwtest` | :heavy_check_mark: | | | | | +| `cudatest` | :heavy_check_mark: | | | | | +| `cuda` | :heavy_check_mark: | :heavy_check_mark: | | | | +| `cudadev` | :heavy_check_mark: | :heavy_check_mark: | | | | +| `cudauvm` | :heavy_check_mark: | :heavy_check_mark: | | | | +| `kokkostest` | :heavy_check_mark: | | :heavy_check_mark: | | | +| `kokkos` | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | +| `alpakatest` | :heavy_check_mark: | | | :heavy_check_mark: | :heavy_check_mark: | +| `alpaka` | :heavy_check_mark: | | | :heavy_check_mark: | :heavy_check_mark: | +| `sycltest` | :heavy_check_mark: | | | | | + + +* (*) Boost libraries from the system can also be used, but they need to be newer than 1.65.1 The input data set consists of a minimal binary dump of 1000 events of ttbar+PU events from of @@ -49,21 +72,23 @@ downloaded automatically during the build process. 
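For example, each application listed in the tables above is built with a `make` target of the same name (a minimal sketch; the authoritative commands and options are in the Quick recipe below, and the run line here assumes the executable is placed at the top of the working tree):

```bash
# Build the frozen CUDA application; any first-column name above is a valid target.
# The other dependencies (TBB, Eigen, ...) and the input data are downloaded and
# built automatically, so only the system packages listed above are needed beforehand.
make -j8 cuda
# Run it (assumed path; see the Quick recipe below for the exact invocation).
./cuda
```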
## Status -| Application | Description | Framework | Device framework | Test code | Raw2Cluster | RecHit | Pixel tracking | Vertex | Transfers to CPU | -|--------------|----------------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| -| `fwtest` | Framework test | :heavy_check_mark: | | :heavy_check_mark: | | | | | | -| `cudatest` | CUDA FW test | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | | | -| `cuda` | CUDA version (frozen) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| `cudadev` | CUDA version (development) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| `cudauvm` | CUDA version with managed memory | :heavy_check_mark: | :heavy_check_mark: | | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| `kokkostest` | Kokkos FW test | :heavy_check_mark: | :white_check_mark: | :heavy_check_mark: | | | | | | -| `kokkos` | Kokkos version | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| `alpakatest` | Alpaka FW test | :heavy_check_mark: | | :white_check_mark: | | | | | | -| `alpaka` | Alpaka version | :white_check_mark: | | | :white_check_mark: | | | | | -| `sycltest` | SYCL/oneAPI FW test | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | | | +| Application | Description | Framework | Device framework | Test code | Raw2Cluster | RecHit | Pixel tracking | Vertex | Transfers to CPU | Validation code | Validated | +|--------------|----------------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| +| `fwtest` | Framework test | :heavy_check_mark: | | :heavy_check_mark: | | | | | | | | +| `cudatest` | CUDA FW test | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | | | | | +| `cuda` | CUDA version (frozen) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| `cudadev` | CUDA version (development) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| `cudauvm` | CUDA version with managed memory | :heavy_check_mark: | :heavy_check_mark: | | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | | +| `kokkostest` | Kokkos FW test | :heavy_check_mark: | :white_check_mark: | :heavy_check_mark: | | | | | | | | +| `kokkos` | Kokkos version | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | +| `alpakatest` | Alpaka FW test | :heavy_check_mark: | | :white_check_mark: | | | | | | | | +| `alpaka` | Alpaka version | :white_check_mark: | | | :white_check_mark: | | | | | | | +| `sycltest` | SYCL/oneAPI FW test | 
:heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | | | | | The "Device framework" refers to a mechanism similar to [`cms::cuda::Product`](src/cuda/CUDACore/Product.h) and [`cms::cuda::ScopedContext`](src/cuda/CUDACore/ScopedContext.h) to support chains of modules to use the same device and the same work queue. +The column "Validated" means that the program produces the same histograms as the reference `cuda` program within numerical precision (judged "by eye"). + ## Quick recipe ```bash @@ -126,11 +151,11 @@ The printouts can be disabled with `-DFWTEST_SILENT` build flag (e.g. `make ... #### `cuda` -This program is frozen to correspond to CMSSW_11_1_0_pre4. +This program is frozen to correspond to CMSSW_11_2_0_pre8_Patatrack. #### `cudadev` -This program currently contains code from CMSSW_11_2_0_pre8_Patatrack. +This program currently contains code from CMSSW_11_2_0_pre8_Patatrack (currently equivalent to `cuda`). #### `cudauvm` diff --git a/src/cuda/CUDACore/AtomicPairCounter.h b/src/cuda/CUDACore/AtomicPairCounter.h index 9864493ad..19b2781e0 100644 --- a/src/cuda/CUDACore/AtomicPairCounter.h +++ b/src/cuda/CUDACore/AtomicPairCounter.h @@ -5,48 +5,54 @@ #include "CUDACore/cudaCompat.h" -class AtomicPairCounter { -public: - using c_type = unsigned long long int; +namespace cms { + namespace cuda { - AtomicPairCounter() {} - AtomicPairCounter(c_type i) { counter.ac = i; } + class AtomicPairCounter { + public: + using c_type = unsigned long long int; - __device__ __host__ AtomicPairCounter& operator=(c_type i) { - counter.ac = i; - return *this; - } + AtomicPairCounter() {} + AtomicPairCounter(c_type i) { counter.ac = i; } - struct Counters { - uint32_t n; // in a "One to Many" association is the number of "One" - uint32_t m; // in a "One to Many" association is the total number of associations - }; + __device__ __host__ AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } - union Atomic2 { - Counters counters; - c_type ac; - }; + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; - static constexpr c_type incr = 1UL << 32; + union Atomic2 { + Counters counters; + c_type ac; + }; - __device__ __host__ Counters get() const { return counter.counters; } + static constexpr c_type incr = 1UL << 32; - // increment n by 1 and m by i. return previous value - __host__ __device__ __forceinline__ Counters add(uint32_t i) { - c_type c = i; - c += incr; - Atomic2 ret; + __device__ __host__ Counters get() const { return counter.counters; } + + // increment n by 1 and m by i. 
return previous value + __host__ __device__ __forceinline__ Counters add(uint32_t i) { + c_type c = i; + c += incr; + Atomic2 ret; #ifdef __CUDA_ARCH__ - ret.ac = atomicAdd(&counter.ac, c); + ret.ac = atomicAdd(&counter.ac, c); #else - ret.ac = counter.ac; - counter.ac += c; + ret.ac = counter.ac; + counter.ac += c; #endif - return ret.counters; - } + return ret.counters; + } + + private: + Atomic2 counter; + }; -private: - Atomic2 counter; -}; + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/src/cuda/CUDACore/CUDAHostAllocator.h b/src/cuda/CUDACore/CUDAHostAllocator.h deleted file mode 100644 index 1cd290f13..000000000 --- a/src/cuda/CUDACore/CUDAHostAllocator.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_CUDAHostAllocator_h -#define HeterogeneousCore_CUDAUtilities_CUDAHostAllocator_h - -#include -#include -#include - -class cuda_bad_alloc : public std::bad_alloc { -public: - cuda_bad_alloc(cudaError_t error) noexcept : error_(error) {} - - const char* what() const noexcept override { return cudaGetErrorString(error_); } - -private: - cudaError_t error_; -}; - -template -class CUDAHostAllocator { -public: - using value_type = T; - - template - struct rebind { - using other = CUDAHostAllocator; - }; - - T* allocate(std::size_t n) const __attribute__((warn_unused_result)) __attribute__((malloc)) - __attribute__((returns_nonnull)) { - void* ptr = nullptr; - cudaError_t status = cudaMallocHost(&ptr, n * sizeof(T), FLAGS); - if (status != cudaSuccess) { - throw cuda_bad_alloc(status); - } - if (ptr == nullptr) { - throw std::bad_alloc(); - } - return static_cast(ptr); - } - - void deallocate(T* p, std::size_t n) const { - cudaError_t status = cudaFreeHost(p); - if (status != cudaSuccess) { - throw cuda_bad_alloc(status); - } - } -}; - -#endif // HeterogeneousCore_CUDAUtilities_CUDAHostAllocator_h diff --git a/src/cuda/CUDACore/CachingDeviceAllocator.h b/src/cuda/CUDACore/CachingDeviceAllocator.h index 075d568f2..50c1ebdb2 100644 --- a/src/cuda/CUDACore/CachingDeviceAllocator.h +++ b/src/cuda/CUDACore/CachingDeviceAllocator.h @@ -41,9 +41,10 @@ #include #include #include +#include -#include -#include +#include "CUDACore/cudaCheck.h" +#include "CUDACore/deviceAllocatorStatus.h" /// CUB namespace namespace notcub { @@ -122,6 +123,7 @@ namespace notcub { struct BlockDescriptor { void *d_ptr; // Device pointer size_t bytes; // Size of allocation in bytes + size_t bytesRequested; // CMS: requested allocation size (for monitoring only) unsigned int bin; // Bin enumeration int device; // device ordinal cudaStream_t associated_stream; // Associated associated_stream @@ -129,12 +131,19 @@ // Constructor (suitable for searching maps for a specific block, given its pointer and device) BlockDescriptor(void *d_ptr, int device) - : d_ptr(d_ptr), bytes(0), bin(INVALID_BIN), device(device), associated_stream(nullptr), ready_event(nullptr) {} + : d_ptr(d_ptr), + bytes(0), + bytesRequested(0), // CMS + bin(INVALID_BIN), + device(device), + associated_stream(nullptr), + ready_event(nullptr) {} // Constructor (suitable for searching maps for a range of suitable blocks, given a device) BlockDescriptor(int device) : d_ptr(nullptr), bytes(0), + bytesRequested(0), // CMS bin(INVALID_BIN), device(device), associated_stream(nullptr), ready_event(nullptr) {} @@ -160,12 +169,7 @@ namespace notcub { /// BlockDescriptor comparator function interface typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); - 
class TotalBytes { - public: - size_t free; - size_t live; - TotalBytes() { free = live = 0; } - }; + // CMS: Moved TotalBytes to deviceAllocatorStatus.h /// Set type for cached blocks (ordered by size) typedef std::multiset CachedBlocks; @@ -174,7 +178,8 @@ namespace notcub { typedef std::multiset BusyBlocks; /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; + // CMS: Moved definition to deviceAllocatorStatus.h + using GpuCachedBytes = cms::cuda::allocator::GpuCachedBytes; //--------------------------------------------------------------------- // Utility functions @@ -219,7 +224,8 @@ namespace notcub { // Fields //--------------------------------------------------------------------- - cub::Mutex mutex; /// Mutex for thread-safety + // CMS: use std::mutex instead of cub::Mutex, declare mutable + mutable std::mutex mutex; /// Mutex for thread-safety unsigned int bin_growth; /// Geometric growth factor for bin-sizes unsigned int min_bin; /// Minimum bin enumeration @@ -298,17 +304,19 @@ namespace notcub { */ cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) { // Lock - mutex.Lock(); + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex); if (debug) - _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", - (long long)this->max_cached_bytes, - (long long)max_cached_bytes); + // CMS: use raw printf + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); this->max_cached_bytes = max_cached_bytes; - // Unlock - mutex.Unlock(); + // Unlock (redundant, kept for style uniformity) + mutex_locker.unlock(); return cudaSuccess; } @@ -326,19 +334,22 @@ namespace notcub { size_t bytes, ///< [in] Minimum number of bytes for the allocation cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation { + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex, std::defer_lock); *d_ptr = nullptr; int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); device = entrypoint_device; } // Create a block descriptor for the requested allocation bool found = false; BlockDescriptor search_key(device); + search_key.bytesRequested = bytes; // CMS search_key.associated_stream = active_stream; NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); @@ -350,7 +361,7 @@ namespace notcub { search_key.bytes = bytes; } else { // Search for a suitable cached allocation: lock - mutex.Lock(); + mutex_locker.lock(); if (search_key.bin < min_bin) { // Bin is less than minimum bin: round up @@ -376,10 +387,12 @@ namespace notcub { // Remove from free blocks cached_bytes[device].free -= search_key.bytes; cached_bytes[device].live += search_key.bytes; + cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS if (debug) // CMS: improved debug message - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously " "associated with stream %lld, event %lld).\n", device, @@ -398,24 +411,25 @@ namespace notcub { } // Done searching: unlock - mutex.Unlock(); + mutex_locker.unlock(); } // Allocate the block if necessary if (!found) { // Set runtime's current device to 
specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + cudaCheck(error = cudaSetDevice(device)); } // Attempt to allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { + // CMS: silently ignore errors and retry or pass them to the caller + if ((error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry if (debug) - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", device, (long long)search_key.bytes, @@ -425,7 +439,7 @@ namespace notcub { cudaGetLastError(); // Reset CUDART's error // Lock - mutex.Lock(); + mutex_locker.lock(); // Iterate the range of free blocks on the same device BlockDescriptor free_key(device); @@ -437,16 +451,18 @@ namespace notcub { // on the current device // Free device memory and destroy stream event. - if (CubDebug(error = cudaFree(block_itr->d_ptr))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaFree(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) + if ((error = cudaEventDestroy(block_itr->ready_event))) break; // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; if (debug) - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " "(%lld bytes) outstanding.\n", device, @@ -462,41 +478,42 @@ namespace notcub { } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); // Return under error if (error) return error; // Try to allocate again - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)); } // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); // Insert into live blocks - mutex.Lock(); + mutex_locker.lock(); live_blocks.insert(search_key); cached_bytes[device].live += search_key.bytes; - mutex.Unlock(); + cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS + mutex_locker.unlock(); if (debug) // CMS: improved debug message - _CubLog( - "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n", - device, - search_key.d_ptr, - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event); + // CMS: use raw printf + printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n", + device, + search_key.d_ptr, + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.ready_event); // Attempt to revert back to previous device if necessary if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + // CMS: throw exception on error 
+ cudaCheck(error = cudaSetDevice(entrypoint_device)); } } @@ -504,11 +521,12 @@ namespace notcub { *d_ptr = search_key.d_ptr; if (debug) - _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long)cached_blocks.size(), - (long long)cached_bytes[device].free, - (long long)live_blocks.size(), - (long long)cached_bytes[device].live); + // CMS: use raw printf + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); return error; } @@ -538,15 +556,17 @@ namespace notcub { cudaError_t DeviceFree(int device, void *d_ptr) { int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex, std::defer_lock); if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); device = entrypoint_device; } // Lock - mutex.Lock(); + mutex_locker.lock(); // Find corresponding block descriptor bool recached = false; @@ -557,6 +577,7 @@ namespace notcub { search_key = *block_itr; live_blocks.erase(block_itr); cached_bytes[device].live -= search_key.bytes; + cached_bytes[device].liveRequested -= search_key.bytesRequested; // CMS // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { @@ -567,7 +588,8 @@ namespace notcub { if (debug) // CMS: improved debug message - _CubLog( + // CMS: use raw printf + printf( "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", device, @@ -584,31 +606,29 @@ namespace notcub { // First set to specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + cudaCheck(error = cudaSetDevice(device)); } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)); } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); if (!recached) { // Free the allocation from the runtime and cleanup the event. 
- if (CubDebug(error = cudaFree(d_ptr))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaFree(d_ptr)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); if (debug) // CMS: improved debug message - _CubLog( + printf( "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", device, @@ -624,8 +644,8 @@ namespace notcub { // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; @@ -647,8 +667,8 @@ namespace notcub { cudaError_t error = cudaSuccess; int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; - - mutex.Lock(); + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex); while (!cached_blocks.empty()) { // Get first block @@ -656,28 +676,31 @@ namespace notcub { // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaGetDevice(&entrypoint_device))) break; } // Set current device ordinal if necessary if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaSetDevice(begin->device))) break; current_device = begin->device; } // Free device memory - if (CubDebug(error = cudaFree(begin->d_ptr))) + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaFree(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) + if ((error = cudaEventDestroy(begin->ready_event))) break; // Reduce balance and erase entry cached_bytes[current_device].free -= begin->bytes; if (debug) - _CubLog( + printf( "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", current_device, @@ -690,21 +713,28 @@ namespace notcub { cached_blocks.erase(begin); } - mutex.Unlock(); + mutex_locker.unlock(); // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; } + // CMS: give access to cache allocation status + GpuCachedBytes CacheStatus() const { + std::unique_lock mutex_locker(mutex); + return cached_bytes; + } + /** * \brief Destructor */ - virtual ~CachingDeviceAllocator() { + // CMS: make the destructor not virtual + ~CachingDeviceAllocator() { if (!skip_cleanup) FreeAllCached(); } diff --git a/src/cuda/CUDACore/CachingHostAllocator.h b/src/cuda/CUDACore/CachingHostAllocator.h index 53901e1f1..a206b2da1 100644 --- a/src/cuda/CUDACore/CachingHostAllocator.h +++ b/src/cuda/CUDACore/CachingHostAllocator.h @@ -41,9 +41,9 @@ #include #include #include +#include -#include -#include +#include "CUDACore/cudaCheck.h" /// CUB namespace namespace notcub { @@ -212,7 +212,7 @@ namespace notcub { // Fields 
//--------------------------------------------------------------------- - cub::Mutex mutex; /// Mutex for thread-safety + std::mutex mutex; /// Mutex for thread-safety unsigned int bin_growth; /// Geometric growth factor for bin-sizes unsigned int min_bin; /// Minimum bin enumeration @@ -291,17 +291,17 @@ namespace notcub { */ void SetMaxCachedBytes(size_t max_cached_bytes) { // Lock - mutex.Lock(); + std::unique_lock mutex_locker(mutex); if (debug) - _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", - (long long)this->max_cached_bytes, - (long long)max_cached_bytes); + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); this->max_cached_bytes = max_cached_bytes; - // Unlock - mutex.Unlock(); + // Unlock (redundant, kept for style uniformity) + mutex_locker.unlock(); } /** @@ -314,12 +314,12 @@ namespace notcub { size_t bytes, ///< [in] Minimum number of bytes for the allocation cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation { + std::unique_lock mutex_locker(mutex, std::defer_lock); *d_ptr = nullptr; int device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; - if (CubDebug(error = cudaGetDevice(&device))) - return error; + cudaCheck(error = cudaGetDevice(&device)); // Create a block descriptor for the requested allocation bool found = false; @@ -336,7 +336,7 @@ namespace notcub { search_key.bytes = bytes; } else { // Search for a suitable cached allocation: lock - mutex.Lock(); + mutex_locker.lock(); if (search_key.bin < min_bin) { // Bin is less than minimum bin: round up @@ -356,14 +356,10 @@ namespace notcub { search_key.associated_stream = active_stream; if (search_key.device != device) { // If "associated" device changes, need to re-create the event on the right device - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; + cudaCheck(error = cudaSetDevice(search_key.device)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); + cudaCheck(error = cudaSetDevice(device)); + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); search_key.device = device; } @@ -374,7 +370,7 @@ namespace notcub { cached_bytes.live += search_key.bytes; if (debug) - _CubLog( + printf( "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld " "(previously associated with stream %lld, event %lld).\n", search_key.d_ptr, @@ -393,18 +389,18 @@ namespace notcub { } // Done searching: unlock - mutex.Unlock(); + mutex_locker.unlock(); } // Allocate the block if necessary if (!found) { // Attempt to allocate // TODO: eventually support allocation flags - if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == + if ((error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry if (debug) - _CubLog( + printf( "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached " "allocations", (long long)search_key.bytes, @@ -415,7 +411,7 @@ namespace notcub { cudaGetLastError(); // Reset CUDART's error // Lock - mutex.Lock(); + 
mutex_locker.lock(); // Iterate the range of free blocks CachedBlocks::iterator block_itr = cached_blocks.begin(); @@ -426,16 +422,16 @@ namespace notcub { // on the current device // Free pinned host memory. - if (CubDebug(error = cudaFreeHost(block_itr->d_ptr))) + if ((error = cudaFreeHost(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) + if ((error = cudaEventDestroy(block_itr->ready_event))) break; // Reduce balance and erase entry cached_bytes.free -= block_itr->bytes; if (debug) - _CubLog( + printf( "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", (long long)block_itr->bytes, @@ -450,29 +446,27 @@ namespace notcub { } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); // Return under error if (error) return error; // Try to allocate again - if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault))) - return error; + cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)); } // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); // Insert into live blocks - mutex.Lock(); + mutex_locker.lock(); live_blocks.insert(search_key); cached_bytes.live += search_key.bytes; - mutex.Unlock(); + mutex_locker.unlock(); if (debug) - _CubLog( + printf( "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device " "%lld).\n", search_key.d_ptr, @@ -486,11 +480,11 @@ namespace notcub { *d_ptr = search_key.d_ptr; if (debug) - _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); return error; } @@ -505,7 +499,7 @@ namespace notcub { cudaError_t error = cudaSuccess; // Lock - mutex.Lock(); + std::unique_lock mutex_locker(mutex); // Find corresponding block descriptor bool recached = false; @@ -525,7 +519,7 @@ namespace notcub { cached_bytes.free += search_key.bytes; if (debug) - _CubLog( + printf( "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld " "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", (long long)search_key.bytes, @@ -539,31 +533,26 @@ namespace notcub { } } - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; + cudaCheck(error = cudaGetDevice(&entrypoint_device)); if (entrypoint_device != search_key.device) { - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; + cudaCheck(error = cudaSetDevice(search_key.device)); } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) - return error; + cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)); } // Unlock - mutex.Unlock(); + mutex_locker.unlock(); if (!recached) { // Free the allocation from the runtime and cleanup the event. 
- if (CubDebug(error = cudaFreeHost(d_ptr))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; + cudaCheck(error = cudaFreeHost(d_ptr)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); if (debug) - _CubLog( + printf( "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", (long long)search_key.bytes, @@ -578,8 +567,7 @@ namespace notcub { // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; @@ -593,7 +581,7 @@ namespace notcub { int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; - mutex.Lock(); + std::unique_lock mutex_locker(mutex); while (!cached_blocks.empty()) { // Get first block @@ -601,28 +589,28 @@ namespace notcub { // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + if ((error = cudaGetDevice(&entrypoint_device))) break; } // Set current device ordinal if necessary if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) + if ((error = cudaSetDevice(begin->device))) break; current_device = begin->device; } // Free host memory - if (CubDebug(error = cudaFreeHost(begin->d_ptr))) + if ((error = cudaFreeHost(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) + if ((error = cudaEventDestroy(begin->ready_event))) break; // Reduce balance and erase entry cached_bytes.free -= begin->bytes; if (debug) - _CubLog( + printf( "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", (long long)begin->bytes, @@ -634,12 +622,11 @@ namespace notcub { cached_blocks.erase(begin); } - mutex.Unlock(); + mutex_locker.unlock(); // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; + cudaCheck(error = cudaSetDevice(entrypoint_device)); } return error; diff --git a/src/cuda/CUDACore/GPUSimpleVector.h b/src/cuda/CUDACore/GPUSimpleVector.h deleted file mode 100644 index 7acea74d9..000000000 --- a/src/cuda/CUDACore/GPUSimpleVector.h +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h -#define HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h - -// author: Felice Pantaleo, CERN, 2018 - -#include -#include - -#include "CUDACore/cudaCompat.h" - -namespace GPU { - template - struct SimpleVector { - constexpr SimpleVector() = default; - - // ownership of m_data stays within the caller - constexpr void construct(int capacity, T *data) { - m_size = 0; - m_capacity = capacity; - m_data = data; - } - - inline constexpr int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } - } - - template - constexpr int emplace_back_unsafe(Ts &&... 
args) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } - } - - __device__ inline T &back() { return m_data[m_size - 1]; } - - __device__ inline const T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - template - __device__ int emplace_back(Ts &&... args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - // thread safe version of resize - __device__ int extend(int size = 1) { - auto previousSize = atomicAdd(&m_size, size); - if (previousSize < m_capacity) { - return previousSize; - } else { - atomicSub(&m_size, size); - return -1; - } - } - - __device__ int shrink(int size = 1) { - auto previousSize = atomicSub(&m_size, size); - if (previousSize >= size) { - return previousSize - size; - } else { - atomicAdd(&m_size, size); - return -1; - } - } - - inline constexpr bool empty() const { return m_size <= 0; } - inline constexpr bool full() const { return m_size >= m_capacity; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline constexpr int size() const { return m_size; } - inline constexpr int capacity() const { return m_capacity; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr void set_data(T *data) { m_data = data; } - - private: - int m_size; - int m_capacity; - - T *m_data; - }; - - // ownership of m_data stays within the caller - template - SimpleVector make_SimpleVector(int capacity, T *data) { - SimpleVector ret; - ret.construct(capacity, data); - return ret; - } - - // ownership of m_data stays within the caller - template - SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { - auto ret = new (mem) SimpleVector(); - ret->construct(capacity, data); - return ret; - } - -} // namespace GPU - -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h diff --git a/src/cuda/CUDACore/GPUVecArray.h b/src/cuda/CUDACore/GPUVecArray.h deleted file mode 100644 index 79fdfa604..000000000 --- a/src/cuda/CUDACore/GPUVecArray.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h -#define HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h - -// -// Author: Felice Pantaleo, CERN -// - -#include "CUDACore/cudaCompat.h" - -namespace GPU { - - template - class VecArray { - public: - using self = VecArray; - using value_t = T; - - inline constexpr int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } - } - - template - constexpr int emplace_back_unsafe(Ts &&... 
args) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } - } - - inline constexpr T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - template - __device__ int emplace_back(Ts &&... args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - inline constexpr T pop_back() { - if (m_size > 0) { - auto previousSize = m_size--; - return m_data[previousSize - 1]; - } else - return T(); - } - - inline constexpr T const *begin() const { return m_data; } - inline constexpr T const *end() const { return m_data + m_size; } - inline constexpr T *begin() { return m_data; } - inline constexpr T *end() { return m_data + m_size; } - inline constexpr int size() const { return m_size; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline static constexpr int capacity() { return maxSize; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr bool empty() const { return 0 == m_size; } - inline constexpr bool full() const { return maxSize == m_size; } - - private: - T m_data[maxSize]; - - int m_size; - }; - -} // namespace GPU - -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h diff --git a/src/cuda/CUDACore/HistoContainer.h b/src/cuda/CUDACore/HistoContainer.h index 5ff12f387..c2ac3308d 100644 --- a/src/cuda/CUDACore/HistoContainer.h +++ b/src/cuda/CUDACore/HistoContainer.h @@ -9,10 +9,6 @@ #include #include -#ifdef __CUDACC__ -#include -#endif - #include "CUDACore/AtomicPairCounter.h" #include "CUDACore/cudaCheck.h" #include "CUDACore/cuda_assert.h" @@ -55,54 +51,52 @@ namespace cms { } template - inline void launchZero(Histo *__restrict__ h, - cudaStream_t stream + inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif ) { - uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); + assert(size >= int(sizeof(uint32_t) * Histo::totbins())); #ifdef __CUDACC__ - cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream)); + cudaCheck(cudaMemsetAsync(poff, 0, size, stream)); #else - ::memset(off, 0, 4 * Histo::totbins()); + ::memset(poff, 0, size); #endif } template - inline void launchFinalize(Histo *__restrict__ h, - uint8_t *__restrict__ ws + inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault -#endif - , - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif ) { #ifdef __CUDACC__ - assert(ws); - uint32_t *off 
= (uint32_t *)((char *)(h) + offsetof(Histo, off)); - size_t wss = Histo::wsSize(); - assert(wss > 0); - CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); + auto nthreads = 1024; + auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>( + poff, poff, Histo::totbins(), ppsws); + cudaCheck(cudaGetLastError()); #else h->finalize(); #endif } template - inline void fillManyFromVector(Histo *__restrict__ h, - uint8_t *__restrict__ ws, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - uint32_t totSize, - int nthreads, - cudaStream_t stream + inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + int nthreads, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif ) { launchZero(h, stream); @@ -110,7 +104,7 @@ auto nblocks = (totSize + nthreads - 1) / nthreads; countFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); - launchFinalize(h, ws, stream); + launchFinalize(h, stream); fillFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); #else @@ -125,216 +119,205 @@ assoc->bulkFinalizeFill(*apc); } - } // namespace cuda -} // namespace cms + // iterate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } -// iteratate over N bins left and right of the one containing "v" -template -__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins() - 1), bs + n); - bs = std::max(0, bs - n); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } -} - -// iteratate over bins containing all values in window wmin, wmax -template -__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } -} - -template -class HistoContainer { -public: - using Counter = uint32_t; - - using CountersOnly = HistoContainer; - - using index_type = I; - using UT = typename std::make_unsigned::type; - - static constexpr uint32_t ilog2(uint32_t v) { - constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; - constexpr uint32_t s[] = {1, 2, 4, 8, 16}; - - uint32_t r = 0; // result of log2(v) will go here - for (auto i = 4; i >= 0; i--) - if (v & b[i]) { - v >>= s[i]; - r |= s[i]; + // iterate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); } - return r; - } + } - static constexpr uint32_t sizeT() { return S; } - 
static constexpr uint32_t nbins() { return NBINS; } - static constexpr uint32_t nhists() { return NHISTS; } - static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } - static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } - static constexpr uint32_t capacity() { return SIZE; } + template + class HistoContainer { + public: + using Counter = uint32_t; + + using CountersOnly = HistoContainer; + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } - static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr uint32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr uint32_t capacity() { return SIZE; } - __host__ static size_t wsSize() { -#ifdef __CUDACC__ - uint32_t *v = nullptr; - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()); - return temp_storage_bytes; -#else - return 0; -#endif - } + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } - static constexpr UT bin(T t) { - constexpr uint32_t shift = sizeT() - nbits(); - constexpr uint32_t mask = (1 << nbits()) - 1; - return (t >> shift) & mask; - } + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } - __host__ __device__ void zero() { - for (auto &i : off) - i = 0; - } + __host__ __device__ void zero() { + for (auto &i : off) + i = 0; + } - __host__ __device__ void add(CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { + __host__ __device__ __forceinline__ void add(CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { #ifdef __CUDA_ARCH__ - atomicAdd(off + i, co.off[i]); + atomicAdd(off + i, co.off[i]); #else - auto &a = (std::atomic &)(off[i]); - a += co.off[i]; + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; #endif - } - } + } + } - static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { #ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); + return atomicAdd(&x, 1); #else - auto &a = (std::atomic &)(x); - return a++; + auto &a = (std::atomic &)(x); + return a++; #endif - } + } - static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { #ifdef __CUDA_ARCH__ - return atomicSub(&x, 1); + return atomicSub(&x, 1); #else - auto &a = (std::atomic &)(x); - return a--; + auto &a = (std::atomic &)(x); + return a--; #endif - } - - __host__ __device__ __forceinline__ void countDirect(T b) { - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __device__ __host__ __forceinline__ int32_t 
bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } - - __device__ __host__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } - - __device__ __host__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; - if (m >= nbins()) { // overflow! - off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } - auto first = m + blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { - off[i] = n; - } - } - - __host__ __device__ __forceinline__ void count(T t) { - uint32_t b = bin(t); - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fill(T t, index_type j) { - uint32_t b = bin(t); - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __device__ __host__ __forceinline__ void finalize(Counter *ws = nullptr) { - assert(off[totbins() - 1] == 0); - blockPrefixScan(off, totbins(), ws); - assert(off[totbins() - 1] == off[totbins() - 2]); - } - - constexpr auto size() const { return uint32_t(off[totbins() - 1]); } - constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } - - constexpr index_type const *begin() const { return bins; } - constexpr index_type const *end() const { return begin() + size(); } - - constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } - constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } - - Counter off[totbins()]; - index_type bins[capacity()]; -}; - -template -using OneToManyAssoc = HistoContainer; + } + + __host__ __device__ __forceinline__ void countDirect(T b) { + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! 
+ off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void count(T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + int32_t psws; // prefix-scan working space + index_type bins[capacity()]; + }; + + template + using OneToManyAssoc = HistoContainer; + + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/src/cuda/CUDACore/HostAllocator.h b/src/cuda/CUDACore/HostAllocator.h new file mode 100644 index 000000000..19c86e31f --- /dev/null +++ b/src/cuda/CUDACore/HostAllocator.h @@ -0,0 +1,55 @@ +#ifndef HeterogeneousCore_CUDAUtilities_HostAllocator_h +#define HeterogeneousCore_CUDAUtilities_HostAllocator_h + +#include +#include +#include + +namespace cms { + namespace cuda { + + class bad_alloc : public std::bad_alloc { + public: + bad_alloc(cudaError_t error) noexcept : error_(error) {} + + const char* what() const noexcept override { return cudaGetErrorString(error_); } + + private: + cudaError_t error_; + }; + + template + class HostAllocator { + public: + using value_type = T; + + template + struct rebind { + using other = HostAllocator; + }; + + T* allocate(std::size_t n) const __attribute__((warn_unused_result)) __attribute__((malloc)) + __attribute__((returns_nonnull)) { + void* ptr = nullptr; + cudaError_t status = cudaMallocHost(&ptr, n * sizeof(T), FLAGS); + if (status != cudaSuccess) { + throw bad_alloc(status); + } + if (ptr == nullptr) { + throw std::bad_alloc(); + } + return static_cast(ptr); + } + + void deallocate(T* p, std::size_t n) const { + cudaError_t status = cudaFreeHost(p); + if (status != cudaSuccess) { + throw bad_alloc(status); + } + } + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_HostAllocator_h diff --git a/src/cuda/CUDACore/HostProduct.h b/src/cuda/CUDACore/HostProduct.h index 542ea2e30..6aaa38e3e 100644 --- a/src/cuda/CUDACore/HostProduct.h +++ b/src/cuda/CUDACore/HostProduct.h @@ -19,7 +19,7 @@ class HostProduct { auto const& 
operator*() const { return *get(); } - auto const* operator-> () const { return get(); } + auto const* operator->() const { return get(); } private: cms::cuda::host::unique_ptr hm_ptr; //! diff --git a/src/cuda/CUDACore/ScopedContext.cc b/src/cuda/CUDACore/ScopedContext.cc index b2dc58f32..14bff04eb 100644 --- a/src/cuda/CUDACore/ScopedContext.cc +++ b/src/cuda/CUDACore/ScopedContext.cc @@ -91,7 +91,7 @@ namespace cms::cuda { ScopedContextAcquire::~ScopedContextAcquire() { holderHelper_.enqueueCallback(device(), stream()); if (contextState_) { - contextState_->set(device(), std::move(streamPtr())); + contextState_->set(device(), streamPtr()); } } diff --git a/src/cuda/CUDACore/SimpleVector.h b/src/cuda/CUDACore/SimpleVector.h new file mode 100644 index 000000000..f21f51cf8 --- /dev/null +++ b/src/cuda/CUDACore/SimpleVector.h @@ -0,0 +1,141 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h +#define HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 + +#include +#include + +#include "CUDACore/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + __device__ inline T &back() { return m_data[m_size - 1]; } + + __device__ inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... 
args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + // thread safe version of resize + __device__ int extend(int size = 1) { + auto previousSize = atomicAdd(&m_size, size); + if (previousSize < m_capacity) { + return previousSize; + } else { + atomicSub(&m_size, size); + return -1; + } + } + + __device__ int shrink(int size = 1) { + auto previousSize = atomicSub(&m_size, size); + if (previousSize >= size) { + return previousSize - size; + } else { + atomicAdd(&m_size, size); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h diff --git a/src/cuda/CUDACore/VecArray.h b/src/cuda/CUDACore/VecArray.h new file mode 100644 index 000000000..595238ecd --- /dev/null +++ b/src/cuda/CUDACore/VecArray.h @@ -0,0 +1,106 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_VecArray_h +#define HeterogeneousCore_CUDAUtilities_interface_VecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include "CUDACore/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... 
args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_VecArray_h diff --git a/src/cuda/CUDACore/copyAsync.h b/src/cuda/CUDACore/copyAsync.h index b850b9ff4..47e55c74a 100644 --- a/src/cuda/CUDACore/copyAsync.h +++ b/src/cuda/CUDACore/copyAsync.h @@ -3,13 +3,16 @@ #include "CUDACore/cudaCheck.h" #include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_noncached_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" #include namespace cms { namespace cuda { + // Single element + template inline void copyAsync(device::unique_ptr& dst, const host::unique_ptr& src, cudaStream_t stream) { // Shouldn't compile for array types because of sizeof(T), but @@ -19,6 +22,15 @@ namespace cms { cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); } + template + inline void copyAsync(device::unique_ptr& dst, const host::noncached::unique_ptr& src, cudaStream_t stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array::value == false, + "For array types, use the other overload with the size parameter"); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); + } + template inline void copyAsync(host::unique_ptr& dst, const device::unique_ptr& src, cudaStream_t stream) { static_assert(std::is_array::value == false, @@ -27,6 +39,7 @@ namespace cms { } // Multiple elements + template inline void copyAsync(device::unique_ptr& dst, const host::unique_ptr& src, @@ -35,6 +48,14 @@ namespace cms { cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); } + template + inline void copyAsync(device::unique_ptr& dst, + const host::noncached::unique_ptr& src, + size_t nelements, + cudaStream_t stream) { + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); + } + template inline void copyAsync(host::unique_ptr& dst, const device::unique_ptr& src, diff --git a/src/cuda/CUDACore/cudaCompat.cc b/src/cuda/CUDACore/cudaCompat.cc index 7a93eb2e9..e6bb8069d 100644 --- a/src/cuda/CUDACore/cudaCompat.cc +++ b/src/cuda/CUDACore/cudaCompat.cc @@ -1,13 +1,15 @@ #include "CUDACore/cudaCompat.h" -namespace cudaCompat { - 
thread_local dim3 blockIdx; - thread_local dim3 gridDim; -} // namespace cudaCompat +namespace cms { + namespace cudacompat { + thread_local dim3 blockIdx; + thread_local dim3 gridDim; + } // namespace cudacompat +} // namespace cms namespace { struct InitGrid { - InitGrid() { cudaCompat::resetGrid(); } + InitGrid() { cms::cudacompat::resetGrid(); } }; const InitGrid initGrid; diff --git a/src/cuda/CUDACore/cudaCompat.h b/src/cuda/CUDACore/cudaCompat.h index 32c7d7b48..f9b4b2f8a 100644 --- a/src/cuda/CUDACore/cudaCompat.h +++ b/src/cuda/CUDACore/cudaCompat.h @@ -13,69 +13,78 @@ #include -namespace cudaCompat { +namespace cms { + namespace cudacompat { #ifndef __CUDA_RUNTIME_H__ - struct dim3 { - uint32_t x, y, z; - }; + struct dim3 { + uint32_t x, y, z; + }; #endif - const dim3 threadIdx = {0, 0, 0}; - const dim3 blockDim = {1, 1, 1}; - - extern thread_local dim3 blockIdx; - extern thread_local dim3 gridDim; - - template - T1 atomicInc(T1* a, T2 b) { - auto ret = *a; - if ((*a) < T1(b)) - (*a)++; - return ret; - } - - template - T1 atomicAdd(T1* a, T2 b) { - auto ret = *a; - (*a) += b; - return ret; - } - - template - T1 atomicSub(T1* a, T2 b) { - auto ret = *a; - (*a) -= b; - return ret; - } - - template - T1 atomicMin(T1* a, T2 b) { - auto ret = *a; - *a = std::min(*a, T1(b)); - return ret; - } - template - T1 atomicMax(T1* a, T2 b) { - auto ret = *a; - *a = std::max(*a, T1(b)); - return ret; - } - - inline void __syncthreads() {} - inline void __threadfence() {} - inline bool __syncthreads_or(bool x) { return x; } - inline bool __syncthreads_and(bool x) { return x; } - template - inline T __ldg(T const* x) { - return *x; - } - - inline void resetGrid() { - blockIdx = {0, 0, 0}; - gridDim = {1, 1, 1}; - } - -} // namespace cudaCompat + const dim3 threadIdx = {0, 0, 0}; + const dim3 blockDim = {1, 1, 1}; + + extern thread_local dim3 blockIdx; + extern thread_local dim3 gridDim; + + template + T1 atomicCAS(T1* address, T1 compare, T2 val) { + T1 old = *address; + *address = old == compare ? val : old; + return old; + } + + template + T1 atomicInc(T1* a, T2 b) { + auto ret = *a; + if ((*a) < T1(b)) + (*a)++; + return ret; + } + + template + T1 atomicAdd(T1* a, T2 b) { + auto ret = *a; + (*a) += b; + return ret; + } + + template + T1 atomicSub(T1* a, T2 b) { + auto ret = *a; + (*a) -= b; + return ret; + } + + template + T1 atomicMin(T1* a, T2 b) { + auto ret = *a; + *a = std::min(*a, T1(b)); + return ret; + } + template + T1 atomicMax(T1* a, T2 b) { + auto ret = *a; + *a = std::max(*a, T1(b)); + return ret; + } + + inline void __syncthreads() {} + inline void __threadfence() {} + inline bool __syncthreads_or(bool x) { return x; } + inline bool __syncthreads_and(bool x) { return x; } + template + inline T __ldg(T const* x) { + return *x; + } + + inline void resetGrid() { + blockIdx = {0, 0, 0}; + gridDim = {1, 1, 1}; + } + + } // namespace cudacompat +} // namespace cms // some not needed as done by cuda runtime... 
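// Editor's sketch (not part of the patch): what the re-namespaced cms::cudacompat
// layer buys. When compiled without nvcc, threadIdx stays {0,0,0}, blockDim
// {1,1,1}, and the atomic emulations above are plain read-modify-writes, so a
// grid-stride "kernel" degenerates to a serial loop and can be exercised on the
// host unchanged. The kernel body below is illustrative, not taken from the
// patch; linking needs cudaCompat.cc for the thread_local definitions.
#include <cstdint>
#include <cstdio>

#include "CUDACore/cudaCompat.h"  // on the host this pulls in 'using namespace cms::cudacompat;'

void histo4(uint32_t const* v, uint32_t n, uint32_t* hist) {
  // the same source would run as a __global__ kernel under __CUDA_ARCH__
  auto first = blockIdx.x * blockDim.x + threadIdx.x;       // == 0 on the host
  for (auto i = first; i < n; i += gridDim.x * blockDim.x)  // stride == 1 on the host
    atomicAdd(&hist[v[i] & 3], 1u);                         // ordinary increment on the host
}

int main() {
  uint32_t v[] = {0, 1, 1, 2, 3, 3, 3};
  uint32_t hist[4] = {};
  histo4(v, sizeof(v) / sizeof(v[0]), hist);
  std::printf("%u %u %u %u\n", hist[0], hist[1], hist[2], hist[3]);  // 1 2 1 3
}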
#ifndef __CUDA_RUNTIME_H__ @@ -95,7 +104,7 @@ namespace cudaCompat { #endif #ifndef __CUDA_ARCH__ -using namespace cudaCompat; +using namespace cms::cudacompat; #endif #endif diff --git a/src/cuda/CUDACore/deviceAllocatorStatus.cc b/src/cuda/CUDACore/deviceAllocatorStatus.cc new file mode 100644 index 000000000..5d4a0ca09 --- /dev/null +++ b/src/cuda/CUDACore/deviceAllocatorStatus.cc @@ -0,0 +1,7 @@ +#include "CUDACore/deviceAllocatorStatus.h" + +#include "getCachingDeviceAllocator.h" + +namespace cms::cuda { + allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); } +} // namespace cms::cuda diff --git a/src/cuda/CUDACore/deviceAllocatorStatus.h b/src/cuda/CUDACore/deviceAllocatorStatus.h new file mode 100644 index 000000000..92f9f87e8 --- /dev/null +++ b/src/cuda/CUDACore/deviceAllocatorStatus.h @@ -0,0 +1,23 @@ +#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h +#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h + +#include + +namespace cms { + namespace cuda { + namespace allocator { + struct TotalBytes { + size_t free; + size_t live; + size_t liveRequested; // CMS: monitor also requested amount + TotalBytes() { free = live = liveRequested = 0; } + }; + /// Map type of device ordinals to the number of cached bytes cached by each device + using GpuCachedBytes = std::map; + } // namespace allocator + + allocator::GpuCachedBytes deviceAllocatorStatus(); + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cuda/CUDACore/getCachingDeviceAllocator.h b/src/cuda/CUDACore/getCachingDeviceAllocator.h index f959a50f5..00cc0af54 100644 --- a/src/cuda/CUDACore/getCachingDeviceAllocator.h +++ b/src/cuda/CUDACore/getCachingDeviceAllocator.h @@ -1,6 +1,9 @@ #ifndef HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator #define HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator +#include +#include + #include "CUDACore/cudaCheck.h" #include "CUDACore/deviceCount.h" #include "CachingDeviceAllocator.h" @@ -9,11 +12,11 @@ namespace cms::cuda::allocator { // Use caching or not constexpr bool useCaching = true; // Growth factor (bin_growth in cub::CachingDeviceAllocator - constexpr unsigned int binGrowth = 8; + constexpr unsigned int binGrowth = 2; // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator - constexpr unsigned int minBin = 1; + constexpr unsigned int minBin = 8; // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. - constexpr unsigned int maxBin = 10; + constexpr unsigned int maxBin = 30; // Total storage for the allocator. 0 means no limit. constexpr size_t maxCachedBytes = 0; // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. 
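// Editor's sketch (not part of the patch): what the new caching-allocator bin
// settings mean in practice. A request is rounded up to the next bin boundary
// binGrowth^k, so switching from binGrowth=8, minBin=1, maxBin=10
// (8 B ... 8^10 = 1 GiB) to binGrowth=2, minBin=8, maxBin=30
// (256 B ... 2^30 = 1 GiB) keeps the same maximum size while cutting the
// worst-case over-allocation per request from 8x to 2x.
#include <cstddef>
#include <cstdio>

// round a request up to the smallest bin growth^k with minBin <= k <= maxBin;
// returns 0 when the request exceeds the largest bin (such allocations fail)
std::size_t binnedSize(std::size_t bytes, unsigned growth, unsigned minBin, unsigned maxBin) {
  std::size_t bin = 1;
  for (unsigned k = 0; k < minBin; ++k)
    bin *= growth;
  for (unsigned k = minBin; k <= maxBin; ++k, bin *= growth)
    if (bytes <= bin)
      return bin;
  return 0;
}

int main() {
  const std::size_t request = 5 * 1024 * 1024;  // a 5 MiB device allocation
  std::printf("old bins: %zu -> %zu bytes\n", request, binnedSize(request, 8, 1, 10));  // 16 MiB
  std::printf("new bins: %zu -> %zu bytes\n", request, binnedSize(request, 2, 8, 30));  // 8 MiB
}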
@@ -39,6 +42,27 @@ namespace cms::cuda::allocator { } inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() { + if (debug) { + std::cout << "cub::CachingDeviceAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + // the public interface is thread safe static notcub::CachingDeviceAllocator allocator{binGrowth, minBin, diff --git a/src/cuda/CUDACore/getCachingHostAllocator.h b/src/cuda/CUDACore/getCachingHostAllocator.h index 601374685..d6dba9a4a 100644 --- a/src/cuda/CUDACore/getCachingHostAllocator.h +++ b/src/cuda/CUDACore/getCachingHostAllocator.h @@ -1,6 +1,9 @@ #ifndef HeterogeneousCore_CUDACore_src_getCachingHostAllocator #define HeterogeneousCore_CUDACore_src_getCachingHostAllocator +#include +#include + #include "CUDACore/cudaCheck.h" #include "CachingHostAllocator.h" @@ -8,13 +11,34 @@ namespace cms::cuda::allocator { inline notcub::CachingHostAllocator& getCachingHostAllocator() { + if (debug) { + std::cout << "cub::CachingHostAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + // the public interface is thread safe static notcub::CachingHostAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; + minBin, + maxBin, + minCachedBytes(), + false, // do not skip cleanup + debug}; return allocator; } } // namespace cms::cuda::allocator diff --git a/src/cuda/CUDACore/prefixScan.h b/src/cuda/CUDACore/prefixScan.h index f74154a17..5624af03f 100644 --- a/src/cuda/CUDACore/prefixScan.h +++ b/src/cuda/CUDACore/prefixScan.h @@ -7,6 +7,7 @@ #include "CUDACore/cuda_assert.h" #ifdef __CUDA_ARCH__ + template __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { // ci and co may be the same @@ -20,10 +21,7 @@ __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __re } co[i] = x; } -#endif -//same as above may remove -#ifdef __CUDA_ARCH__ template __device__ void __forceinline__ 
warpPrefixScan(T* c, uint32_t i, uint32_t mask) { auto x = c[i]; @@ -36,134 +34,155 @@ __device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) } c[i] = x; } + #endif -// limited to 32*32 elements.... -template -__device__ __host__ void __forceinline__ blockPrefixScan(T const* __restrict__ ci, - T* __restrict__ co, - uint32_t size, - T* ws +namespace cms { + namespace cuda { + + // limited to 32*32 elements.... + template + __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci, + VT* co, + uint32_t size, + T* ws #ifndef __CUDA_ARCH__ - = nullptr + = nullptr #endif -) { + ) { #ifdef __CUDA_ARCH__ - assert(ws); - assert(size <= 1024); - assert(0 == blockDim.x % 32); - auto first = threadIdx.x; - auto mask = __ballot_sync(0xffffffff, first < size); - - for (auto i = first; i < size; i += blockDim.x) { - warpPrefixScan(ci, co, i, mask); - auto laneId = threadIdx.x & 0x1f; - auto warpId = i / 32; - assert(warpId < 32); - if (31 == laneId) - ws[warpId] = co[i]; - mask = __ballot_sync(mask, i + blockDim.x < size); - } - __syncthreads(); - if (size <= 32) - return; - if (threadIdx.x < 32) - warpPrefixScan(ws, threadIdx.x, 0xffffffff); - __syncthreads(); - for (auto i = first + 32; i < size; i += blockDim.x) { - auto warpId = i / 32; - co[i] += ws[warpId - 1]; - } - __syncthreads(); + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(ci, co, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = co[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + co[i] += ws[warpId - 1]; + } + __syncthreads(); #else - co[0] = ci[0]; - for (uint32_t i = 1; i < size; ++i) - co[i] = ci[i] + co[i - 1]; + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; #endif -} - -// same as above, may remove -// limited to 32*32 elements.... -template -__device__ __host__ void __forceinline__ blockPrefixScan(T* c, - uint32_t size, - T* ws + } + + // same as above, may remove + // limited to 32*32 elements.... 
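// Editor's note (not part of the patch): the blockPrefixScan overloads in this
// header compute an INCLUSIVE scan, co[i] = ci[0] + ... + ci[i]. On the device
// the per-warp scans are stitched together through the 32-entry workspace 'ws'
// (one partial sum per warp), which is where the 32*32 = 1024 element limit
// comes from; blockDim.x must also be a multiple of 32. The host fallback is
// the serial recurrence below, useful as a reference in unit tests.
#include <cassert>
#include <cstdint>

template <typename T>
void blockPrefixScanHostRef(T const* ci, T* co, uint32_t size) {
  assert(size > 0);
  co[0] = ci[0];
  for (uint32_t i = 1; i < size; ++i)
    co[i] = ci[i] + co[i - 1];  // inclusive scan
}

int main() {
  int in[5] = {1, 2, 3, 4, 5};
  int out[5];
  blockPrefixScanHostRef(in, out, 5);
  assert(out[0] == 1 && out[4] == 15);  // 1 3 6 10 15
}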
+ template + __host__ __device__ __forceinline__ void blockPrefixScan(T* c, + uint32_t size, + T* ws #ifndef __CUDA_ARCH__ - = nullptr + = nullptr #endif -) { + ) { #ifdef __CUDA_ARCH__ - assert(ws); - assert(size <= 1024); - assert(0 == blockDim.x % 32); - auto first = threadIdx.x; - auto mask = __ballot_sync(0xffffffff, first < size); - - for (auto i = first; i < size; i += blockDim.x) { - warpPrefixScan(c, i, mask); - auto laneId = threadIdx.x & 0x1f; - auto warpId = i / 32; - assert(warpId < 32); - if (31 == laneId) - ws[warpId] = c[i]; - mask = __ballot_sync(mask, i + blockDim.x < size); - } - __syncthreads(); - if (size <= 32) - return; - if (threadIdx.x < 32) - warpPrefixScan(ws, threadIdx.x, 0xffffffff); - __syncthreads(); - for (auto i = first + 32; i < size; i += blockDim.x) { - auto warpId = i / 32; - c[i] += ws[warpId - 1]; - } - __syncthreads(); + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(c, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + __syncthreads(); #else - for (uint32_t i = 1; i < size; ++i) - c[i] += c[i - 1]; + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; #endif -} - -// limited to 1024*1024 elements.... -template -__global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { - __shared__ T ws[32]; - // first each block does a scan of size 1024; (better be enough blocks....) - assert(1024 * gridDim.x >= size); - int off = 1024 * blockIdx.x; - if (size - off > 0) - blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); - - // count blocks that finished - __shared__ bool isLastBlockDone; - if (0 == threadIdx.x) { - auto value = atomicAdd(pc, 1); // block counter - isLastBlockDone = (value == (int(gridDim.x) - 1)); - } + } - __syncthreads(); - - if (!isLastBlockDone) - return; - - // good each block has done its work and now we are left in last block +#ifdef __CUDA_ARCH__ + // see https://stackoverflow.com/questions/40021086/can-i-obtain-the-amount-of-allocated-dynamic-shared-memory-from-within-a-kernel/40021087#40021087 + __device__ __forceinline__ unsigned dynamic_smem_size() { + unsigned ret; + asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret)); + return ret; + } +#endif - // let's get the partial sums from each block - __shared__ T psum[1024]; - for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { - auto j = 1024 * i + 1023; - psum[i] = (j < size) ? co[j] : T(0); - } - __syncthreads(); - blockPrefixScan(psum, psum, gridDim.x, ws); - - // now it would have been handy to have the other blocks around... - int first = threadIdx.x; // + blockDim.x * blockIdx.x - for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { - auto k = i / 1024; // block - co[i] += psum[k - 1]; - } -} + // in principle not limited.... 
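// Editor's sketch (not part of the patch): how the reworked multiBlockPrefixScan
// below is meant to be launched. The per-block partial sums (psum) now live in
// dynamic shared memory, one T per block, so the launch must pass
// sizeof(T) * blocks as the third <<<>>> parameter (that is what the
// dynamic_smem_size() assert checks), and the block counter *pc must be a
// zero-initialized device int. The block count is still bounded by
// blockPrefixScan's 1024-element limit, since the last block scans psum with it.
// Buffer names and sizes here are made up for the example.
#include <cuda_runtime.h>

#include "CUDACore/cudaCheck.h"
#include "CUDACore/prefixScan.h"

void inclusiveScan(int32_t const* d_in, int32_t* d_out, int32_t size, cudaStream_t stream) {
  const int threads = 256;
  const int blocks = (size + threads - 1) / threads;  // one element per thread; keep blocks <= 1024
  int32_t* d_counter;
  cudaCheck(cudaMalloc(&d_counter, sizeof(int32_t)));
  cudaCheck(cudaMemsetAsync(d_counter, 0, sizeof(int32_t), stream));
  // third launch parameter: dynamic shared memory for the per-block partial sums
  cms::cuda::multiBlockPrefixScan<int32_t>
      <<<blocks, threads, sizeof(int32_t) * blocks, stream>>>(d_in, d_out, size, d_counter);
  cudaCheck(cudaGetLastError());
  cudaCheck(cudaStreamSynchronize(stream));
  cudaCheck(cudaFree(d_counter));
}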
+ template + __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) { + volatile T const* ci = ici; + volatile T* co = ico; + __shared__ T ws[32]; +#ifdef __CUDA_ARCH__ + assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below +#endif + assert(blockDim.x * gridDim.x >= size); + // first each block does a scan + int off = blockDim.x * blockIdx.x; + if (size - off > 0) + blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws); + + // count blocks that finished + __shared__ bool isLastBlockDone; + if (0 == threadIdx.x) { + __threadfence(); + auto value = atomicAdd(pc, 1); // block counter + isLastBlockDone = (value == (int(gridDim.x) - 1)); + } + + __syncthreads(); + + if (!isLastBlockDone) + return; + + assert(int(gridDim.x) == *pc); + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block + extern __shared__ T psum[]; + for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { + auto j = blockDim.x * i + blockDim.x - 1; + psum[i] = (j < size) ? co[j] : T(0); + } + __syncthreads(); + blockPrefixScan(psum, psum, gridDim.x, ws); + + // now it would have been handy to have the other blocks around... + for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) { + co[i] += psum[k]; + } + } + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h diff --git a/src/cuda/CUDACore/radixSort.h b/src/cuda/CUDACore/radixSort.h index 859ab42f2..ff1da2d46 100644 --- a/src/cuda/CUDACore/radixSort.h +++ b/src/cuda/CUDACore/radixSort.h @@ -1,7 +1,10 @@ #ifndef HeterogeneousCoreCUDAUtilities_radixSort_H #define HeterogeneousCoreCUDAUtilities_radixSort_H +#ifdef __CUDACC__ + #include +#include #include "CUDACore/cuda_assert.h" @@ -79,8 +82,8 @@ __device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, u template -__device__ void __forceinline__ -radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { +__device__ __forceinline__ void radixSortImpl( + T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { constexpr int d = 8, w = 8 * sizeof(T); constexpr int sb = 1 << d; constexpr int ps = int(sizeof(T)) - NS; @@ -217,28 +220,30 @@ radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t s template ::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { radixSortImpl(a, ind, ind2, size, dummyReorder); } template ::value && std::is_signed::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { radixSortImpl(a, ind, ind2, size, reorderSigned); } template ::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { using I = int; radixSortImpl((I const*)(a), ind, ind2, size, reorderFloat); } template -__device__ void __forceinline__ -radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { +__device__ __forceinline__ void 
radixSortMulti(T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace) { extern __shared__ uint16_t ws[]; auto a = v + offsets[blockIdx.x]; @@ -250,17 +255,23 @@ radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* w radixSort(a, ind, ind2, size); } -template -__global__ void __launch_bounds__(256, 4) - radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { - radixSortMulti(v, index, offsets, workspace); -} +namespace cms { + namespace cuda { -template -__global__ void -// __launch_bounds__(256, 4) -radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { - radixSortMulti(v, index, offsets, workspace); -} + template + __global__ void __launch_bounds__(256, 4) + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + template + __global__ void radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + } // namespace cuda +} // namespace cms + +#endif // __CUDACC__ #endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/src/cuda/CUDACore/requireDevices.cc b/src/cuda/CUDACore/requireDevices.cc new file mode 100644 index 000000000..86eb408ff --- /dev/null +++ b/src/cuda/CUDACore/requireDevices.cc @@ -0,0 +1,30 @@ +#include +#include + +#include + +#include "CUDACore/requireDevices.h" + +namespace cms::cudatest { + bool testDevices() { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess) { + std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." + << "\n"; + return false; + } + if (devices == 0) { + std::cerr << "No CUDA devices available, the test will be skipped." + << "\n"; + return false; + } + return true; + } + + void requireDevices() { + if (not testDevices()) { + exit(EXIT_SUCCESS); + } + } +} // namespace cms::cudatest diff --git a/src/cuda/CUDACore/requireDevices.h b/src/cuda/CUDACore/requireDevices.h new file mode 100644 index 000000000..0795175b3 --- /dev/null +++ b/src/cuda/CUDACore/requireDevices.h @@ -0,0 +1,17 @@ +#ifndef HeterogeneousCore_CUDAUtilities_requireDevices_h +#define HeterogeneousCore_CUDAUtilities_requireDevices_h + +/** + * These functions are meant to be called only from unit tests. 
+ */ +namespace cms { + namespace cudatest { + /// In presence of CUDA devices, return true; otherwise print message and return false + bool testDevices(); + + /// Print message and exit if there are no CUDA devices + void requireDevices(); + } // namespace cudatest +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_requireDevices_h diff --git a/src/cuda/CUDADataFormats/BeamSpotCUDA.cc b/src/cuda/CUDADataFormats/BeamSpotCUDA.cc deleted file mode 100644 index 3d146947c..000000000 --- a/src/cuda/CUDADataFormats/BeamSpotCUDA.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include "CUDADataFormats/BeamSpotCUDA.h" - -#include "CUDACore/cudaCheck.h" -#include "CUDACore/device_unique_ptr.h" - -BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) { - data_d_ = cms::cuda::make_device_unique(stream); - cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream)); -} diff --git a/src/cuda/CUDADataFormats/BeamSpotCUDA.h b/src/cuda/CUDADataFormats/BeamSpotCUDA.h index e529f6eeb..a090ef347 100644 --- a/src/cuda/CUDADataFormats/BeamSpotCUDA.h +++ b/src/cuda/CUDADataFormats/BeamSpotCUDA.h @@ -1,32 +1,33 @@ #ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h #define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h -#include "CUDACore/device_unique_ptr.h" - #include +#include "DataFormats/BeamSpotPOD.h" +#include "CUDACore/device_unique_ptr.h" + class BeamSpotCUDA { public: - // alignas(128) doesn't really make sense as there is only one - // beamspot per event? - struct Data { - float x, y, z; // position - // TODO: add covariance matrix - - float sigmaZ; - float beamWidthX, beamWidthY; - float dxdz, dydz; - float emittanceX, emittanceY; - float betaStar; - }; - + // default constructor, required by cms::cuda::Product BeamSpotCUDA() = default; - BeamSpotCUDA(Data const* data_h, cudaStream_t stream); - Data const* data() const { return data_d_.get(); } + // constructor that allocates cached device memory on the given CUDA stream + BeamSpotCUDA(cudaStream_t stream) { data_d_ = cms::cuda::make_device_unique(stream); } + + // movable, non-copiable + BeamSpotCUDA(BeamSpotCUDA const&) = delete; + BeamSpotCUDA(BeamSpotCUDA&&) = default; + BeamSpotCUDA& operator=(BeamSpotCUDA const&) = delete; + BeamSpotCUDA& operator=(BeamSpotCUDA&&) = default; + + BeamSpotPOD* data() { return data_d_.get(); } + BeamSpotPOD const* data() const { return data_d_.get(); } + + cms::cuda::device::unique_ptr& ptr() { return data_d_; } + cms::cuda::device::unique_ptr const& ptr() const { return data_d_; } private: - cms::cuda::device::unique_ptr data_d_; + cms::cuda::device::unique_ptr data_d_; }; -#endif +#endif // CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h diff --git a/src/cuda/CUDADataFormats/HeterogeneousSoA.h b/src/cuda/CUDADataFormats/HeterogeneousSoA.h index 1134cd5ff..cfaad449c 100644 --- a/src/cuda/CUDADataFormats/HeterogeneousSoA.h +++ b/src/cuda/CUDADataFormats/HeterogeneousSoA.h @@ -27,13 +27,13 @@ class HeterogeneousSoA { auto const &operator*() const { return *get(); } - auto const *operator-> () const { return get(); } + auto const *operator->() const { return get(); } auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } auto &operator*() { return *get(); } - auto *operator-> () { return get(); } + auto *operator->() { return get(); } // in reality valid only for GPU version... cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const { @@ -50,94 +50,96 @@ class HeterogeneousSoA { std::unique_ptr std_ptr; //! 
}; -namespace cudaCompat { - - struct GPUTraits { - template - using unique_ptr = cms::cuda::device::unique_ptr; - - template - static auto make_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); - } - - template - static auto make_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); - } - - template - static auto make_host_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); - } - - template - static auto make_device_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); - } - - template - static auto make_device_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); - } - }; - - struct HostTraits { - template - using unique_ptr = cms::cuda::host::unique_ptr; - - template - static auto make_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); - } - - template - static auto make_host_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); - } - - template - static auto make_device_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); - } - - template - static auto make_device_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); - } - }; - - struct CPUTraits { - template - using unique_ptr = std::unique_ptr; - - template - static auto make_unique(cudaStream_t) { - return std::make_unique(); - } - - template - static auto make_unique(size_t size, cudaStream_t) { - return std::make_unique(size); - } - - template - static auto make_host_unique(cudaStream_t) { - return std::make_unique(); - } - - template - static auto make_device_unique(cudaStream_t) { - return std::make_unique(); - } - - template - static auto make_device_unique(size_t size, cudaStream_t) { - return std::make_unique(size); - } - }; - -} // namespace cudaCompat +namespace cms { + namespace cudacompat { + + struct GPUTraits { + template + using unique_ptr = cms::cuda::device::unique_ptr; + + template + static auto make_unique(cudaStream_t stream) { + return cms::cuda::make_device_unique(stream); + } + + template + static auto make_unique(size_t size, cudaStream_t stream) { + return cms::cuda::make_device_unique(size, stream); + } + + template + static auto make_host_unique(cudaStream_t stream) { + return cms::cuda::make_host_unique(stream); + } + + template + static auto make_device_unique(cudaStream_t stream) { + return cms::cuda::make_device_unique(stream); + } + + template + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cms::cuda::make_device_unique(size, stream); + } + }; + + struct HostTraits { + template + using unique_ptr = cms::cuda::host::unique_ptr; + + template + static auto make_unique(cudaStream_t stream) { + return cms::cuda::make_host_unique(stream); + } + + template + static auto make_host_unique(cudaStream_t stream) { + return cms::cuda::make_host_unique(stream); + } + + template + static auto make_device_unique(cudaStream_t stream) { + return cms::cuda::make_device_unique(stream); + } + + template + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cms::cuda::make_device_unique(size, stream); + } + }; + + struct CPUTraits { + template + using unique_ptr = std::unique_ptr; + + template + static auto make_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_unique(size_t size, cudaStream_t) { + return std::make_unique(size); + } + + template + static auto 
make_host_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_device_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_device_unique(size_t size, cudaStream_t) { + return std::make_unique(size); + } + }; + + } // namespace cudacompat +} // namespace cms // a heterogeneous unique pointer (of a different sort) ... template @@ -178,10 +180,10 @@ cms::cuda::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cuda } template -using HeterogeneousSoAGPU = HeterogeneousSoAImpl; +using HeterogeneousSoAGPU = HeterogeneousSoAImpl; template -using HeterogeneousSoACPU = HeterogeneousSoAImpl; +using HeterogeneousSoACPU = HeterogeneousSoAImpl; template -using HeterogeneousSoAHost = HeterogeneousSoAImpl; +using HeterogeneousSoAHost = HeterogeneousSoAImpl; #endif diff --git a/src/cuda/CUDADataFormats/PixelTrackHeterogeneous.h b/src/cuda/CUDADataFormats/PixelTrackHeterogeneous.h index 157035577..579c67092 100644 --- a/src/cuda/CUDADataFormats/PixelTrackHeterogeneous.h +++ b/src/cuda/CUDADataFormats/PixelTrackHeterogeneous.h @@ -17,7 +17,7 @@ class TrackSoAT { using Quality = trackQuality::Quality; using hindex_type = uint16_t; - using HitContainer = OneToManyAssoc; + using HitContainer = cms::cuda::OneToManyAssoc; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... diff --git a/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.cc b/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.cc index c667a90ea..b19664874 100644 --- a/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.cc +++ b/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.cc @@ -9,13 +9,13 @@ SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream) : formatterErrors_h(std::move(errors)) { - error_d = cms::cuda::make_device_unique>(stream); + error_d = cms::cuda::make_device_unique>(stream); data_d = cms::cuda::make_device_unique(maxFedWords, stream); cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream); - error_h = cms::cuda::make_host_unique>(stream); - GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); + error_h = cms::cuda::make_host_unique>(stream); + cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); assert(error_h->empty()); assert(error_h->capacity() == static_cast(maxFedWords)); @@ -38,5 +38,5 @@ SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync } auto err = *error_h; err.set_data(data.get()); - return HostDataError(std::move(err), std::move(data)); + return HostDataError(err, std::move(data)); } diff --git a/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.h b/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.h index 46fec5688..9c7c874ee 100644 --- a/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.h +++ b/src/cuda/CUDADataFormats/SiPixelDigiErrorsCUDA.h @@ -1,12 +1,12 @@ #ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h -#include "DataFormats/PixelErrors.h" +#include + +#include "CUDACore/SimpleVector.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -#include "CUDACore/GPUSimpleVector.h" - -#include +#include "DataFormats/PixelErrors.h" class SiPixelDigiErrorsCUDA { public: @@ -21,20 +21,20 @@ class SiPixelDigiErrorsCUDA { const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } - GPU::SimpleVector* error() { return error_d.get(); } - GPU::SimpleVector const* error() const { 
return error_d.get(); } - GPU::SimpleVector const* c_error() const { return error_d.get(); } + cms::cuda::SimpleVector* error() { return error_d.get(); } + cms::cuda::SimpleVector const* error() const { return error_d.get(); } + cms::cuda::SimpleVector const* c_error() const { return error_d.get(); } using HostDataError = - std::pair, cms::cuda::host::unique_ptr>; + std::pair, cms::cuda::host::unique_ptr>; HostDataError dataErrorToHostAsync(cudaStream_t stream) const; void copyErrorToHostAsync(cudaStream_t stream); private: cms::cuda::device::unique_ptr data_d; - cms::cuda::device::unique_ptr> error_d; - cms::cuda::host::unique_ptr> error_h; + cms::cuda::device::unique_ptr> error_d; + cms::cuda::host::unique_ptr> error_h; PixelFormatterErrors formatterErrors_h; }; diff --git a/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h b/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h index 0eeea4eea..2320fa6d6 100644 --- a/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h +++ b/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h @@ -93,7 +93,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH #ifndef __CUDACC__ constexpr #endif - (std::is_same::value) { + (std::is_same::value) { cms::cuda::copyAsync(m_view, view, stream); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version @@ -140,16 +140,16 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH #ifndef __CUDACC__ constexpr #endif - (std::is_same::value) { + (std::is_same::value) { cms::cuda::copyAsync(m_view, view, stream); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version } } -using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; #endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h diff --git a/src/cuda/CUDADataFormats/TrackingRecHit2DSOAView.h b/src/cuda/CUDADataFormats/TrackingRecHit2DSOAView.h index 166374e41..faaa4378c 100644 --- a/src/cuda/CUDADataFormats/TrackingRecHit2DSOAView.h +++ b/src/cuda/CUDADataFormats/TrackingRecHit2DSOAView.h @@ -17,7 +17,8 @@ class TrackingRecHit2DSOAView { static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; } using hindex_type = uint16_t; // if above is <=2^16 - using Hist = HistoContainer; + using Hist = + cms::cuda::HistoContainer; using AverageGeometry = phase1PixelTopology::AverageGeometry; diff --git a/src/cuda/CUDADataFormats/ZVertexSoA.h b/src/cuda/CUDADataFormats/ZVertexSoA.h index ae4cba4e7..ecdf76d8e 100644 --- a/src/cuda/CUDADataFormats/ZVertexSoA.h +++ b/src/cuda/CUDADataFormats/ZVertexSoA.h @@ -8,7 +8,7 @@ // These vertices are clusterized and fitted only along the beam line (z) // to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well) struct ZVertexSoA { - static constexpr uint32_t MAXTRACKS = 16 * 1024; + static constexpr uint32_t MAXTRACKS = 32 * 1024; static constexpr uint32_t MAXVTX = 1024; int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) diff --git 
a/src/cuda/CondFormats/PixelCPEFast.h b/src/cuda/CondFormats/PixelCPEFast.h index 16e8f2b68..eb0f21c28 100644 --- a/src/cuda/CondFormats/PixelCPEFast.h +++ b/src/cuda/CondFormats/PixelCPEFast.h @@ -4,7 +4,7 @@ #include #include "CUDACore/ESProduct.h" -#include "CUDACore/CUDAHostAllocator.h" +#include "CUDACore/HostAllocator.h" #include "CondFormats/pixelCPEforGPU.h" class PixelCPEFast { @@ -22,7 +22,7 @@ class PixelCPEFast { private: // allocate it with posix malloc to be ocmpatible with cpu wf std::vector m_detParamsGPU; - // std::vector> m_detParamsGPU; + // std::vector> m_detParamsGPU; pixelCPEforGPU::CommonParams m_commonParamsGPU; pixelCPEforGPU::LayerGeometry m_layerGeometry; pixelCPEforGPU::AverageGeometry m_averageGeometry; diff --git a/src/cuda/CondFormats/SiPixelFedCablingMapGPUWrapper.h b/src/cuda/CondFormats/SiPixelFedCablingMapGPUWrapper.h index 85477e87a..027e7d25c 100644 --- a/src/cuda/CondFormats/SiPixelFedCablingMapGPUWrapper.h +++ b/src/cuda/CondFormats/SiPixelFedCablingMapGPUWrapper.h @@ -2,7 +2,7 @@ #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #include "CUDACore/ESProduct.h" -#include "CUDACore/CUDAHostAllocator.h" +#include "CUDACore/HostAllocator.h" #include "CUDACore/device_unique_ptr.h" #include "CondFormats/SiPixelFedCablingMapGPU.h" @@ -25,7 +25,7 @@ class SiPixelFedCablingMapGPUWrapper { const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const; private: - std::vector> modToUnpDefault; + std::vector> modToUnpDefault; bool hasQuality_; SiPixelFedCablingMapGPU *cablingMapHost = nullptr; // pointer to struct in CPU diff --git a/src/cuda/CondFormats/SiPixelGainForHLTonGPU.h b/src/cuda/CondFormats/SiPixelGainForHLTonGPU.h index ee7cfd53a..5bcdc7a66 100644 --- a/src/cuda/CondFormats/SiPixelGainForHLTonGPU.h +++ b/src/cuda/CondFormats/SiPixelGainForHLTonGPU.h @@ -5,6 +5,17 @@ #include #include +// including would pull in the dependency on all of CUDA; +// instead, just define away the CUDA specific attributes to keep GCC happy. +#ifndef __CUDACC__ +#ifndef __host__ +#define __host__ +#endif // __host__ +#ifndef __device__ +#define __device__ +#endif // __device__ +#endif // __CUDACC__ + #include "CUDACore/cuda_assert.h" struct SiPixelGainForHLTonGPU_DecodingStructure { diff --git a/src/cuda/CondFormats/pixelCPEforGPU.h b/src/cuda/CondFormats/pixelCPEforGPU.h index 936ff3e1c..c48c05750 100644 --- a/src/cuda/CondFormats/pixelCPEforGPU.h +++ b/src/cuda/CondFormats/pixelCPEforGPU.h @@ -10,7 +10,6 @@ #include "DataFormats/SOARotation.h" #include "Geometry/phase1PixelTopology.h" #include "CUDACore/cudaCompat.h" -#include "CUDACore/cuda_cxx17.h" namespace pixelCPEforGPU { @@ -36,6 +35,9 @@ namespace pixelCPEforGPU { float shiftY; float chargeWidthX; float chargeWidthY; + // CMSSW 11.2.x adds + //uint16_t pixmx; // max pix charge + // which would break reading the binary dumps float x0, y0, z0; // the vertex in the local coord of the detector diff --git a/src/cuda/DataFormats/BeamSpotPOD.h b/src/cuda/DataFormats/BeamSpotPOD.h new file mode 100644 index 000000000..7b4c7c241 --- /dev/null +++ b/src/cuda/DataFormats/BeamSpotPOD.h @@ -0,0 +1,21 @@ +#ifndef DataFormats_BeamSpot_interface_BeamSpotPOD_h +#define DataFormats_BeamSpot_interface_BeamSpotPOD_h + +// This struct is a transient-only, simplified representation of the beamspot +// data used as the underlying type for data transfers and operations in +// heterogeneous code (e.g. in CUDA code). + +// The covariance matrix is not used in that code, so is left out here. 
+ +// CMSSW 11.2.x uses alignas(128) to align to the CUDA L1 cache line size +// here it would break reading the beamspot data from the binary dumps +struct BeamSpotPOD { + float x, y, z; // position + float sigmaZ; + float beamWidthX, beamWidthY; + float dxdz, dydz; + float emittanceX, emittanceY; + float betaStar; +}; + +#endif // DataFormats_BeamSpot_interface_BeamSpotPOD_h diff --git a/src/cuda/DataFormats/approx_atan2.h b/src/cuda/DataFormats/approx_atan2.h index 729aa4536..4508a35cc 100644 --- a/src/cuda/DataFormats/approx_atan2.h +++ b/src/cuda/DataFormats/approx_atan2.h @@ -121,7 +121,7 @@ constexpr float unsafe_atan2f(float y, float x) { template constexpr float safe_atan2f(float y, float x) { - return unsafe_atan2f_impl(y, (y == 0.f) & (x == 0.f) ? 0.2f : x); + return unsafe_atan2f_impl(y, ((y == 0.f) & (x == 0.f)) ? 0.2f : x); // return (y==0.f)&(x==0.f) ? 0.f : unsafe_atan2f_impl( y, x); } diff --git a/src/cuda/Makefile.deps b/src/cuda/Makefile.deps index 2acf6dffd..a8e6e5798 100644 --- a/src/cuda/Makefile.deps +++ b/src/cuda/Makefile.deps @@ -1,4 +1,4 @@ -cuda_EXTERNAL_DEPENDS := TBB CUDA CUB EIGEN +cuda_EXTERNAL_DEPENDS := TBB CUDA EIGEN BeamSpotProducer_DEPENDS := Framework CUDACore CUDADataFormats CUDACore_DEPENDS := Framework CUDADataFormats_DEPENDS := CUDACore DataFormats diff --git a/src/cuda/plugin-BeamSpotProducer/BeamSpotESProducer.cc b/src/cuda/plugin-BeamSpotProducer/BeamSpotESProducer.cc index 50208dc5d..3c7f1a1f0 100644 --- a/src/cuda/plugin-BeamSpotProducer/BeamSpotESProducer.cc +++ b/src/cuda/plugin-BeamSpotProducer/BeamSpotESProducer.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/BeamSpotCUDA.h" +#include "DataFormats/BeamSpotPOD.h" #include "Framework/ESProducer.h" #include "Framework/EventSetup.h" #include "Framework/ESPluginFactory.h" @@ -17,11 +17,11 @@ class BeamSpotESProducer : public edm::ESProducer { }; void BeamSpotESProducer::produce(edm::EventSetup& eventSetup) { - auto bs = std::make_unique(); + auto bs = std::make_unique(); std::ifstream in(data_ / "beamspot.bin", std::ios::binary); in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); - in.read(reinterpret_cast(bs.get()), sizeof(BeamSpotCUDA::Data)); + in.read(reinterpret_cast(bs.get()), sizeof(BeamSpotPOD)); eventSetup.put(std::move(bs)); } diff --git a/src/cuda/plugin-BeamSpotProducer/BeamSpotToCUDA.cc b/src/cuda/plugin-BeamSpotProducer/BeamSpotToCUDA.cc index df4331be6..48badcabf 100644 --- a/src/cuda/plugin-BeamSpotProducer/BeamSpotToCUDA.cc +++ b/src/cuda/plugin-BeamSpotProducer/BeamSpotToCUDA.cc @@ -1,15 +1,17 @@ +#include + +#include + #include "CUDACore/Product.h" +#include "CUDACore/ScopedContext.h" +#include "CUDACore/copyAsync.h" +#include "CUDACore/host_noncached_unique_ptr.h" #include "CUDADataFormats/BeamSpotCUDA.h" -#include "Framework/EventSetup.h" +#include "DataFormats/BeamSpotPOD.h" +#include "Framework/EDProducer.h" #include "Framework/Event.h" +#include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" -#include "CUDACore/ScopedContext.h" -#include "CUDACore/host_noncached_unique_ptr.h" - -#include - -#include class BeamSpotToCUDA : public edm::EDProducer { public: @@ -19,21 +21,24 @@ class BeamSpotToCUDA : public edm::EDProducer { void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; private: - edm::EDPutTokenT> bsPutToken_; + const edm::EDPutTokenT> bsPutToken_; - cms::cuda::host::noncached::unique_ptr bsHost; + cms::cuda::host::noncached::unique_ptr bsHost; }; 
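// Editor's note (not part of the patch): why the staging buffer above is a host
// *noncached* pointer. cudaHostAllocWriteCombined returns page-locked memory
// that bypasses the CPU caches: CPU writes and device DMA reads are fast, but
// CPU reads are very slow, so it only suits write-once host-to-device staging
// like this per-module beamspot buffer. A minimal raw-CUDA equivalent of the
// pattern (the Payload type and names are hypothetical):
#include <cuda_runtime.h>

struct Payload {
  float x, y, z;
};

void stageToDevice(Payload const& src, Payload* d_dst, cudaStream_t stream) {
  Payload* h_staging = nullptr;  // in the plugin this buffer lives for the module's lifetime
  cudaHostAlloc(reinterpret_cast<void**>(&h_staging), sizeof(Payload), cudaHostAllocWriteCombined);
  *h_staging = src;  // write-only CPU access: the fast path for write-combined memory
  cudaMemcpyAsync(d_dst, h_staging, sizeof(Payload), cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);  // the plugin stays asynchronous; synchronizing keeps this sketch safe
  cudaFreeHost(h_staging);
}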
BeamSpotToCUDA::BeamSpotToCUDA(edm::ProductRegistry& reg) : bsPutToken_{reg.produces>()}, - bsHost{cms::cuda::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} + bsHost{cms::cuda::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - *bsHost = iSetup.get(); + *bsHost = iSetup.get(); cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; - ctx.emplace(iEvent, bsPutToken_, bsHost.get(), ctx.stream()); + BeamSpotCUDA bsDevice(ctx.stream()); + cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream()); + + ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice)); } DEFINE_FWK_MODULE(BeamSpotToCUDA); diff --git a/src/cuda/plugin-PixelTriplets/BrokenLine.h b/src/cuda/plugin-PixelTriplets/BrokenLine.h index c3177f021..0a4b5f28f 100644 --- a/src/cuda/plugin-PixelTriplets/BrokenLine.h +++ b/src/cuda/plugin-PixelTriplets/BrokenLine.h @@ -7,8 +7,6 @@ namespace BrokenLine { - using namespace Rfit; - //!< Karimäki's parameters: (phi, d, k=1/R) /*!< covariance matrix: \n |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n @@ -22,12 +20,13 @@ namespace BrokenLine { */ template struct PreparedBrokenLineData { - int q; //!< particle charge - Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin - VectorNd s; //!< total distance traveled in the transverse plane starting from the pre-fitted closest approach - VectorNd S; //!< total distance traveled (three-dimensional) - VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane - VectorNd VarBeta; //!< kink angles in the SZ plane + int q; //!< particle charge + Rfit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + Rfit::VectorNd s; //!< total distance traveled in the transverse plane + // starting from the pre-fitted closest approach + Rfit::VectorNd S; //!< total distance traveled (three-dimensional) + Rfit::VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane + Rfit::VectorNd VarBeta; //!< kink angles in the SZ plane }; /*! @@ -55,9 +54,9 @@ namespace BrokenLine { //XX_0*=1; constexpr double geometry_factor = 0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned - constexpr double fact = geometry_factor * sqr(13.6 / 1000.); - return fact / (pt2 * (1. + sqr(slope))) * (std::abs(length) * XXI_0) * - sqr(1. + 0.038 * log(std::abs(length) * XXI_0)); + constexpr double fact = geometry_factor * Rfit::sqr(13.6 / 1000.); + return fact / (pt2 * (1. + Rfit::sqr(slope))) * (std::abs(length) * XXI_0) * + Rfit::sqr(1. + 0.038 * log(std::abs(length) * XXI_0)); } /*! @@ -67,9 +66,9 @@ namespace BrokenLine { \return 2D rotation matrix. */ - __host__ __device__ inline Matrix2d RotationMatrix(double slope) { - Matrix2d Rot; - Rot(0, 0) = 1. / sqrt(1. + sqr(slope)); + __host__ __device__ inline Rfit::Matrix2d RotationMatrix(double slope) { + Rfit::Matrix2d Rot; + Rot(0, 0) = 1. / sqrt(1. + Rfit::sqr(slope)); Rot(0, 1) = slope * Rot(0, 0); Rot(1, 0) = -Rot(0, 1); Rot(1, 1) = Rot(0, 0); @@ -82,31 +81,34 @@ namespace BrokenLine { \param circle circle fit in the old coordinate system. \param x0 x coordinate of the translation vector. \param y0 y coordinate of the translation vector. - \param Jacob passed by reference in order to save stack. + \param jacobian passed by reference in order to save stack. 
*/ - __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, double x0, double y0, Matrix3d& Jacob) { + __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, + double x0, + double y0, + Rfit::Matrix3d& jacobian) { double A, U, BB, C, DO, DP, uu, xi, v, mu, lambda, zeta; DP = x0 * cos(circle.par(0)) + y0 * sin(circle.par(0)); DO = x0 * sin(circle.par(0)) - y0 * cos(circle.par(0)) + circle.par(1); uu = 1 + circle.par(2) * circle.par(1); C = -circle.par(2) * y0 + uu * cos(circle.par(0)); BB = circle.par(2) * x0 + uu * sin(circle.par(0)); - A = 2. * DO + circle.par(2) * (sqr(DO) + sqr(DP)); + A = 2. * DO + circle.par(2) * (Rfit::sqr(DO) + Rfit::sqr(DP)); U = sqrt(1. + circle.par(2) * A); - xi = 1. / (sqr(BB) + sqr(C)); + xi = 1. / (Rfit::sqr(BB) + Rfit::sqr(C)); v = 1. + circle.par(2) * DO; - lambda = (0.5 * A) / (U * sqr(1. + U)); + lambda = (0.5 * A) / (U * Rfit::sqr(1. + U)); mu = 1. / (U * (1. + U)) + circle.par(2) * lambda; - zeta = sqr(DO) + sqr(DP); + zeta = Rfit::sqr(DO) + Rfit::sqr(DP); - Jacob << xi * uu * v, -xi * sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, + jacobian << xi * uu * v, -xi * Rfit::sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, mu * zeta - lambda * A, 0, 0, 1.; circle.par(0) = atan2(BB, C); circle.par(1) = A / (1 + U); // circle.par(2)=circle.par(2); - circle.cov = Jacob * circle.cov * Jacob.transpose(); + circle.cov = jacobian * circle.cov * jacobian.transpose(); } /*! @@ -125,28 +127,28 @@ namespace BrokenLine { PreparedBrokenLineData& results) { constexpr auto n = N; u_int i; - Vector2d d; - Vector2d e; + Rfit::Vector2d d; + Rfit::Vector2d e; d = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); e = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); - results.q = cross2D(d, e) > 0 ? -1 : 1; + results.q = Rfit::cross2D(d, e) > 0 ? -1 : 1; const double slope = -results.q / fast_fit(3); - Matrix2d R = RotationMatrix(slope); + Rfit::Matrix2d R = RotationMatrix(slope); // calculate radii and s - results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * MatrixXd::Constant(1, n, 1); + results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * Rfit::MatrixXd::Constant(1, n, 1); e = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); for (i = 0; i < n; i++) { d = results.radii.block(0, i, 2, 1); - results.s(i) = results.q * fast_fit(2) * atan2(cross2D(d, e), d.dot(e)); // calculates the arc length + results.s(i) = results.q * fast_fit(2) * atan2(Rfit::cross2D(d, e), d.dot(e)); // calculates the arc length } - VectorNd z = hits.block(2, 0, 1, n).transpose(); + Rfit::VectorNd z = hits.block(2, 0, 1, n).transpose(); //calculate S and Z - Matrix2xNd pointsSZ = Matrix2xNd::Zero(); + Rfit::Matrix2xNd pointsSZ = Rfit::Matrix2xNd::Zero(); for (i = 0; i < n; i++) { pointsSZ(0, i) = results.s(i); pointsSZ(1, i) = z(i); @@ -173,21 +175,21 @@ namespace BrokenLine { \return the n-by-n matrix of the linear system */ template - __host__ __device__ inline MatrixNd MatrixC_u(const VectorNd& w, - const VectorNd& S, - const VectorNd& VarBeta) { + __host__ __device__ inline Rfit::MatrixNd MatrixC_u(const Rfit::VectorNd& w, + const Rfit::VectorNd& S, + const Rfit::VectorNd& VarBeta) { constexpr u_int n = N; u_int i; - MatrixNd C_U = MatrixNd::Zero(); + Rfit::MatrixNd C_U = Rfit::MatrixNd::Zero(); for (i = 0; i < n; i++) { C_U(i, i) = w(i); if (i > 1) - C_U(i, i) += 1. / (VarBeta(i - 1) * sqr(S(i) - S(i - 1))); + C_U(i, i) += 1. 
/ (VarBeta(i - 1) * Rfit::sqr(S(i) - S(i - 1))); if (i > 0 && i < n - 1) - C_U(i, i) += (1. / VarBeta(i)) * sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + C_U(i, i) += (1. / VarBeta(i)) * Rfit::sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); if (i < n - 2) - C_U(i, i) += 1. / (VarBeta(i + 1) * sqr(S(i + 1) - S(i))); + C_U(i, i) += 1. / (VarBeta(i + 1) * Rfit::sqr(S(i + 1) - S(i))); if (i > 0 && i < n - 1) C_U(i, i + 1) = @@ -219,22 +221,22 @@ namespace BrokenLine { constexpr uint32_t N = M3xN::ColsAtCompileTime; constexpr auto n = N; // get the number of hits - const Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); - const Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); - const Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); + const Rfit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Rfit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); + const Rfit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); - auto tmp = 0.5 / cross2D(c, a); + auto tmp = 0.5 / Rfit::cross2D(c, a); result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; // check Wikipedia for these formulas - result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(cross2D(b, a))); + result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(Rfit::cross2D(b, a))); // Using Math Olympiad's formula R=abc/(4A) - const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); - const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + const Rfit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const Rfit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - result(3) = result(2) * atan2(cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); + result(3) = result(2) * atan2(Rfit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); // ds/dz slope between last and first point } @@ -272,15 +274,15 @@ namespace BrokenLine { auto& Z = data.Z; auto& VarBeta = data.VarBeta; const double slope = -circle_results.q / fast_fit(3); - VarBeta *= 1. + sqr(slope); // the kink angles are projected! + VarBeta *= 1. + Rfit::sqr(slope); // the kink angles are projected! for (i = 0; i < n; i++) { Z(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); } - Matrix2d V; // covariance matrix - VectorNd w; // weights - Matrix2d RR; // rotation matrix point by point + Rfit::Matrix2d V; // covariance matrix + Rfit::VectorNd w; // weights + Rfit::Matrix2d RR; // rotation matrix point by point //double Slope; // slope of the circle point by point for (i = 0; i < n; i++) { V(0, 0) = hits_ge.col(i)[0]; // x errors @@ -291,13 +293,13 @@ namespace BrokenLine { w(i) = 1. / ((RR * V * RR.transpose())(1, 1)); // compute the orthogonal weight point by point } - VectorNplusONEd r_u; + Rfit::VectorNplusONEd r_u; r_u(n) = 0; for (i = 0; i < n; i++) { r_u(i) = w(i) * Z(i); } - MatrixNplusONEd C_U; + Rfit::MatrixNplusONEd C_U; C_U.block(0, 0, n, n) = MatrixC_u(w, s, VarBeta); C_U(n, n) = 0; //add the border to the C_u matrix @@ -315,69 +317,69 @@ namespace BrokenLine { } C_U(n, i) = C_U(i, n); if (i > 0 && i < n - 1) - C_U(n, n) += sqr(s(i + 1) - s(i - 1)) / (4. * VarBeta(i)); + C_U(n, n) += Rfit::sqr(s(i + 1) - s(i - 1)) / (4. 
* VarBeta(i)); } #ifdef CPP_DUMP std::cout << "CU5\n" << C_U << std::endl; #endif - MatrixNplusONEd I; - choleskyInversion::invert(C_U, I); - // MatrixNplusONEd I = C_U.inverse(); + Rfit::MatrixNplusONEd I; + math::cholesky::invert(C_U, I); + // Rfit::MatrixNplusONEd I = C_U.inverse(); #ifdef CPP_DUMP std::cout << "I5\n" << I << std::endl; #endif - VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system + Rfit::VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); - Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); - Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); + Rfit::Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); + Rfit::Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); circle_results.par << atan2((e - d)(1), (e - d)(0)), - -circle_results.q * (fast_fit(2) - sqrt(sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), + -circle_results.q * (fast_fit(2) - sqrt(Rfit::sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), circle_results.q * (1. / fast_fit(2) + u(n)); assert(circle_results.q * circle_results.par(1) <= 0); - Vector2d eMinusd = e - d; + Rfit::Vector2d eMinusd = e - d; double tmp1 = eMinusd.squaredNorm(); - Matrix3d Jacob; - Jacob << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, + Rfit::Matrix3d jacobian; + jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) / tmp1, 0, (circle_results.q / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / - sqrt(sqr(2 * fast_fit(2)) - tmp1), + sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), (circle_results.q / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / - sqrt(sqr(2 * fast_fit(2)) - tmp1), + sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), 0, 0, 0, circle_results.q; circle_results.cov << I(0, 0), I(0, 1), I(0, n), I(1, 0), I(1, 1), I(1, n), I(n, 0), I(n, 1), I(n, n); - circle_results.cov = Jacob * circle_results.cov * Jacob.transpose(); + circle_results.cov = jacobian * circle_results.cov * jacobian.transpose(); //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... 
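// --- Illustrative sketch, not part of the patch -----------------------------
// The recurring idiom above (and in TranslateKarimaki below) is first-order
// error propagation: for a reparametrization p' = f(p) with Jacobian
// J = df/dp, the covariance transforms as cov' = J * cov * J^T. A minimal
// standalone example with Eigen; the 3x3 Jacobian mirrors the
// k -> pT = B/|k| change of variables applied later in BL_Helix_fit, while
// the helper function itself is hypothetical:

#include <Eigen/Core>
#include <cmath>

Eigen::Matrix3d propagateCov(const Eigen::Matrix3d& cov, double k, double B) {
  Eigen::Matrix3d jacobian;
  // identity on (phi, d); d(B/|k|)/dk = -B / (k * |k|) on the third parameter
  jacobian << 1., 0., 0.,
              0., 1., 0.,
              0., 0., -B / (k * std::abs(k));
  return jacobian * cov * jacobian.transpose();
}
// -----------------------------------------------------------------------------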
- TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), Jacob); - circle_results.cov(0, 0) += (1 + sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); + TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), jacobian); + circle_results.cov(0, 0) += (1 + Rfit::sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); //...And translate back to the original system - TranslateKarimaki(circle_results, d(0), d(1), Jacob); + TranslateKarimaki(circle_results, d(0), d(1), jacobian); // compute chi2 circle_results.chi2 = 0; for (i = 0; i < n; i++) { - circle_results.chi2 += w(i) * sqr(Z(i) - u(i)); + circle_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); if (i > 0 && i < n - 1) - circle_results.chi2 += - sqr(u(i - 1) / (s(i) - s(i - 1)) - u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + - u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / - VarBeta(i); + circle_results.chi2 += Rfit::sqr(u(i - 1) / (s(i) - s(i - 1)) - + u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + + u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / + VarBeta(i); } // assert(circle_results.chi2>=0); @@ -405,7 +407,7 @@ namespace BrokenLine { const V4& fast_fit, const double B, const PreparedBrokenLineData& data, - line_fit& line_results) { + Rfit::line_fit& line_results) { constexpr u_int n = N; u_int i; @@ -415,11 +417,11 @@ namespace BrokenLine { const auto& VarBeta = data.VarBeta; const double slope = -data.q / fast_fit(3); - Matrix2d R = RotationMatrix(slope); + Rfit::Matrix2d R = RotationMatrix(slope); - Matrix3d V = Matrix3d::Zero(); // covariance matrix XYZ - Matrix2x3d JacobXYZtosZ = Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) - VectorNd w = VectorNd::Zero(); + Rfit::Matrix3d V = Rfit::Matrix3d::Zero(); // covariance matrix XYZ + Rfit::Matrix2x3d JacobXYZtosZ = Rfit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + Rfit::VectorNd w = Rfit::VectorNd::Zero(); for (i = 0; i < n; i++) { V(0, 0) = hits_ge.col(i)[0]; // x errors V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy @@ -435,57 +437,57 @@ namespace BrokenLine { 1, 1)); // compute the orthogonal weight point by point } - VectorNd r_u; + Rfit::VectorNd r_u; for (i = 0; i < n; i++) { r_u(i) = w(i) * Z(i); } #ifdef CPP_DUMP std::cout << "CU4\n" << MatrixC_u(w, S, VarBeta) << std::endl; #endif - MatrixNd I; - choleskyInversion::invert(MatrixC_u(w, S, VarBeta), I); - // MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); + Rfit::MatrixNd I; + math::cholesky::invert(MatrixC_u(w, S, VarBeta), I); + // Rfit::MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); #ifdef CPP_DUMP std::cout << "I4\n" << I << std::endl; #endif - VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system + Rfit::VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system // line parameters in the system in which the first hit is the origin and with axis along SZ line_results.par << (u(1) - u(0)) / (S(1) - S(0)), u(0); auto idiff = 1. 
/ (S(1) - S(0)); - line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * sqr(idiff) + + line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * Rfit::sqr(idiff) + MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope), (I(0, 1) - I(0, 0)) * idiff, (I(0, 1) - I(0, 0)) * idiff, I(0, 0); // translate to the original SZ system - Matrix2d Jacob; - Jacob(0, 0) = 1.; - Jacob(0, 1) = 0; - Jacob(1, 0) = -S(0); - Jacob(1, 1) = 1.; + Rfit::Matrix2d jacobian; + jacobian(0, 0) = 1.; + jacobian(0, 1) = 0; + jacobian(1, 0) = -S(0); + jacobian(1, 1) = 1.; line_results.par(1) += -line_results.par(0) * S(0); - line_results.cov = Jacob * line_results.cov * Jacob.transpose(); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // rotate to the original sz system auto tmp = R(0, 0) - line_results.par(0) * R(0, 1); - Jacob(1, 1) = 1. / tmp; - Jacob(0, 0) = Jacob(1, 1) * Jacob(1, 1); - Jacob(0, 1) = 0; - Jacob(1, 0) = line_results.par(1) * R(0, 1) * Jacob(0, 0); - line_results.par(1) = line_results.par(1) * Jacob(1, 1); - line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * Jacob(1, 1); - line_results.cov = Jacob * line_results.cov * Jacob.transpose(); + jacobian(1, 1) = 1. / tmp; + jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); + jacobian(0, 1) = 0; + jacobian(1, 0) = line_results.par(1) * R(0, 1) * jacobian(0, 0); + line_results.par(1) = line_results.par(1) * jacobian(1, 1); + line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * jacobian(1, 1); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // compute chi2 line_results.chi2 = 0; for (i = 0; i < n; i++) { - line_results.chi2 += w(i) * sqr(Z(i) - u(i)); + line_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); if (i > 0 && i < n - 1) - line_results.chi2 += - sqr(u(i - 1) / (S(i) - S(i - 1)) - u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + - u(i + 1) / (S(i + 1) - S(i))) / - VarBeta(i); + line_results.chi2 += Rfit::sqr(u(i - 1) / (S(i) - S(i - 1)) - + u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + + u(i + 1) / (S(i + 1) - S(i))) / + VarBeta(i); } // assert(line_results.chi2>=0); @@ -526,27 +528,29 @@ namespace BrokenLine { \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. 
*/ template - inline helix_fit BL_Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B) { - helix_fit helix; - Vector4d fast_fit; + inline Rfit::helix_fit BL_Helix_fit(const Rfit::Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double B) { + Rfit::helix_fit helix; + Rfit::Vector4d fast_fit; BL_Fast_fit(hits, fast_fit); PreparedBrokenLineData data; karimaki_circle_fit circle; - line_fit line; - Matrix3d Jacob; + Rfit::line_fit line; + Rfit::Matrix3d jacobian; prepareBrokenLineData(hits, fast_fit, B, data); BL_Line_fit(hits_ge, fast_fit, B, data, line); BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix - Jacob << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (sqr(circle.par(2)) * circle.par(2)); + jacobian << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (Rfit::sqr(circle.par(2)) * circle.par(2)); circle.par(2) = B / std::abs(circle.par(2)); - circle.cov = Jacob * circle.cov * Jacob.transpose(); + circle.cov = jacobian * circle.cov * jacobian.transpose(); helix.par << circle.par, line.par; - helix.cov = MatrixXd::Zero(5, 5); + helix.cov = Rfit::MatrixXd::Zero(5, 5); helix.cov.block(0, 0, 3, 3) = circle.cov; helix.cov.block(3, 3, 2, 2) = line.cov; helix.q = circle.q; diff --git a/src/cuda/plugin-PixelTriplets/BrokenLineFitOnGPU.h b/src/cuda/plugin-PixelTriplets/BrokenLineFitOnGPU.h index d696bbadb..a30c251b7 100644 --- a/src/cuda/plugin-PixelTriplets/BrokenLineFitOnGPU.h +++ b/src/cuda/plugin-PixelTriplets/BrokenLineFitOnGPU.h @@ -20,8 +20,6 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; -using namespace Eigen; - // #define BL_DUMP_HITS template diff --git a/src/cuda/plugin-PixelTriplets/CAConstants.h b/src/cuda/plugin-PixelTriplets/CAConstants.h index 40be667a3..b063d0f6e 100644 --- a/src/cuda/plugin-PixelTriplets/CAConstants.h +++ b/src/cuda/plugin-PixelTriplets/CAConstants.h @@ -5,10 +5,10 @@ #include -#include "CUDADataFormats/gpuClusteringConstants.h" -#include "CUDACore/GPUSimpleVector.h" -#include "CUDACore/GPUVecArray.h" #include "CUDACore/HistoContainer.h" +#include "CUDACore/SimpleVector.h" +#include "CUDACore/VecArray.h" +#include "CUDADataFormats/gpuClusteringConstants.h" // #define ONLY_PHICUT @@ -27,7 +27,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT #ifndef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; } + constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; } constexpr uint32_t maxCellsPerHit() { return 128; } #else constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } @@ -37,7 +37,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } constexpr uint32_t maxCellsPerHit() { return 8 * 128; } #endif - constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; } constexpr uint32_t maxNumberOfLayerPairs() { return 20; } constexpr uint32_t maxNumberOfLayers() { return 10; } @@ -48,21 +48,21 @@ namespace CAConstants { using tindex_type = uint16_t; // for tuples #ifndef ONLY_PHICUT - using CellNeighbors = GPU::VecArray; - using CellTracks = GPU::VecArray; + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #else - using 
CellNeighbors = GPU::VecArray; - using CellTracks = GPU::VecArray; + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #endif - using CellNeighborsVector = GPU::SimpleVector; - using CellTracksVector = GPU::SimpleVector; + using CellNeighborsVector = cms::cuda::SimpleVector; + using CellTracksVector = cms::cuda::SimpleVector; - using OuterHitOfCell = GPU::VecArray; - using TuplesContainer = OneToManyAssoc; + using OuterHitOfCell = cms::cuda::VecArray; + using TuplesContainer = cms::cuda::OneToManyAssoc; using HitToTuple = - OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = OneToManyAssoc; + cms::cuda::OneToManyAssoc; // 3.5 should be enough + using TupleMultiplicity = cms::cuda::OneToManyAssoc; } // namespace CAConstants diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc index 4b893260c..4ea687af3 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc @@ -24,12 +24,20 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_isOuterHitOfCell_.reset( (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); assert(device_isOuterHitOfCell_.get()); + + cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); @@ -47,8 +55,8 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr assert(nActualPairs <= gpuPixelDoublets::nPairs); gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, @@ -84,7 +92,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -94,13 +102,14 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * m_params.dcaCutOuterTriplet_); if (nhits > 1 && m_params.earlyFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); } kernel_find_ntuplets(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + 
device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -114,11 +123,12 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); if (nhits > 1 && m_params.lateFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); } if (m_params.doStats_) { @@ -127,8 +137,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, @@ -154,7 +164,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); // remove duplicates (tracks that share a hit) diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu index 0e4a9aec5..08474c1af 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu @@ -51,7 +51,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -68,7 +68,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + gpuPixelDoublets::fishbone<<>>( hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); cudaCheck(cudaGetLastError()); } @@ -78,7 +78,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_find_ntuplets<<>>(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -108,7 +108,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); @@ -120,7 +120,7 @@ void 
CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + gpuPixelDoublets::fishbone<<>>( hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); cudaCheck(cudaGetLastError()); } @@ -132,8 +132,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, @@ -144,6 +144,9 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif + + // free space asap + // device_isOuterHitOfCell_.reset(); } template <> @@ -162,16 +165,26 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr // in principle we can use "nhits" to heuristically dimension the workspace... device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); assert(device_isOuterHitOfCell_.get()); + + cellStorage_ = cms::cuda::make_device_unique( + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks), + stream); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + { int threadsPerBlock = 128; // at least one block! 
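// --- Illustrative sketch, not part of the patch -----------------------------
// Every launch configuration in this file sizes the grid with the same
// round-up division, so that blocks * threadsPerBlock >= number of work items.
// A tiny helper capturing that arithmetic (the helper is hypothetical; the
// kernels above inline the expression):

constexpr unsigned int gridSize(unsigned int n, unsigned int threadsPerBlock) {
  // ceil(n / threadsPerBlock) in integer arithmetic; yields 0 for n == 0,
  // which is why callers clamp with std::max(1U, nhits) to get at least one block
  return (n + threadsPerBlock - 1) / threadsPerBlock;
}

// e.g. gridSize(1000, 128) == 8, and 8 * 128 = 1024 >= 1000
// -----------------------------------------------------------------------------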
int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); cudaCheck(cudaGetLastError()); } @@ -201,8 +214,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr dim3 thrs(stride, threadsPerBlock, 1); gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, @@ -252,7 +265,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h index efe960bcf..3c3e3d447 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h @@ -179,30 +179,29 @@ class CAHitNtupletGeneratorKernels { private: // workspace - CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr; - unique_ptr device_theCellNeighborsContainer_; - CAConstants::CellTracksVector* device_theCellTracks_ = nullptr; - unique_ptr device_theCellTracksContainer_; + unique_ptr cellStorage_; + unique_ptr device_theCellNeighbors_; + CAConstants::CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + CAConstants::CellTracks* device_theCellTracksContainer_; unique_ptr device_theCells_; unique_ptr device_isOuterHitOfCell_; uint32_t* device_nCells_ = nullptr; unique_ptr device_hitToTuple_; - AtomicPairCounter* device_hitToTuple_apc_ = nullptr; + cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; - AtomicPairCounter* device_hitTuple_apc_ = nullptr; + cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr; unique_ptr device_tupleMultiplicity_; - uint8_t* device_tmws_; - - unique_ptr device_storage_; + unique_ptr device_storage_; // params Params const& m_params; }; -using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; -using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h index d16e6a15f..fb505b126 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h @@ -12,36 +12,24 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { // ALLOCATIONS FOR THE INTERMEDIATE RESULTS 
(STAYS ON WORKER) ////////////////////////////////////////////////////////// - /* not used at the moment - cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); - cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); - */ + device_theCellNeighbors_ = Traits::template make_unique(stream); + device_theCellTracks_ = Traits::template make_unique(stream); device_hitToTuple_ = Traits::template make_unique(stream); device_tupleMultiplicity_ = Traits::template make_unique(stream); - auto storageSize = - 3 + (std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) + sizeof(AtomicPairCounter::c_type)) / - sizeof(AtomicPairCounter::c_type); + device_storage_ = Traits::template make_unique(3, stream); - device_storage_ = Traits::template make_unique(storageSize, stream); - - device_hitTuple_apc_ = (AtomicPairCounter*)device_storage_.get(); - device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get() + 1; + device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); + device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); - device_tmws_ = (uint8_t*)(device_storage_.get() + 3); - - assert(device_tmws_ + std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= - (uint8_t*)(device_storage_.get() + storageSize)); if #ifndef __CUDACC__ constexpr #endif - (std::is_same::value) { + (std::is_same::value) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h index fda3e508d..e35e20be9 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h @@ -19,8 +19,6 @@ #include "gpuFishbone.h" #include "gpuPixelDoublets.h" -using namespace gpuPixelDoublets; - using HitsOnGPU = TrackingRecHit2DSOAView; using HitsOnCPU = TrackingRecHit2DCUDA; @@ -33,11 +31,11 @@ using HitContainer = pixelTrack::HitContainer; __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, CAConstants::TupleMultiplicity *tupleMultiplicity, - AtomicPairCounter *apc, + cms::cuda::AtomicPairCounter *apc, GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, - CellNeighborsVector const *cellNeighbors, - CellTracksVector const *cellTracks, + gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, + gpuPixelDoublets::CellTracksVector const *cellTracks, GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, uint32_t nHits, uint32_t maxNumberOfDoublets, @@ -81,6 +79,10 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, printf("Tuples overflow\n"); if (*nCells >= maxNumberOfDoublets) printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow\n"); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); } for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -190,12 +192,12 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, } } -__global__ void kernel_connect(AtomicPairCounter *apc1, - AtomicPairCounter 
*apc2, // just to zero them, +__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, + cms::cuda::AtomicPairCounter *apc2, // just to zero them, GPUCACell::Hits const *__restrict__ hhp, GPUCACell *cells, uint32_t const *__restrict__ nCells, - CellNeighborsVector *cellNeighbors, + gpuPixelDoublets::CellNeighborsVector *cellNeighbors, GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, float hardCurvCut, float ptmin, @@ -266,9 +268,9 @@ __global__ void kernel_connect(AtomicPairCounter *apc1, __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, GPUCACell *__restrict__ cells, uint32_t const *nCells, - CellTracksVector *cellTracks, + gpuPixelDoublets::CellTracksVector *cellTracks, HitContainer *foundNtuplets, - AtomicPairCounter *apc, + cms::cuda::AtomicPairCounter *apc, Quality *__restrict__ quality, unsigned int minHitsPerNtuplet) { // recursive: not obvious to widen diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc index 22566d057..d0e428da6 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc @@ -92,17 +92,16 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(edm::ProductRegistry& reg } CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU() { - if (m_params.doStats_) { - // crash on multi-gpu processes - if (m_params.onGPU_) { + if (m_params.onGPU_) { + if (m_params.doStats_) { + // crash on multi-gpu processes CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); - } else { - CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); } - } - if (m_params.onGPU_) { cudaFree(m_counters); } else { + if (m_params.doStats_) { + CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); + } delete m_counters; } } @@ -116,14 +115,15 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); kernels.allocateOnGPU(stream); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } else { diff --git a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h index 17299c1bd..823987658 100644 --- a/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h +++ b/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h @@ -2,16 +2,14 @@ #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h #include -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" -#include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDACore/GPUSimpleVector.h" +#include "CUDACore/SimpleVector.h" +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" #include "CAHitNtupletGeneratorKernels.h" -#include "HelixFitOnGPU.h" - -// FIXME (split header???) 
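// --- Illustrative sketch, not part of the patch -----------------------------
// kernel_connect above receives cms::cuda::AtomicPairCounter pointers "just to
// zero them". The point of such a pair counter is to pack two 32-bit counters
// into one 64-bit word, so that reserving space (bump the object count by one
// and the storage count by n) is a single atomicAdd. A simplified stand-in for
// the idea; the real class lives in CUDACore, and the name and layout here are
// illustrative only:

#include <cstdint>

struct PairCounterSketch {
  unsigned long long counter = 0;  // low 32 bits: words used; high 32 bits: objects

  __device__ void zero() { counter = 0; }

  __device__ uint32_t add(uint32_t n) {
    // one atomic op reserves n words for one new object
    unsigned long long inc = (unsigned long long)(n) | (1ull << 32);
    unsigned long long old = atomicAdd(&counter, inc);
    return uint32_t(old);  // previous "words used" = offset of this reservation
  }
};
// -----------------------------------------------------------------------------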
#include "GPUCACell.h" +#include "HelixFitOnGPU.h" namespace edm { class Event; diff --git a/src/cuda/plugin-PixelTriplets/GPUCACell.h b/src/cuda/plugin-PixelTriplets/GPUCACell.h index 918c13f64..df4354e59 100644 --- a/src/cuda/plugin-PixelTriplets/GPUCACell.h +++ b/src/cuda/plugin-PixelTriplets/GPUCACell.h @@ -9,11 +9,12 @@ #include -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" -#include "CUDACore/GPUSimpleVector.h" -#include "CUDACore/GPUVecArray.h" +#include "CUDACore/SimpleVector.h" +#include "CUDACore/VecArray.h" #include "CUDACore/cuda_assert.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" + #include "CAConstants.h" #include "CircleEq.h" @@ -31,7 +32,7 @@ class GPUCACell { using Hits = TrackingRecHit2DSOAView; using hindex_type = Hits::hindex_type; - using TmpTuple = GPU::VecArray; + using TmpTuple = cms::cuda::VecArray; using HitContainer = pixelTrack::HitContainer; using Quality = trackQuality::Quality; @@ -56,24 +57,56 @@ class GPUCACell { theInnerZ = hh.zGlobal(innerHitId); theInnerR = hh.rGlobal(innerHitId); - outerNeighbors().reset(); - tracks().reset(); + // link to default empty + theOuterNeighbors = &cellNeighbors[0]; + theTracks = &cellTracks[0]; assert(outerNeighbors().empty()); assert(tracks().empty()); } __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + // use smart cache + if (outerNeighbors().empty()) { + auto i = cellNeighbors.extend(); // maybe waisted.... + if (i > 0) { + cellNeighbors[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellNeighbors[0]); + atomicCAS((ptrAsInt*)(&theOuterNeighbors), + zero, + (ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... +#else + theOuterNeighbors = &cellNeighbors[i]; +#endif + } else + return -1; + } + __threadfence(); return outerNeighbors().push_back(t); } __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + if (tracks().empty()) { + auto i = cellTracks.extend(); // maybe waisted.... + if (i > 0) { + cellTracks[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellTracks[0]); + atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... 
+#else + theTracks = &cellTracks[i]; +#endif + } else + return -1; + } + __threadfence(); return tracks().push_back(t); } - __device__ __forceinline__ CellTracks& tracks() { return theTracks; } - __device__ __forceinline__ CellTracks const& tracks() const { return theTracks; } - __device__ __forceinline__ CellNeighbors& outerNeighbors() { return theOuterNeighbors; } - __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return theOuterNeighbors; } + __device__ __forceinline__ CellTracks& tracks() { return *theTracks; } + __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } + __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } + __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } __device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } @@ -246,7 +279,7 @@ class GPUCACell { GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, - AtomicPairCounter& apc, + cms::cuda::AtomicPairCounter& apc, Quality* __restrict__ quality, TmpTuple& tmpNtuplet, const unsigned int minHitsPerNtuplet, @@ -297,8 +330,8 @@ class GPUCACell { } private: - CellNeighbors theOuterNeighbors; - CellTracks theTracks; + CellNeighbors* theOuterNeighbors; + CellTracks* theTracks; public: int32_t theDoubletId; diff --git a/src/cuda/plugin-PixelTriplets/RiemannFit.h b/src/cuda/plugin-PixelTriplets/RiemannFit.h index c68aa00bd..994b1dcf9 100644 --- a/src/cuda/plugin-PixelTriplets/RiemannFit.h +++ b/src/cuda/plugin-PixelTriplets/RiemannFit.h @@ -486,7 +486,7 @@ namespace Rfit { printIt(&V, "circle_fit - V:"); cov_rad += scatter_cov_rad; printIt(&cov_rad, "circle_fit - cov_rad:"); - choleskyInversion::invert(cov_rad, G); + math::cholesky::invert(cov_rad, G); // G = cov_rad.inverse(); renorm = G.sum(); G *= 1. / renorm; @@ -889,11 +889,11 @@ namespace Rfit { // Build A^T V-1 A, where V-1 is the covariance of only the Y components. MatrixNd Vy_inv; - choleskyInversion::invert(cov_with_ms, Vy_inv); + math::cholesky::invert(cov_with_ms, Vy_inv); // MatrixNd Vy_inv = cov_with_ms.inverse(); Eigen::Matrix Cov_params = A * Vy_inv * A.transpose(); // Compute the Covariance Matrix of the fit parameters - choleskyInversion::invert(Cov_params, Cov_params); + math::cholesky::invert(Cov_params, Cov_params); // Now Compute the Parameters in the form [2,1] // The first component is q. 
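// --- Illustrative sketch, not part of the patch -----------------------------
// The addOuterNeighbor/addTrack changes in GPUCACell.h above implement lazy
// container attachment: every cell starts out pointing at a shared empty
// container (slot 0), and the first thread that needs to append extends the
// backing vector and atomically swaps the cell's pointer to the fresh slot.
// A simplified version of the idiom; Container/Vector are stand-ins for
// CellTracks/CellTracksVector and friends:

using ptrAsIntSketch = unsigned long long;

template <typename Container, typename Vector>
__device__ int lazyPushBack(Container*& slot, Vector& storage,
                            typename Container::value_t value) {
  if (slot->empty()) {           // still attached to the shared default?
    auto i = storage.extend();   // reserve a fresh container; slot 0 is the
    if (i <= 0)                  // default, so only an index > 0 is usable
      return -1;                 // backing storage exhausted
    storage[i].reset();
    auto expected = (ptrAsIntSketch)(&storage[0]);
    // attach the fresh slot unless another thread already attached one;
    // if the CAS fails, slot i is simply wasted (it cannot be handed back)
    atomicCAS((ptrAsIntSketch*)(&slot), expected, (ptrAsIntSketch)(&storage[i]));
  }
  __threadfence();               // make the attached container visible
  return slot->push_back(value);
}
// -----------------------------------------------------------------------------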
diff --git a/src/cuda/plugin-PixelTriplets/RiemannFitOnGPU.h b/src/cuda/plugin-PixelTriplets/RiemannFitOnGPU.h index 0e11bb3f3..02766b557 100644 --- a/src/cuda/plugin-PixelTriplets/RiemannFitOnGPU.h +++ b/src/cuda/plugin-PixelTriplets/RiemannFitOnGPU.h @@ -18,8 +18,6 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; -using namespace Eigen; - template __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, diff --git a/src/cuda/plugin-PixelTriplets/choleskyInversion.h b/src/cuda/plugin-PixelTriplets/choleskyInversion.h index eba2a1764..2cb4105f8 100644 --- a/src/cuda/plugin-PixelTriplets/choleskyInversion.h +++ b/src/cuda/plugin-PixelTriplets/choleskyInversion.h @@ -9,325 +9,341 @@ * fully inlined specialized code to perform the inversion of a * positive defined matrix of rank up to 6. * + * adapted from ROOT::Math::CholeskyDecomp * originally by * @author Manuel Schiller * @date Aug 29 2008 * * */ -namespace choleskyInversion { - - template - inline constexpr void invert11(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - dst(0, 0) = F(1.0) / src(0, 0); - } - - template - inline constexpr void invert22(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0) * src(1, 0) * luc0; - auto luc2 = F(1.0) / (src(1, 1) - luc1); - - auto li21 = luc1 * luc0 * luc2; - - dst(0, 0) = li21 + luc0; - dst(1, 0) = -src(1, 0) * luc0 * luc2; - dst(1, 1) = luc2; - } - - template - inline constexpr void invert33(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); - luc5 = F(1.0) / luc5; - - auto li21 = -luc0 * luc1; - auto li32 = -(luc2 * luc4); - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - - dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; - dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; - dst(1, 1) = luc5 * li32 * li32 + luc2; - dst(2, 0) = luc5 * li31; - dst(2, 1) = luc5 * li32; - dst(2, 2) = luc5; - } - - template - inline constexpr void invert44(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - - dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc9 * li41 * 
li43 + luc5 * li31; - dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc9 * li43 * li43 + luc5; - dst(3, 0) = luc9 * li41; - dst(3, 1) = luc9 * li42; - dst(3, 2) = luc9 * li43; - dst(3, 3) = luc9; - } - - template - inline constexpr void invert55(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - auto luc10 = src(4, 0); - auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); - auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); - auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); - auto luc14 = - src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); - luc14 = F(1.0) / luc14; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - auto li54 = -luc13 * luc9; - auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; - auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; - auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - - luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + - luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * - luc0; - - dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; - dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; - dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; - dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; - dst(3, 3) = luc14 * li54 * li54 + luc9; - dst(4, 0) = luc14 * li51; - dst(4, 1) = luc14 * li52; - dst(4, 2) = luc14 * li53; - dst(4, 3) = luc14 * li54; - dst(4, 4) = luc14; - } - - template - inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - auto luc10 = src(4, 0); - auto 
luc11 = (src(4, 1) - luc0 * luc1 * luc10); - auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); - auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); - auto luc14 = - src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); - luc14 = F(1.0) / luc14; - auto luc15 = src(5, 0); - auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); - auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); - auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); - auto luc19 = - (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); - auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + - luc9 * luc18 * luc18 + luc14 * luc19 * luc19); - luc20 = F(1.0) / luc20; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - auto li54 = -luc13 * luc9; - auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; - auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; - auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - - luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + - luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * - luc0; - - auto li65 = -luc19 * luc14; - auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; - auto li63 = (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; - auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - - luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + - luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * - luc2; - auto li61 = - (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + - luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + - luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - - luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - - luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - luc19 * luc13 * luc6 * luc9 * luc14 + - luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + luc19 * luc10 * luc14 - luc15) * - luc0; - - dst(0, 0) = - luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; - dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; - dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; - dst(3, 2) = luc20 * li63 * li64 + 
luc14 * li53 * li54 + luc9 * li43; - dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; - dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; - dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; - dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; - dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; - dst(4, 4) = luc20 * li65 * li65 + luc14; - dst(5, 0) = luc20 * li61; - dst(5, 1) = luc20 * li62; - dst(5, 2) = luc20 * li63; - dst(5, 3) = luc20 * li64; - dst(5, 4) = luc20 * li65; - dst(5, 5) = luc20; - } - - template - inline constexpr void symmetrize11(M& dst) {} - template - inline constexpr void symmetrize22(M& dst) { - dst(0, 1) = dst(1, 0); - } - template - inline constexpr void symmetrize33(M& dst) { - symmetrize22(dst); - dst(0, 2) = dst(2, 0); - dst(1, 2) = dst(2, 1); - } - template - inline constexpr void symmetrize44(M& dst) { - symmetrize33(dst); - dst(0, 3) = dst(3, 0); - dst(1, 3) = dst(3, 1); - dst(2, 3) = dst(3, 2); - } - template - inline constexpr void symmetrize55(M& dst) { - symmetrize44(dst); - dst(0, 4) = dst(4, 0); - dst(1, 4) = dst(4, 1); - dst(2, 4) = dst(4, 2); - dst(3, 4) = dst(4, 3); - } - template - inline constexpr void symmetrize66(M& dst) { - symmetrize55(dst); - dst(0, 5) = dst(5, 0); - dst(1, 5) = dst(5, 1); - dst(2, 5) = dst(5, 2); - dst(3, 5) = dst(5, 3); - dst(4, 5) = dst(5, 4); - } - - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert22(src, dst); +namespace math { + namespace cholesky { + + template + inline constexpr void invert11(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + dst(0, 0) = F(1.0) / src(0, 0); + } + + template + inline constexpr void invert22(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0) * src(1, 0) * luc0; + auto luc2 = F(1.0) / (src(1, 1) - luc1); + + auto li21 = luc1 * luc0 * luc2; + + dst(0, 0) = li21 + luc0; + dst(1, 0) = -src(1, 0) * luc0 * luc2; + dst(1, 1) = luc2; + } + + template + inline constexpr void invert33(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); + luc5 = F(1.0) / luc5; + + auto li21 = -luc0 * luc1; + auto li32 = -(luc2 * luc4); + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + + dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; + dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; + dst(1, 1) = luc5 * li32 * li32 + luc2; + dst(2, 0) = luc5 * li31; + dst(2, 1) = luc5 * li32; + dst(2, 2) = luc5; + } + + template + inline constexpr void invert44(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * 
luc7 * luc7 + luc8 * (luc8 * luc5));
+      luc9 = F(1.0) / luc9;
+
+      auto li21 = -luc1 * luc0;
+      auto li32 = -luc2 * luc4;
+      auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+      auto li43 = -(luc8 * luc5);
+      auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2;
+      auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0;
+
+      dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0;
+      dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21;
+      dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2;
+      dst(2, 0) = luc9 * li41 * li43 + luc5 * li31;
+      dst(2, 1) = luc9 * li42 * li43 + luc5 * li32;
+      dst(2, 2) = luc9 * li43 * li43 + luc5;
+      dst(3, 0) = luc9 * li41;
+      dst(3, 1) = luc9 * li42;
+      dst(3, 2) = luc9 * li43;
+      dst(3, 3) = luc9;
+    }
+
+    template <typename M1, typename M2>
+    inline constexpr void invert55(M1 const& src, M2& dst) {
+      using F = decltype(src(0, 0));
+      auto luc0 = F(1.0) / src(0, 0);
+      auto luc1 = src(1, 0);
+      auto luc2 = src(1, 1) - luc0 * luc1 * luc1;
+      luc2 = F(1.0) / luc2;
+      auto luc3 = src(2, 0);
+      auto luc4 = (src(2, 1) - luc0 * luc1 * luc3);
+      auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4);
+      luc5 = F(1.0) / luc5;
+      auto luc6 = src(3, 0);
+      auto luc7 = (src(3, 1) - luc0 * luc1 * luc6);
+      auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7);
+      auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5));
+      luc9 = F(1.0) / luc9;
+      auto luc10 = src(4, 0);
+      auto luc11 = (src(4, 1) - luc0 * luc1 * luc10);
+      auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11);
+      auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12);
+      auto luc14 =
+          src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13);
+      luc14 = F(1.0) / luc14;
+
+      auto li21 = -luc1 * luc0;
+      auto li32 = -luc2 * luc4;
+      auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+      auto li43 = -(luc8 * luc5);
+      auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2;
+      auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0;
+      auto li54 = -luc13 * luc9;
+      auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5;
+      auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2;
+      auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 -
+                   luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 +
+                   luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) *
+                  luc0;
+
+      dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0;
+      dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21;
+      dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2;
+      dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31;
+      dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32;
+      dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5;
+      dst(3, 0) = luc14 * li51 * li54 + luc9 * li41;
+      dst(3, 1) = luc14 * li52 * li54 + luc9 * li42;
+      dst(3, 2) = luc14 * li53 * li54 + luc9 * li43;
+      dst(3, 3) = luc14 * li54 * li54 + luc9;
+      dst(4, 0) = luc14 * li51;
+      dst(4, 1) = luc14 * li52;
+      dst(4, 2) = luc14 * li53;
+      dst(4, 3) = luc14 * li54;
+      dst(4, 4) = luc14;
+    }
+
+    template <typename M1, typename M2>
+    inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) {
+      using F = decltype(src(0, 0));
+      auto luc0 = F(1.0) / src(0, 0);
+      auto luc1 = src(1, 0);
+      auto luc2 = src(1, 1) - luc0 * luc1 * luc1;
+      luc2 = F(1.0) / luc2;
+      auto luc3 = src(2, 0);
+      auto luc4 = (src(2, 1) - luc0 * luc1 * luc3);
+      auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4);
+      luc5 = F(1.0) / luc5;
+      auto luc6 = src(3, 0);
+      auto luc7 = (src(3, 1) - luc0 * luc1 * luc6);
+      auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7);
+      auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5));
+      luc9 = F(1.0) / luc9;
+      auto luc10 = src(4, 0);
+      auto luc11 = (src(4, 1) - luc0 * luc1 * luc10);
+      auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11);
+      auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12);
+      auto luc14 =
+          src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13);
+      luc14 = F(1.0) / luc14;
+      auto luc15 = src(5, 0);
+      auto luc16 = (src(5, 1) - luc0 * luc1 * luc15);
+      auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16);
+      auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17);
+      auto luc19 =
+          (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18);
+      auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 +
+                                luc9 * luc18 * luc18 + luc14 * luc19 * luc19);
+      luc20 = F(1.0) / luc20;
+
+      auto li21 = -luc1 * luc0;
+      auto li32 = -luc2 * luc4;
+      auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+      auto li43 = -(luc8 * luc5);
+      auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2;
+      auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0;
+      auto li54 = -luc13 * luc9;
+      auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5;
+      auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2;
+      auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 -
+                   luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 +
+                   luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) *
+                  luc0;
+
+      auto li65 = -luc19 * luc14;
+      auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9;
+      auto li63 =
+          (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5;
+      auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 -
+                   luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 +
+                   luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) *
+                  luc2;
+      auto li61 =
+          (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 +
+           luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 +
+           luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 -
+           luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 -
+           luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 -
+           luc19 * luc13 * luc6 * luc9 * luc14 + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 +
+           luc19 * luc10 * luc14 - luc15) *
+          luc0;
+
+      dst(0, 0) = luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 +
+                  luc2 * li21 * li21 + luc0;
+      dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21;
+      dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2;
+      dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31;
+      dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32;
+      dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5;
+      dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41;
+      dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42;
+      dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43;
+      dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9;
+      dst(4, 0) = luc20 * li61 * li65 + luc14 * li51;
+      dst(4, 1) = luc20 * li62 * li65 + luc14 * li52;
+      dst(4, 2) = luc20 * li63 * li65 + luc14 * li53;
+      dst(4, 3) = luc20 * li64 * li65 + luc14 * li54;
+      dst(4, 4) = luc20 * li65 * li65 + luc14;
+      dst(5, 0) = luc20 * li61;
+      dst(5, 1) = luc20 * li62;
+      dst(5, 2) = luc20 * li63;
+      dst(5, 3) = luc20 * li64;
+      dst(5, 4) = luc20 * li65;
+      dst(5, 5) = luc20;
+    }
+
+    template <typename M>
+    inline constexpr void symmetrize11(M& dst) {}
+
+    template <typename M>
+    inline constexpr void symmetrize22(M& dst) {
+      dst(0, 1) = dst(1, 0);
+    }
+
+    template <typename M>
+    inline constexpr void symmetrize33(M& dst) {
       symmetrize22(dst);
+      dst(0, 2) = dst(2, 0);
+      dst(1, 2) = dst(2, 1);
     }
-  };
-  template <typename M1, typename M2>
-  struct Inverter<M1, M2, 3> {
-    static constexpr void eval(M1 const& src, M2& dst) {
-      invert33(src, dst);
+
+    template <typename M>
+    inline constexpr void symmetrize44(M& dst) {
       symmetrize33(dst);
+      dst(0, 3) = dst(3, 0);
+      dst(1, 3) = dst(3, 1);
+      dst(2, 3) = dst(3, 2);
     }
-  };
-  template <typename M1, typename M2>
-  struct Inverter<M1, M2, 4> {
-    static constexpr void eval(M1 const& src, M2& dst) {
-      invert44(src, dst);
+
+    template <typename M>
+    inline constexpr void symmetrize55(M& dst) {
       symmetrize44(dst);
+      dst(0, 4) = dst(4, 0);
+      dst(1, 4) = dst(4, 1);
+      dst(2, 4) = dst(4, 2);
+      dst(3, 4) = dst(4, 3);
     }
-  };
-  template <typename M1, typename M2>
-  struct Inverter<M1, M2, 5> {
-    static constexpr void eval(M1 const& src, M2& dst) {
-      invert55(src, dst);
+
+    template <typename M>
+    inline constexpr void symmetrize66(M& dst) {
       symmetrize55(dst);
+      dst(0, 5) = dst(5, 0);
+      dst(1, 5) = dst(5, 1);
+      dst(2, 5) = dst(5, 2);
+      dst(3, 5) = dst(5, 3);
+      dst(4, 5) = dst(5, 4);
     }
-  };
-  template <typename M1, typename M2>
-  struct Inverter<M1, M2, 6> {
-    static constexpr void eval(M1 const& src, M2& dst) {
-      invert66(src, dst);
-      symmetrize66(dst);
-    }
-  };
-  // Eigen interface
-  template <typename D1, typename D2>
-  inline constexpr void invert(Eigen::DenseBase<D1> const& src, Eigen::DenseBase<D2>& dst) {
-    using M1 = Eigen::DenseBase<D1>;
-    using M2 = Eigen::DenseBase<D2>;
-    Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst);
-  }
+
+    template <typename M1, typename M2, int N>
+    struct Inverter {
+      static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 1> {
+      static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 2> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert22(src, dst);
+        symmetrize22(dst);
+      }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 3> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert33(src, dst);
+        symmetrize33(dst);
+      }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 4> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert44(src, dst);
+        symmetrize44(dst);
+      }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 5> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert55(src, dst);
+        symmetrize55(dst);
+      }
+    };
+
+    template <typename M1, typename M2>
+    struct Inverter<M1, M2, 6> {
+      static constexpr void eval(M1 const& src, M2& dst) {
+        invert66(src, dst);
+        symmetrize66(dst);
+      }
+    };
+
+    // Eigen interface
+    template <typename D1, typename D2>
+    inline constexpr void invert(Eigen::DenseBase<D1> const& src, Eigen::DenseBase<D2>& dst) {
+      using M1 = Eigen::DenseBase<D1>;
+      using M2 = Eigen::DenseBase<D2>;
+      Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst);
+    }
 
-}  // namespace choleskyInversion
+  }  // namespace cholesky
+}  // namespace math
 
 #endif  // DataFormat_Math_choleskyInversion_h
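Note: the header above unrolls an LDL^T (Cholesky-style) factorization for symmetric positive-definite matrices up to 6x6, fills only the lower triangle, and mirrors it with the symmetrizeNN() helpers; any other size falls back to Eigen's generic inverse(). A minimal usage sketch of the relocated `math::cholesky::invert` entry point follows; the include path and test values are assumptions for illustration, not taken from the diff.

```cuda
// Illustrative only: invoking the new math::cholesky::invert Eigen interface.
#include <Eigen/Core>

#include "DataFormats/Math/choleskyInversion.h"  // assumed include path

int main() {
  Eigen::Matrix3d m;
  // a symmetric positive-definite input matrix (made-up values)
  m << 4., 1., 2.,
       1., 5., 3.,
       2., 3., 6.;
  Eigen::Matrix3d inv;
  // dispatches to Inverter<..., 3>, i.e. invert33() followed by symmetrize33()
  math::cholesky::invert(m, inv);
  return (m * inv).isIdentity(1e-9) ? 0 : 1;
}
```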
diff --git a/src/cuda/plugin-PixelTriplets/gpuFishbone.h b/src/cuda/plugin-PixelTriplets/gpuFishbone.h
index b6feaae30..2e2446ea3 100644
--- a/src/cuda/plugin-PixelTriplets/gpuFishbone.h
+++ b/src/cuda/plugin-PixelTriplets/gpuFishbone.h
@@ -9,7 +9,7 @@
 #include "DataFormats/approx_atan2.h"
 #include "Geometry/phase1PixelTopology.h"
 
-#include "CUDACore/GPUVecArray.h"
+#include "CUDACore/VecArray.h"
 #include "CUDACore/cuda_assert.h"
 
 #include "GPUCACell.h"
diff --git a/src/cuda/plugin-PixelTriplets/gpuPixelDoublets.h b/src/cuda/plugin-PixelTriplets/gpuPixelDoublets.h
index d19ab8930..e906f85f1 100644
--- a/src/cuda/plugin-PixelTriplets/gpuPixelDoublets.h
+++ b/src/cuda/plugin-PixelTriplets/gpuPixelDoublets.h
@@ -7,8 +7,6 @@
 
 namespace gpuPixelDoublets {
 
-  using namespace gpuPixelDoubletsAlgos;
-
   constexpr int nPairs = 13 + 2 + 4;
   static_assert(nPairs <= CAConstants::maxNumberOfLayerPairs());
@@ -75,6 +73,17 @@ namespace gpuPixelDoublets {
     int first = blockIdx.x * blockDim.x + threadIdx.x;
     for (int i = first; i < nHits; i += gridDim.x * blockDim.x)
       isOuterHitOfCell[i].reset();
+
+    if (0 == first) {
+      cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer);
+      cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer);
+      auto i = cellNeighbors->extend();
+      assert(0 == i);
+      (*cellNeighbors)[0].reset();
+      i = cellTracks->extend();
+      assert(0 == i);
+      (*cellTracks)[0].reset();
+    }
   }
 
   constexpr auto getDoubletsFromHistoMaxBlockSize = 64;  // for both x and y
diff --git a/src/cuda/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h b/src/cuda/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h
index 3accacf63..6d6a62c88 100644
--- a/src/cuda/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h
+++ b/src/cuda/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h
@@ -9,13 +9,13 @@
 #include "CUDADataFormats/TrackingRecHit2DCUDA.h"
 #include "DataFormats/approx_atan2.h"
 
-#include "CUDACore/GPUVecArray.h"
+#include "CUDACore/VecArray.h"
 #include "CUDACore/cuda_assert.h"
 
 #include "CAConstants.h"
 #include "GPUCACell.h"
 
-namespace gpuPixelDoubletsAlgos {
+namespace gpuPixelDoublets {
 
   using CellNeighbors = CAConstants::CellNeighbors;
   using CellTracks = CAConstants::CellTracks;
@@ -239,6 +239,6 @@ namespace gpuPixelDoubletsAlgos {
     }  // loop in block...
   }
 
-}  // namespace gpuPixelDoubletsAlgos
+}  // namespace gpuPixelDoublets
 
 #endif  // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h
diff --git a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h
index f98cbf959..201971770 100644
--- a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h
+++ b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h
@@ -48,7 +48,7 @@ namespace gpuVertexFinder {
     assert(pdata);
     assert(zt);
 
-    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
@@ -101,7 +101,7 @@ namespace gpuVertexFinder {
        nn[i]++;
      };
 
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __syncthreads();
@@ -122,7 +122,7 @@ namespace gpuVertexFinder {
        mdist = dist;
        iv[i] = j;  // assign to cluster (better be unique??)
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __syncthreads();
@@ -171,7 +171,7 @@ namespace gpuVertexFinder {
        mdist = dist;
        minJ = j;
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
      // should belong to the same cluster...
      assert(iv[i] == iv[minJ]);
      assert(nn[i] <= nn[iv[i]]);
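Note: the recurring change in the vertex-finding kernels is purely a namespace move, the histogram helpers now live in `cms::cuda`. For readers unfamiliar with `forEachInBins`: it applies a functor to every entry in the bin of a value and in its n neighbouring bins on each side. A simplified sketch of that idea follows (the real implementation lives in `CUDACore/HistoContainer.h`; this is an approximation under assumed member names, not a copy):

```cuda
// Simplified sketch: visit all elements whose bin index is within +-n of
// bin(value), clamping the range to the histogram bounds.
#include <algorithm>

template <typename Hist, typename V, typename Func>
__host__ __device__ void forEachInBinsSketch(Hist const& hist, V value, int n, Func func) {
  int bs = Hist::bin(value);                       // central bin of the value
  int be = std::min(int(Hist::nbins()) - 1, bs + n);
  bs = std::max(0, bs - n);
  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj)
    func(*pj);                                     // user code sees each entry
}
```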
diff --git a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h
index 8ac205b00..504ee4cf2 100644
--- a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h
+++ b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h
@@ -44,7 +44,7 @@ namespace gpuVertexFinder {
     assert(pdata);
     assert(zt);
 
-    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
@@ -96,7 +96,7 @@ namespace gpuVertexFinder {
        nn[i]++;
      };
 
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __syncthreads();
@@ -118,7 +118,7 @@ namespace gpuVertexFinder {
        mz = zt[j];
        iv[i] = j;  // assign to cluster (better be unique??)
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __syncthreads();
@@ -172,7 +172,7 @@ namespace gpuVertexFinder {
        }
        assert(iv[i] == iv[j]);
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
    __syncthreads();
 #endif
@@ -194,7 +194,7 @@ namespace gpuVertexFinder {
        mdist = dist;
        iv[i] = iv[j];  // assign to cluster (better be unique??)
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __shared__ unsigned int foundClusters;
diff --git a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksIterative.h b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksIterative.h
index dd6f5fef7..6e7da0efd 100644
--- a/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksIterative.h
+++ b/src/cuda/plugin-PixelVertexFinding/gpuClusterTracksIterative.h
@@ -44,7 +44,7 @@ namespace gpuVertexFinder {
     assert(pdata);
     assert(zt);
 
-    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
@@ -97,7 +97,7 @@ namespace gpuVertexFinder {
        nn[i]++;
      };
 
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __shared__ int nloops;
@@ -165,7 +165,7 @@ namespace gpuVertexFinder {
        mdist = dist;
        iv[i] = iv[j];  // assign to cluster (better be unique??)
      };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
 
    __shared__ unsigned int foundClusters;
diff --git a/src/cuda/plugin-PixelVertexFinding/gpuVertexFinderImpl.h b/src/cuda/plugin-PixelVertexFinding/gpuVertexFinderImpl.h
index a1f1f96bf..f3260cad7 100644
--- a/src/cuda/plugin-PixelVertexFinding/gpuVertexFinderImpl.h
+++ b/src/cuda/plugin-PixelVertexFinding/gpuVertexFinderImpl.h
@@ -109,7 +109,7 @@ namespace gpuVertexFinder {
     loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
     cudaCheck(cudaGetLastError());
 #else
-    cudaCompat::resetGrid();
+    cms::cudacompat::resetGrid();
     init(soa, ws_d.get());
     loadTracks(tksoa, soa, ws_d.get(), ptMin);
 #endif
diff --git a/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
index 207195bda..06624744e 100644
--- a/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
+++ b/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
@@ -44,6 +44,7 @@ class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork {
   std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
   PixelFormatterErrors errors_;
 
+  const bool isRun2_;
   const bool includeErrors_;
   const bool useQuality_;
 };
@@ -52,6 +53,7 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg)
     : rawGetToken_(reg.consumes<FEDRawDataCollection>()),
       digiPutToken_(reg.produces<cms::cuda::Product<SiPixelDigisCUDA>>()),
       clusterPutToken_(reg.produces<cms::cuda::Product<SiPixelClustersCUDA>>()),
+      isRun2_(true),
       includeErrors_(true),
       useQuality_(true) {
   if (includeErrors_) {
@@ -145,7 +147,8 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
   }  // end of for loop
 
-  gpuAlgo_.makeClustersAsync(gpuMap,
+  gpuAlgo_.makeClustersAsync(isRun2_,
+                             gpuMap,
                              gpuModulesToUnpack,
                              gpuGains,
                              *wordFedAppender_,
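Note on the `cudaCompat` to `cms::cudacompat` rename in gpuVertexFinderImpl.h above: the compat layer is what lets the same kernel source run serially in the `#else` branch. A rough sketch of the idea under assumptions (this is not the contents of the real `CUDACore/cudaCompat.h`, only an illustration of why a `resetGrid()` call is needed before invoking "kernels" as plain functions):

```cuda
// Assumed sketch: on the CPU the grid/block builtins become ordinary globals,
// and resetGrid() returns them to a 1x1 configuration before each "launch".
namespace cms {
  namespace cudacompat {
    struct DimSketch {
      unsigned x = 1, y = 1, z = 1;
    };
    inline DimSketch blockIdx;  // shared state read by the serial "kernels"
    inline DimSketch gridDim;
    inline void resetGrid() {
      blockIdx = DimSketch{};
      gridDim = DimSketch{};
    }
  }  // namespace cudacompat
}  // namespace cms
```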
-#include "SiPixelRawToClusterGPUKernel.h" - namespace pixelgpudetails { // number of words for all the FEDs @@ -363,7 +358,7 @@ namespace pixelgpudetails { uint32_t *pdigi, uint32_t *rawIdArr, uint16_t *moduleId, - GPU::SimpleVector *err, + cms::cuda::SimpleVector *err, bool useQualityInfo, bool includeErrors, bool debug) { @@ -489,8 +484,8 @@ namespace pixelgpudetails { } __shared__ uint32_t ws[32]; - blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws); - blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClustering::MaxNumModules - 1024, ws); + cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws); + cms::cuda::blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClustering::MaxNumModules - 1024, ws); for (int i = first + 1025, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) { moduleStart[i] += moduleStart[1024]; @@ -524,7 +519,8 @@ namespace pixelgpudetails { } // Interface to outside - void SiPixelRawToClusterGPUKernel::makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, + void SiPixelRawToClusterGPUKernel::makeClustersAsync(bool isRun2, + const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender &wordFed, @@ -600,7 +596,8 @@ namespace pixelgpudetails { int blocks = (std::max(int(wordCounter), int(gpuClustering::MaxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; - gpuCalibPixel::calibDigis<<>>(digis_d.moduleInd(), + gpuCalibPixel::calibDigis<<>>(isRun2, + digis_d.moduleInd(), digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(), diff --git a/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h b/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h index a9e7a13cc..3cbce9e71 100644 --- a/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h +++ b/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h @@ -7,7 +7,7 @@ #include "CUDADataFormats/SiPixelDigisCUDA.h" #include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDACore/GPUSimpleVector.h" +#include "CUDACore/SimpleVector.h" #include "CUDACore/host_unique_ptr.h" #include "CUDACore/host_noncached_unique_ptr.h" #include "DataFormats/PixelErrors.h" @@ -167,7 +167,8 @@ namespace pixelgpudetails { SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; - void makeClustersAsync(const SiPixelFedCablingMapGPU* cablingMap, + void makeClustersAsync(bool isRun2, + const SiPixelFedCablingMapGPU* cablingMap, const unsigned char* modToUnp, const SiPixelGainForHLTonGPU* gains, const WordFedAppender& wordFed, diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuCalibPixel.h b/src/cuda/plugin-SiPixelClusterizer/gpuCalibPixel.h index 029b0912d..da36be6c4 100644 --- a/src/cuda/plugin-SiPixelClusterizer/gpuCalibPixel.h +++ b/src/cuda/plugin-SiPixelClusterizer/gpuCalibPixel.h @@ -13,12 +13,14 @@ namespace gpuCalibPixel { constexpr uint16_t InvId = 9999; // must be > MaxNumModules + // valid for run2 constexpr float VCaltoElectronGain = 47; // L2-4: 47 +- 4.7 constexpr float VCaltoElectronGain_L1 = 50; // L1: 49.6 +- 2.6 constexpr float VCaltoElectronOffset = -60; // L2-4: -60 +- 130 constexpr float VCaltoElectronOffset_L1 = -670; // L1: -670 +- 220 - __global__ void calibDigis(uint16_t* id, + __global__ void calibDigis(bool isRun2, + uint16_t* id, uint16_t const* __restrict__ x, uint16_t const* 
__restrict__ y, uint16_t* adc, @@ -41,8 +43,8 @@ namespace gpuCalibPixel { if (InvId == id[i]) continue; - float conversionFactor = id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain; - float offset = id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset; + float conversionFactor = (isRun2) ? (id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f; + float offset = (isRun2) ? (id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset) : 0; bool isDeadColumn = false, isNoisyColumn = false; diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h index ef21acc36..d0dd93044 100644 --- a/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h +++ b/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h @@ -89,7 +89,7 @@ namespace gpuClustering { // renumber __shared__ uint16_t ws[32]; - blockPrefixScan(newclusId, nclus, ws); + cms::cuda::blockPrefixScan(newclusId, nclus, ws); assert(nclus >= newclusId[nclus - 1]); diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h b/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h index cb3a0244f..84609bd10 100644 --- a/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h +++ b/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h @@ -80,7 +80,7 @@ namespace gpuClustering { //init hist (ymax=416 < 512 : 9bits) constexpr uint32_t maxPixInModule = 4000; constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; - using Hist = HistoContainer; + using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; __shared__ typename Hist::Counter ws[32]; for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { diff --git a/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu b/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu index 7b93508e5..4cd3fc152 100644 --- a/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu +++ b/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu @@ -63,9 +63,7 @@ namespace pixelgpudetails { } if (nHits) { - auto hws = cms::cuda::make_device_unique(TrackingRecHit2DSOAView::Hist::wsSize(), stream); - cms::cuda::fillManyFromVector( - hits_d.phiBinner(), hws.get(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); + cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); cudaCheck(cudaGetLastError()); } diff --git a/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h index f70cab66c..433d3b012 100644 --- a/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h +++ b/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h @@ -7,14 +7,14 @@ #include "CUDADataFormats/BeamSpotCUDA.h" #include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "DataFormats/approx_atan2.h" #include "CUDACore/cuda_assert.h" #include "CondFormats/pixelCPEforGPU.h" -#include "DataFormats/approx_atan2.h" namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, - BeamSpotCUDA::Data const* __restrict__ bs, + BeamSpotPOD const* __restrict__ bs, SiPixelDigisCUDA::DeviceConstView const* __restrict__ pdigis, int numElements, SiPixelClustersCUDA::DeviceConstView const* __restrict__ pclusters, @@ -134,6 +134,9 @@ namespace gpuPixelRecHits { __syncthreads(); + // pixmx is not available in the binary dumps + //auto pixmx = cpeParams->detParams(me).pixmx; + auto pixmx = std::numeric_limits::max(); for (int i = first; i < numElements; i += blockDim.x) { auto id = digis.moduleInd(i); if (id == InvId) @@ -148,7 +151,7 
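Note: the new `isRun2` flag threaded from `SiPixelRawToClusterCUDA` through `makeClustersAsync` ends in the selection above. A standalone approximation of what the flag controls follows, using the constants from gpuCalibPixel.h; this is only the gain/offset selection, the real kernel also applies the per-pixel gain and pedestal correction, and the function name here is hypothetical.

```cuda
// Standalone approximation of the Run-2 VCal -> electrons conversion; with
// isRun2 == false the digis are taken as already calibrated (gain 1, offset 0).
#include <cstdint>

float vcalToElectronsSketch(float vcal, uint16_t moduleId, bool isRun2) {
  constexpr float VCaltoElectronGain = 47;         // L2-4
  constexpr float VCaltoElectronGain_L1 = 50;      // L1
  constexpr float VCaltoElectronOffset = -60;      // L2-4
  constexpr float VCaltoElectronOffset_L1 = -670;  // L1
  float conversionFactor = isRun2 ? (moduleId < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f;
  float offset = isRun2 ? (moduleId < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset) : 0.f;
  return vcal * conversionFactor + offset;
}
```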
diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h
index ef21acc36..d0dd93044 100644
--- a/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h
+++ b/src/cuda/plugin-SiPixelClusterizer/gpuClusterChargeCut.h
@@ -89,7 +89,7 @@ namespace gpuClustering {
 
     // renumber
     __shared__ uint16_t ws[32];
-    blockPrefixScan(newclusId, nclus, ws);
+    cms::cuda::blockPrefixScan(newclusId, nclus, ws);
 
     assert(nclus >= newclusId[nclus - 1]);
diff --git a/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h b/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h
index cb3a0244f..84609bd10 100644
--- a/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h
+++ b/src/cuda/plugin-SiPixelClusterizer/gpuClustering.h
@@ -80,7 +80,7 @@ namespace gpuClustering {
     //init hist  (ymax=416 < 512 : 9bits)
     constexpr uint32_t maxPixInModule = 4000;
     constexpr auto nbins = phase1PixelTopology::numColsInModule + 2;  //2+2;
-    using Hist = HistoContainer<uint16_t, nbins, maxPixInModule, 9, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, 9, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter ws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
diff --git a/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu b/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu
index 7b93508e5..4cd3fc152 100644
--- a/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu
+++ b/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu
@@ -63,9 +63,7 @@ namespace pixelgpudetails {
     }
 
     if (nHits) {
-      auto hws = cms::cuda::make_device_unique<uint8_t[]>(TrackingRecHit2DSOAView::Hist::wsSize(), stream);
-      cms::cuda::fillManyFromVector(
-          hits_d.phiBinner(), hws.get(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream);
+      cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream);
       cudaCheck(cudaGetLastError());
     }
diff --git a/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h
index f70cab66c..433d3b012 100644
--- a/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h
+++ b/src/cuda/plugin-SiPixelRecHits/gpuPixelRecHits.h
@@ -7,14 +7,14 @@
 #include "CUDADataFormats/BeamSpotCUDA.h"
 #include "CUDADataFormats/TrackingRecHit2DCUDA.h"
+#include "DataFormats/approx_atan2.h"
 #include "CUDACore/cuda_assert.h"
 #include "CondFormats/pixelCPEforGPU.h"
-#include "DataFormats/approx_atan2.h"
 
 namespace gpuPixelRecHits {
 
   __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams,
-                          BeamSpotCUDA::Data const* __restrict__ bs,
+                          BeamSpotPOD const* __restrict__ bs,
                           SiPixelDigisCUDA::DeviceConstView const* __restrict__ pdigis,
                           int numElements,
                          SiPixelClustersCUDA::DeviceConstView const* __restrict__ pclusters,
@@ -134,6 +134,9 @@ namespace gpuPixelRecHits {
 
       __syncthreads();
 
+      // pixmx is not available in the binary dumps
+      //auto pixmx = cpeParams->detParams(me).pixmx;
+      auto pixmx = std::numeric_limits<uint16_t>::max();
       for (int i = first; i < numElements; i += blockDim.x) {
         auto id = digis.moduleInd(i);
         if (id == InvId)
@@ -148,7 +151,7 @@ namespace gpuPixelRecHits {
         assert(cl < MaxHitsInIter);
         auto x = digis.xx(i);
         auto y = digis.yy(i);
-        auto ch = digis.adc(i);
+        auto ch = std::min(digis.adc(i), pixmx);
         atomicAdd(&clusParams.charge[cl], ch);
         if (clusParams.minRow[cl] == x)
           atomicAdd(&clusParams.Q_f_X[cl], ch);
diff --git a/src/cuda/test/AtomicPairCounter_t.cu b/src/cuda/test/AtomicPairCounter_t.cu
index 39041b4b4..8e737e9cb 100644
--- a/src/cuda/test/AtomicPairCounter_t.cu
+++ b/src/cuda/test/AtomicPairCounter_t.cu
@@ -1,9 +1,12 @@
-#include "CUDACore/cudaCheck.h"
-#include "CUDACore/AtomicPairCounter.h"
+#include <iostream>
+
+#include
+#include "CUDACore/AtomicPairCounter.h"
+#include "CUDACore/cudaCheck.h"
 #include "CUDACore/cuda_assert.h"
 
-__global__ void update(AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) {
+__global__ void update(cms::cuda::AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i >= n)
     return;
@@ -17,12 +20,12 @@ __global__ void update(cms::cuda::AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uin
     cont[j] = i;
 };
 
-__global__ void finalize(AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) {
+__global__ void finalize(cms::cuda::AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) {
   assert(dc->get().m == n);
   ind[n] = dc->get().n;
 }
 
-__global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_t const *cont, uint32_t n) {
+__global__ void verify(cms::cuda::AtomicPairCounter const *dc, uint32_t const *ind, uint32_t const *cont, uint32_t n) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i >= n)
     return;
@@ -37,13 +40,12 @@ __global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_
   assert(cont[ib] == k);
 }
 
-#include <iostream>
 int main() {
-  AtomicPairCounter *dc_d;
-  cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter)));
-  cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
+  cms::cuda::AtomicPairCounter *dc_d;
+  cudaCheck(cudaMalloc(&dc_d, sizeof(cms::cuda::AtomicPairCounter)));
+  cudaCheck(cudaMemset(dc_d, 0, sizeof(cms::cuda::AtomicPairCounter)));
 
-  std::cout << "size " << sizeof(AtomicPairCounter) << std::endl;
+  std::cout << "size " << sizeof(cms::cuda::AtomicPairCounter) << std::endl;
 
   constexpr uint32_t N = 20000;
   constexpr uint32_t M = N * 6;
@@ -56,8 +58,8 @@ int main() {
   finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000);
   verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000);
 
-  AtomicPairCounter dc;
-  cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost));
+  cms::cuda::AtomicPairCounter dc;
+  cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(cms::cuda::AtomicPairCounter), cudaMemcpyDeviceToHost));
 
   std::cout << dc.get().n << ' ' << dc.get().m << std::endl;
diff --git a/src/cuda/test/HistoContainer_t.cu b/src/cuda/test/HistoContainer_t.cu
index 0cae01f56..15aafe0d3 100644
--- a/src/cuda/test/HistoContainer_t.cu
+++ b/src/cuda/test/HistoContainer_t.cu
@@ -4,9 +4,12 @@
 #include
 #include
 
-#include "CUDACore/device_unique_ptr.h"
-#include "CUDACore/cudaCheck.h"
 #include "CUDACore/HistoContainer.h"
+#include "CUDACore/cudaCheck.h"
+#include "CUDACore/device_unique_ptr.h"
+#include "CUDACore/requireDevices.h"
+
+using namespace cms::cuda;
 
 template <typename T>
 void go() {
@@ -15,7 +18,7 @@ void go() {
   constexpr int N = 12000;
   T v[N];
 
-  auto v_d = cms::cuda::make_device_unique<T[]>(N, nullptr);
+  auto v_d = make_device_unique<T[]>(N, nullptr);
 
   cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
@@ -25,14 +28,13 @@ void go() {
   using Hist = HistoContainer;
   std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' '
-            << Hist::capacity() << ' ' << Hist::wsSize() << ' '
+            << Hist::capacity() << ' '
             << offsetof(Hist, bins) - offsetof(Hist, off) << ' '
             << (std::numeric_limits<T>::max() - std::numeric_limits<T>::min()) / Hist::nbins() << std::endl;
 
   Hist h;
-  auto h_d = cms::cuda::make_device_unique<Hist[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique<uint8_t[]>(Hist::wsSize(), nullptr);
+  auto h_d = make_device_unique<Hist[]>(1, nullptr);
 
-  auto off_d = cms::cuda::make_device_unique<uint32_t[]>(nParts + 1, nullptr);
+  auto off_d = make_device_unique<uint32_t[]>(nParts + 1, nullptr);
 
   for (int it = 0; it < 5; ++it) {
     offsets[0] = 0;
@@ -67,7 +69,7 @@ void go() {
 
     cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
-    cms::cuda::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
+    fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
     cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost));
     assert(0 == h.off[0]);
     assert(offsets[10] == h.size());
@@ -144,6 +146,8 @@ void go() {
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   go();
   go();
diff --git a/src/cuda/test/HistoContainer_t_cpu.cc b/src/cuda/test/HistoContainer_t_cpu.cc
index 475a5484f..ad1121ef1 100644
--- a/src/cuda/test/HistoContainer_t_cpu.cc
+++ b/src/cuda/test/HistoContainer_t_cpu.cc
@@ -5,6 +5,9 @@
 #include
 
 #include "CUDACore/HistoContainer.h"
+#include "CUDACore/requireDevices.h"
+
+using namespace cms::cuda;
 
 template <typename T>
 void go() {
diff --git a/src/cuda/test/OneHistoContainer_t.cu b/src/cuda/test/OneHistoContainer_t.cu
index a33b9d032..960f77eca 100644
--- a/src/cuda/test/OneHistoContainer_t.cu
+++ b/src/cuda/test/OneHistoContainer_t.cu
@@ -4,10 +4,13 @@
 #include
 #include
 
-#include "CUDACore/device_unique_ptr.h"
-#include "CUDACore/cudaCheck.h"
 #include "CUDACore/HistoContainer.h"
+#include "CUDACore/cudaCheck.h"
+#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
+#include "CUDACore/requireDevices.h"
+
+using namespace cms::cuda;
 
 template <typename T, int NBINS, int S, int DELTA>
 __global__ void mykernel(T const* __restrict__ v, uint32_t N) {
@@ -105,7 +108,7 @@ void go() {
   constexpr int N = 12000;
   T v[N];
 
-  auto v_d = cms::cuda::make_device_unique<T[]>(N, nullptr);
+  auto v_d = make_device_unique<T[]>(N, nullptr);
   assert(v_d.get());
 
   using Hist = HistoContainer;
@@ -124,11 +127,13 @@ void go() {
     assert(v);
     cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
     assert(v_d.get());
-    cms::cuda::launch(mykernel<T, NBINS, S, DELTA>, {1, 256}, v_d.get(), N);
+    launch(mykernel<T, NBINS, S, DELTA>, {1, 256}, v_d.get(), N);
   }
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   go();
   go();
   go();
diff --git a/src/cuda/test/OneToManyAssoc_t.h b/src/cuda/test/OneToManyAssoc_t.h
index b7506aa91..69c3ade3d 100644
--- a/src/cuda/test/OneToManyAssoc_t.h
+++ b/src/cuda/test/OneToManyAssoc_t.h
@@ -9,20 +9,20 @@
 #ifdef __CUDACC__
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/currentDevice.h"
 #endif
 
 #include "CUDACore/HistoContainer.h"
 
+using cms::cuda::AtomicPairCounter;
 
 constexpr uint32_t MaxElem = 64000;
 constexpr uint32_t MaxTk = 8000;
 constexpr uint32_t MaxAssocs = 4 * MaxTk;
 
-using Assoc = OneToManyAssoc<uint16_t, MaxElem, MaxAssocs>;
-
-using SmallAssoc = OneToManyAssoc<uint16_t, 128, MaxAssocs>;
-
-using Multiplicity = OneToManyAssoc<uint16_t, 8, MaxTk>;
+using Assoc = cms::cuda::OneToManyAssoc<uint16_t, MaxElem, MaxAssocs>;
+using SmallAssoc = cms::cuda::OneToManyAssoc<uint16_t, 128, MaxAssocs>;
+using Multiplicity = cms::cuda::OneToManyAssoc<uint16_t, 8, MaxTk>;
 
 using TK = std::array<uint16_t, 4>;
 
 __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) {
@@ -97,6 +97,7 @@ __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter co
 
 int main() {
 #ifdef __CUDACC__
+  cms::cudatest::requireDevices();
   auto current_device = cms::cuda::currentDevice();
 #else
   // make sure cuda emulation is working
@@ -117,9 +118,9 @@ int main() {
   assert(gridDim.z == 1);
 #endif
 
-  std::cout << "OneToManyAssoc " << Assoc::nbins() << ' ' << Assoc::capacity() << ' ' << Assoc::wsSize() << std::endl;
-  std::cout << "OneToManyAssoc (small) " << SmallAssoc::nbins() << ' ' << SmallAssoc::capacity() << ' '
-            << SmallAssoc::wsSize() << std::endl;
+  std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl;
+  std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' '
+            << SmallAssoc::capacity() << std::endl;
 
   std::mt19937 eng;
@@ -168,8 +169,6 @@ int main() {
   assert(v_d.get());
   auto a_d = cms::cuda::make_device_unique<Assoc[]>(1, nullptr);
   auto sa_d = cms::cuda::make_device_unique<SmallAssoc[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique<uint8_t[]>(Assoc::wsSize(), nullptr);
-
   cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
   auto a_d = std::make_unique<Assoc>();
@@ -177,7 +176,7 @@ int main() {
   auto v_d = tr.data();
 #endif
 
-  cms::cuda::launchZero(a_d.get(), 0);
+  launchZero(a_d.get(), 0);
 
 #ifdef __CUDACC__
   auto nThreads = 256;
@@ -185,12 +184,12 @@ int main() {
 
   count<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
 
-  cms::cuda::launchFinalize(a_d.get(), ws_d.get(), 0);
+  launchFinalize(a_d.get(), 0);
   verify<<<1, 1>>>(a_d.get());
   fill<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
 #else
   count(v_d, a_d.get(), N);
-  cms::cuda::launchFinalize(a_d.get());
+  launchFinalize(a_d.get());
   verify(a_d.get());
   fill(v_d, a_d.get(), N);
 #endif
@@ -228,7 +227,7 @@ int main() {
   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   nBlocks = (N + nThreads - 1) / nThreads;
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), a_d.get(), N);
-  cms::cuda::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
+  finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
   verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
 
   cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost));
@@ -236,19 +235,19 @@ int main() {
   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), sa_d.get(), N);
-  cms::cuda::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
+  finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
   verifyBulk<<<1, 1>>>(sa_d.get(), dc_d);
 
 #else
   dc_d = &dc;
   fillBulk(dc_d, v_d, a_d.get(), N);
-  cms::cuda::finalizeBulk(dc_d, a_d.get());
+  finalizeBulk(dc_d, a_d.get());
   verifyBulk(a_d.get(), dc_d);
 
   memcpy(&la, a_d.get(), sizeof(Assoc));
 
   AtomicPairCounter sdc(0);
   fillBulk(&sdc, v_d, sa_d.get(), N);
-  cms::cuda::finalizeBulk(&sdc, sa_d.get());
+  finalizeBulk(&sdc, sa_d.get());
   verifyBulk(sa_d.get(), &sdc);
 #endif
@@ -277,8 +276,8 @@ int main() {
   auto m1_d = std::make_unique<Multiplicity>();
   auto m2_d = std::make_unique<Multiplicity>();
 #endif
-  cms::cuda::launchZero(m1_d.get(), 0);
-  cms::cuda::launchZero(m2_d.get(), 0);
+  launchZero(m1_d.get(), 0);
+  launchZero(m2_d.get(), 0);
 
 #ifdef __CUDACC__
   nBlocks = (4 * N + nThreads - 1) / nThreads;
   countMulti<<<nBlocks, nThreads>>>(v_d.get(), m1_d.get(), N);
   countMultiLocal<<<nBlocks, nThreads>>>(v_d.get(), m2_d.get(), N);
   verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
 
-  cms::cuda::launchFinalize(m1_d.get(), ws_d.get(), 0);
-  cms::cuda::launchFinalize(m2_d.get(), ws_d.get(), 0);
+  launchFinalize(m1_d.get(), 0);
+  launchFinalize(m2_d.get(), 0);
   verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
 
   cudaCheck(cudaGetLastError());
@@ -297,8 +296,8 @@ int main() {
   countMultiLocal(v_d, m2_d.get(), N);
   verifyMulti(m1_d.get(), m2_d.get());
 
-  cms::cuda::launchFinalize(m1_d.get());
-  cms::cuda::launchFinalize(m2_d.get());
+  launchFinalize(m1_d.get());
+  launchFinalize(m2_d.get());
   verifyMulti(m1_d.get(), m2_d.get());
 #endif
   return 0;
 }
diff --git a/src/cuda/test/TrajectoryStateSOA_t.h b/src/cuda/test/TrajectoryStateSOA_t.h
index c0892fe1e..2fcf9fc09 100644
--- a/src/cuda/test/TrajectoryStateSOA_t.h
+++ b/src/cuda/test/TrajectoryStateSOA_t.h
@@ -51,10 +51,15 @@ __global__ void testTSSoA(TS* pts, int n) {
 }
 
 #ifdef __CUDACC__
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/cudaCheck.h"
 #endif
 
 int main() {
+#ifdef __CUDACC__
+  cms::cudatest::requireDevices();
+#endif
+
   TS ts;
 
 #ifdef __CUDACC__
diff --git a/src/cuda/test/VertexFinder_t.h b/src/cuda/test/VertexFinder_t.h
index 1f944aa71..53f26d2de 100644
--- a/src/cuda/test/VertexFinder_t.h
+++ b/src/cuda/test/VertexFinder_t.h
@@ -5,16 +5,17 @@
 #include <vector>
 
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/launch.h"
 #ifdef USE_DBSCAN
 #include "plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h"
-#define CLUSTERIZE clusterTracksDBSCAN
+#define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
 #elif USE_ITERATIVE
 #include "plugin-PixelVertexFinding/gpuClusterTracksIterative.h"
-#define CLUSTERIZE clusterTracksIterative
+#define CLUSTERIZE gpuVertexFinder::clusterTracksIterative
 #else
 #include "plugin-PixelVertexFinding/gpuClusterTracksByDensity.h"
-#define CLUSTERIZE clusterTracksByDensityKernel
+#define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel
 #endif
 #include "plugin-PixelVertexFinding/gpuFitVertices.h"
 #include "plugin-PixelVertexFinding/gpuSortByPt2.h"
@@ -42,8 +43,6 @@ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
 #endif
 #endif
 
-using namespace gpuVertexFinder;
-
 struct Event {
   std::vector<float> zvert;
   std::vector<uint16_t> itrack;
@@ -102,10 +101,10 @@ struct ClusterGenerator {
 };
 
 // a macro SORRY
-#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(ZVertices, M))
-#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(WorkSpace, M))
+#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
+#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
 
-__global__ void print(ZVertices const* pdata, WorkSpace const* pws) {
+__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
   auto const& __restrict__ data = *pdata;
   auto const& __restrict__ ws = *pws;
   printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
@@ -113,11 +112,13 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) {
 
 int main() {
 #ifdef __CUDACC__
-  auto onGPU_d = cms::cuda::make_device_unique<ZVertices[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique<WorkSpace[]>(1, nullptr);
+  cms::cudatest::requireDevices();
+
+  auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
+  auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
 #else
-  auto onGPU_d = std::make_unique<ZVertices>();
-  auto ws_d = std::make_unique<WorkSpace>();
+  auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
+  auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
 #endif
 
   Event ev;
@@ -180,7 +181,7 @@ int main() {
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
-      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
 
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
@@ -242,7 +243,7 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
@@ -262,7 +263,7 @@ int main() {
 
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cms::cuda::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
      cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
      gridDim.x = 1;
@@ -274,10 +275,10 @@ int main() {
      std::cout << "after split " << nv << std::endl;
 
 #ifdef __CUDACC__
-      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
      cudaCheck(cudaGetLastError());
 
-      cms::cuda::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
      cudaCheck(cudaGetLastError());
      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
diff --git a/src/cuda/test/cudastdAlgorithm_t.cu b/src/cuda/test/cudastdAlgorithm_t.cu
index b638624d1..4ecc93cf5 100644
--- a/src/cuda/test/cudastdAlgorithm_t.cu
+++ b/src/cuda/test/cudastdAlgorithm_t.cu
@@ -2,6 +2,7 @@
 #include
 
 #include "CUDACore/cudastdAlgorithm.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/launch.h"
 
 __global__ void testBinaryFind() {
@@ -22,4 +23,8 @@ __global__ void testBinaryFind() {
 
 void wrapper() { cms::cuda::launch(testBinaryFind, {32, 64}); }
 
-int main() { wrapper(); }
+int main() {
+  cms::cudatest::requireDevices();
+
+  wrapper();
+}
diff --git a/src/cuda/test/eigenSoA_t.h b/src/cuda/test/eigenSoA_t.h
index afc17dd5c..5409bce10 100644
--- a/src/cuda/test/eigenSoA_t.h
+++ b/src/cuda/test/eigenSoA_t.h
@@ -22,7 +22,7 @@ __global__ void testBasicSoA(float* p) {
   assert(!isPowerOf2(1026));
 
   using M3 = Eigen::Matrix<float, 3, 3>;
-  ;
+
   __shared__ eigenSoA::MatrixSoA<M3, 64> m;
 
   int first = threadIdx.x + blockIdx.x * blockDim.x;
@@ -59,10 +59,15 @@ __global__ void testBasicSoA(float* p) {
 #include <random>
 
 #ifdef __CUDACC__
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/cudaCheck.h"
 #endif
 
 int main() {
+#ifdef __CUDACC__
+  cms::cudatest::requireDevices();
+#endif
+
   float p[1024];
 
   std::uniform_real_distribution<float> rgen(0.01, 0.99);
diff --git a/src/cuda/test/gpuClustering_t.h b/src/cuda/test/gpuClustering_t.h
index 66b8adaa8..5388e3499 100644
--- a/src/cuda/test/gpuClustering_t.h
+++ b/src/cuda/test/gpuClustering_t.h
@@ -13,6 +13,7 @@
 #ifdef __CUDACC__
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/launch.h"
 #endif
 
@@ -21,6 +22,10 @@
 #include "plugin-SiPixelClusterizer/gpuClusterChargeCut.h"
 
 int main(void) {
+#ifdef __CUDACC__
+  cms::cudatest::requireDevices();
+#endif
+
   using namespace gpuClustering;
 
   int numElements = 256 * 2000;
diff --git a/src/cuda/test/prefixScan_t.cu b/src/cuda/test/prefixScan_t.cu
index de1624db8..307c989b0 100644
--- a/src/cuda/test/prefixScan_t.cu
+++ b/src/cuda/test/prefixScan_t.cu
@@ -1,9 +1,22 @@
 #include <iostream>
 
-#include <cub/cub.cuh>
-
 #include "CUDACore/cudaCheck.h"
 #include "CUDACore/prefixScan.h"
+#include "CUDACore/requireDevices.h"
+
+using namespace cms::cuda;
+
+template <typename T>
+struct format_traits {
+public:
+  static const constexpr char *failed_msg = "failed %d %d %d: %d %d\n";
+};
+
+template <>
+struct format_traits<float> {
+public:
+  static const constexpr char *failed_msg = "failed %d %d %d: %f %f\n";
+};
 
 template <typename T>
 __global__ void testPrefixScan(uint32_t size) {
@@ -23,7 +36,7 @@ __global__ void testPrefixScan(uint32_t size) {
   assert(1 == co[0]);
   for (auto i = first + 1; i < size; i += blockDim.x) {
     if (c[i] != c[i - 1] + 1)
-      printf("failed %d %d %d: %d %d\n", size, i, blockDim.x, c[i], c[i - 1]);
+      printf(format_traits<T>::failed_msg, size, i, blockDim.x, c[i], c[i - 1]);
     assert(c[i] == c[i - 1] + 1);
     assert(c[i] == i + 1);
     assert(c[i] = co[i]);
@@ -47,7 +60,7 @@ __global__ void testWarpPrefixScan(uint32_t size) {
   assert(1 == co[0]);
   if (i != 0) {
     if (c[i] != c[i - 1] + 1)
-      printf("failed %d %d %d: %d %d\n", size, i, blockDim.x, c[i], c[i - 1]);
+      printf(format_traits<T>::failed_msg, size, i, blockDim.x, c[i], c[i - 1]);
     assert(c[i] == c[i - 1] + 1);
     assert(c[i] == i + 1);
     assert(c[i] = co[i]);
@@ -71,6 +84,8 @@ __global__ void verify(uint32_t const *v, uint32_t n) {
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   std::cout << "warp level" << std::endl;
   // std::cout << "warp 32" << std::endl;
   testWarpPrefixScan<<<1, 32>>>(32);
@@ -117,34 +132,17 @@ int main() {
     // the block counter
     int32_t *d_pc;
     cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t)));
-    cudaCheck(cudaMemset(d_pc, 0, 4));
+    cudaCheck(cudaMemset(d_pc, 0, sizeof(int32_t)));
 
     nthreads = 1024;
     nblocks = (num_items + nthreads - 1) / nthreads;
-    multiBlockPrefixScan<<<nblocks, nthreads>>>(d_in, d_out1, num_items, d_pc);
+    std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nblocks << std::endl;
+    multiBlockPrefixScan<<<nblocks, nthreads, 4 * nblocks>>>(d_in, d_out1, num_items, d_pc);
+    cudaCheck(cudaGetLastError());
     verify<<<nblocks, nthreads>>>(d_out1, num_items);
+    cudaCheck(cudaGetLastError());
     cudaDeviceSynchronize();
 
-    // test cub
-    std::cout << "cub" << std::endl;
-    // Determine temporary device storage requirements for inclusive prefix sum
-    void *d_temp_storage = nullptr;
-    size_t temp_storage_bytes = 0;
-    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items);
-
-    std::cout << "temp storage " << temp_storage_bytes << std::endl;
-
-    // Allocate temporary storage for inclusive prefix sum
-    // fake larger ws already available
-    temp_storage_bytes *= 8;
-    cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-    std::cout << "temp storage " << temp_storage_bytes << std::endl;
-    // Run inclusive prefix sum
-    CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items));
-    std::cout << "temp storage " << temp_storage_bytes << std::endl;
-
-    verify<<<nblocks, nthreads>>>(d_out2, num_items);
-    cudaDeviceSynchronize();
   }  // ksize
   return 0;
 }
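Note: prefixScan_t.cu now drives `cms::cuda::multiBlockPrefixScan` directly instead of cross-checking against CUB's `DeviceScan`, consistent with the CUB dependency being dropped. For orientation, the single-block building block is a shared-memory inclusive scan; a textbook Hillis-Steele variant is sketched below for illustration (this is not the warp-shuffle implementation in `CUDACore/prefixScan.h`, and the kernel name is invented).

```cuda
// Textbook one-block inclusive scan, for illustration only. Assumes
// blockDim.x >= n, n small enough for shared memory; launch with
// inclusiveScanToy<<<1, n, n * sizeof(uint32_t)>>>(in, out, n).
#include <cstdint>

__global__ void inclusiveScanToy(uint32_t const* in, uint32_t* out, uint32_t n) {
  extern __shared__ uint32_t buf[];
  auto i = threadIdx.x;
  if (i < n)
    buf[i] = in[i];
  __syncthreads();
  for (uint32_t d = 1; d < n; d *= 2) {
    // read the partner value before anyone overwrites it, then add
    uint32_t v = (i >= d && i < n) ? buf[i - d] : 0;
    __syncthreads();
    if (i >= d && i < n)
      buf[i] += v;
    __syncthreads();
  }
  if (i < n)
    out[i] = buf[i];
}
```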
diff --git a/src/cuda/test/radixSort_t.cu b/src/cuda/test/radixSort_t.cu
index 72134a603..e1b9bca4c 100644
--- a/src/cuda/test/radixSort_t.cu
+++ b/src/cuda/test/radixSort_t.cu
@@ -10,9 +10,12 @@
 
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "CUDACore/launch.h"
 #include "CUDACore/radixSort.h"
 
+using namespace cms::cuda;
+
 template <typename T>
 struct RS {
   using type = std::uniform_int_distribution<T>;
@@ -161,6 +164,8 @@ void go(bool useShared) {
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   bool useShared = false;
 
   std::cout << "using Global memory" << std::endl;
diff --git a/src/cuda/test/testEigenGPU.cu b/src/cuda/test/testEigenGPU.cu
index 3ecd7d36b..9cbcc4c57 100644
--- a/src/cuda/test/testEigenGPU.cu
+++ b/src/cuda/test/testEigenGPU.cu
@@ -4,6 +4,7 @@
 #include
 
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 
 #ifdef USE_BL
 #include "plugin-PixelTriplets/BrokenLine.h"
@@ -328,6 +329,8 @@ void testFit() {
 }
 
 int main(int argc, char* argv[]) {
+  cms::cudatest::requireDevices();
+
   testFit<4>();
   testFit<3>();
   testFit<5>();
diff --git a/src/cuda/test/testEigenGPUNoFit.cu b/src/cuda/test/testEigenGPUNoFit.cu
index 734b0fb67..8e0b28f0f 100644
--- a/src/cuda/test/testEigenGPUNoFit.cu
+++ b/src/cuda/test/testEigenGPUNoFit.cu
@@ -4,6 +4,7 @@
 #include
 
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 #include "test_common.h"
 
 using namespace Eigen;
@@ -214,6 +215,8 @@ void testEigenvalues() {
 }
 
 int main(int argc, char *argv[]) {
+  cms::cudatest::requireDevices();
+
   testEigenvalues();
   testInverse3x3();
   testInverse4x4();
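Note: the one change repeated across every test above is the `cms::cudatest::requireDevices()` call at the top of `main`, so the binaries skip gracefully on machines without a GPU instead of failing mid-run. A sketch of what such a guard can look like follows; this is an assumed shape for illustration, the actual helper lives in `CUDACore/requireDevices.h`.

```cuda
// Assumed sketch of a requireDevices()-style guard: exit cleanly (success, not
// failure) when no CUDA device is available, keeping CPU-only CI green.
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

void requireDevicesSketch() {
  int devices = 0;
  auto status = cudaGetDeviceCount(&devices);
  if (status != cudaSuccess || devices == 0) {
    std::cerr << "No CUDA devices available, skipping test" << std::endl;
    std::exit(EXIT_SUCCESS);
  }
}
```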
diff --git a/src/cuda/test/test_GPUSimpleVector.cu b/src/cuda/test/test_GPUSimpleVector.cu
index da8719147..92974d8a2 100644
--- a/src/cuda/test/test_GPUSimpleVector.cu
+++ b/src/cuda/test/test_GPUSimpleVector.cu
@@ -6,43 +6,46 @@
 #include
 #include
 
-#include "CUDACore/GPUSimpleVector.h"
+#include "CUDACore/SimpleVector.h"
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/requireDevices.h"
 
-__global__ void vector_pushback(GPU::SimpleVector<int> *foo) {
+__global__ void vector_pushback(cms::cuda::SimpleVector<int> *foo) {
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
   foo->push_back(index);
 }
 
-__global__ void vector_reset(GPU::SimpleVector<int> *foo) { foo->reset(); }
+__global__ void vector_reset(cms::cuda::SimpleVector<int> *foo) { foo->reset(); }
 
-__global__ void vector_emplace_back(GPU::SimpleVector<int> *foo) {
+__global__ void vector_emplace_back(cms::cuda::SimpleVector<int> *foo) {
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
   foo->emplace_back(index);
 }
 
 int main() {
+  cms::cudatest::requireDevices();
+
   auto maxN = 10000;
-  GPU::SimpleVector<int> *obj_ptr = nullptr;
-  GPU::SimpleVector<int> *d_obj_ptr = nullptr;
-  GPU::SimpleVector<int> *tmp_obj_ptr = nullptr;
+  cms::cuda::SimpleVector<int> *obj_ptr = nullptr;
+  cms::cuda::SimpleVector<int> *d_obj_ptr = nullptr;
+  cms::cuda::SimpleVector<int> *tmp_obj_ptr = nullptr;
   int *data_ptr = nullptr;
   int *d_data_ptr = nullptr;
 
-  cudaCheck(cudaMallocHost(&obj_ptr, sizeof(GPU::SimpleVector<int>)));
+  cudaCheck(cudaMallocHost(&obj_ptr, sizeof(cms::cuda::SimpleVector<int>)));
   cudaCheck(cudaMallocHost(&data_ptr, maxN * sizeof(int)));
   cudaCheck(cudaMalloc(&d_data_ptr, maxN * sizeof(int)));
 
-  auto v = GPU::make_SimpleVector(obj_ptr, maxN, data_ptr);
+  auto v = cms::cuda::make_SimpleVector(obj_ptr, maxN, data_ptr);
 
-  cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(GPU::SimpleVector<int>)));
-  GPU::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr);
+  cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(cms::cuda::SimpleVector<int>)));
+  cms::cuda::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr);
   assert(tmp_obj_ptr->size() == 0);
   assert(tmp_obj_ptr->capacity() == static_cast<int>(maxN));
 
-  cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(GPU::SimpleVector<int>)));
+  cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(cms::cuda::SimpleVector<int>)));
   // ... and copy the object to the device.
-  cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(GPU::SimpleVector<int>), cudaMemcpyDefault));
+  cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(cms::cuda::SimpleVector<int>), cudaMemcpyDefault));
 
   int numBlocks = 5;
   int numThreadsPerBlock = 256;
@@ -50,14 +53,14 @@ int main() {
   cudaCheck(cudaGetLastError());
   cudaCheck(cudaDeviceSynchronize());
 
-  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector<int>), cudaMemcpyDefault));
+  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector<int>), cudaMemcpyDefault));
 
   assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN));
 
   vector_reset<<<numBlocks, numThreadsPerBlock>>>(d_obj_ptr);
   cudaCheck(cudaGetLastError());
   cudaCheck(cudaDeviceSynchronize());
 
-  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector<int>), cudaMemcpyDefault));
+  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector<int>), cudaMemcpyDefault));
 
   assert(obj_ptr->size() == 0);
 
@@ -65,7 +68,7 @@ int main() {
   vector_emplace_back<<<numBlocks, numThreadsPerBlock>>>(d_obj_ptr);
   cudaCheck(cudaGetLastError());
   cudaCheck(cudaDeviceSynchronize());
 
-  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector<int>), cudaMemcpyDefault));
+  cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector<int>), cudaMemcpyDefault));
 
   assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN));