From 98ee4f45bfbd23c932619661bba7650210401ccd Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 30 Jul 2018 14:27:31 +0200 Subject: [PATCH] Move hitsModuleStart from RecHit to Raw2Cluster to avoid a cudaStreamSynchronize() We need the number of hits/clusters in RecHit acquire(), but fortunately we can do the necessary calculation already in Raw2Cluster. --- .../SiPixelClusterizer/plugins/BuildFile.xml | 1 + .../plugins/SiPixelRawToClusterGPUKernel.cu | 28 +++++++++++ .../plugins/SiPixelRawToClusterGPUKernel.h | 9 +++- .../siPixelRawToClusterHeterogeneousProduct.h | 4 ++ .../SiPixelRecHits/plugins/BuildFile.xml | 1 - .../SiPixelRecHits/plugins/PixelRecHits.cu | 50 ++++++------------- .../SiPixelRecHits/plugins/PixelRecHits.h | 5 +- .../plugins/SiPixelRecHitHeterogeneous.cc | 2 +- .../siPixelRecHitsHeterogeneousProduct.h | 2 +- 9 files changed, 60 insertions(+), 42 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml index 1dc69b4dd7b73..9db4a46f367b3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml @@ -12,6 +12,7 @@ + diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 13daf90da28cc..839cb6c6202a3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -27,6 +27,9 @@ #include #include +// cub includes +#include + // CMSSW includes #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" @@ -96,6 +99,12 @@ namespace pixelgpudetails { assert(xx_d==gpuProduct.xx_d); cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id())); + + // originally from rechits + cudaCheck(cudaMalloc((void**) & clusModuleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); + uint32_t *tmp = nullptr; + cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); + cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize)); } SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { @@ -118,6 +127,10 @@ namespace pixelgpudetails { cudaCheck(cudaFree(clusInModule_d)); cudaCheck(cudaFree(moduleId_d)); cudaCheck(cudaFree(gpuProduct_d)); + + // originally from rechits + cudaCheck(cudaFree(tempScanStorage_d)); + cudaCheck(cudaFree(clusModuleStart_d)); } void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { @@ -685,6 +698,21 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); + // count the module start indices already here (instead of + // rechits) so that the number of clusters/hits can be made + // available in the rechit producer without additional points of + // synchronization/ExternalWork + // + // Set first the first element to 0 + cudaCheck(cudaMemsetAsync(clusModuleStart_d, 0, sizeof(uint32_t), stream.id())); + // Then use inclusive_scan to get the partial sum to the rest + cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize, + clusInModule_d, &clusModuleStart_d[1], gpuClustering::MaxNumModules, + stream.id())); + // last element holds the number of all clusters + cudaCheck(cudaMemcpyAsync(&nClusters, clusModuleStart_d+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + + // clusters cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); } // end clusterizer scope diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 00e37d82959db..2f8a51901eaab 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -182,7 +182,8 @@ namespace pixelgpudetails { pdigi_h, rawIdArr_h, clus_h, adc_h, error_h, gpuProduct_d, xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d, - nDigis, nModulesActive + clusModuleStart_d, + nDigis, nModulesActive, nClusters }; } @@ -205,6 +206,7 @@ namespace pixelgpudetails { uint32_t nDigis = 0; uint32_t nModulesActive = 0; + uint32_t nClusters = 0; // scratch memory buffers uint32_t * word_d; @@ -224,6 +226,11 @@ namespace pixelgpudetails { int32_t * clus_d; uint32_t * clusInModule_d; uint32_t * moduleId_d; + + // originally in rechit, moved here + uint32_t *clusModuleStart_d = nullptr; + void *tempScanStorage_d = nullptr; + size_t tempScanStorageSize = 0; }; // configuration and memory buffers alocated on the GPU diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h index ac4cba5e356ad..a37f780b52e5e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h @@ -57,8 +57,12 @@ namespace siPixelRawToClusterHeterogeneousProduct { uint32_t const * clusInModule_d; uint32_t const * moduleId_d; + // originally from rechits + uint32_t const * clusModuleStart_d; + uint32_t nDigis; uint32_t nModules; + uint32_t nClusters; }; using HeterogeneousDigiCluster = HeterogeneousProductImpl, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml index 069f4ca05ab8d..a8af0c8a7c4f9 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml @@ -8,6 +8,5 @@ - diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 0cda5733d125f..bb1a7cc2707bd 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -5,9 +5,6 @@ // CUDA runtime #include -// headers -#include - // CMSSW headers #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -33,7 +30,6 @@ namespace pixelgpudetails { PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { cudaCheck(cudaMalloc((void**) & gpu_.bs_d,3*sizeof(float))); - cudaCheck(cudaMalloc((void**) & gpu_.hitsModuleStart_d,(gpuClustering::MaxNumModules+1)*sizeof(uint32_t))); cudaCheck(cudaMalloc((void**) & gpu_.hitsLayerStart_d,(11)*sizeof(uint32_t))); cudaCheck(cudaMalloc((void**) & gpu_.charge_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); cudaCheck(cudaMalloc((void**) & gpu_.detInd_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); @@ -54,10 +50,6 @@ namespace pixelgpudetails { gpu_.me_d = gpu_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id())); - uint32_t *tmp = nullptr; - cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize_, tmp, tmp, gpuClustering::MaxNumModules)); - cudaCheck(cudaMalloc(&tempScanStorage_, tempScanStorageSize_)); - // Feels a bit dumb but constexpr arrays are not supported for device code // TODO: should be moved to EventSetup (or better ideas?) // Would it be better to use "constant memory"? @@ -66,7 +58,6 @@ namespace pixelgpudetails { } PixelRecHitGPUKernel::~PixelRecHitGPUKernel() { - cudaCheck(cudaFree(gpu_.hitsModuleStart_d)); cudaCheck(cudaFree(gpu_.charge_d)); cudaCheck(cudaFree(gpu_.detInd_d)); cudaCheck(cudaFree(gpu_.xg_d)); @@ -84,8 +75,6 @@ namespace pixelgpudetails { cudaCheck(cudaFree(gpu_.hist_d)); cudaCheck(cudaFree(gpu_d)); - cudaCheck(cudaFree(tempScanStorage_)); - cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); } @@ -93,15 +82,9 @@ namespace pixelgpudetails { float const * bs, pixelCPEforGPU::ParamsOnGPU const * cpeParams, cuda::stream_t<>& stream) { - cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id())); - - // Set first the first element to 0 - cudaCheck(cudaMemsetAsync(gpu_.hitsModuleStart_d, 0, sizeof(uint32_t), stream.id())); - // Then use inclusive_scan to get the partial sum to the rest - cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_, tempScanStorageSize_, - input.clusInModule_d, &gpu_.hitsModuleStart_d[1], gpuClustering::MaxNumModules, - stream.id())); + gpu_.hitsModuleStart_d = input.clusModuleStart_d; + cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); int threadsPerBlock = 256; int blocks = input.nModules; // active modules (with digis) @@ -130,6 +113,15 @@ namespace pixelgpudetails { // needed only if hits on CPU are required... cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(hitsLayerStart_, gpu_.hitsLayerStart_d, 11*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + auto nhits = input.nClusters; + cpu_ = std::make_unique(nhits); + cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(cpu_->xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(cpu_->ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(cpu_->mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(cpu_->mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); #ifdef GPU_DEBUG cudaStreamSynchronize(stream.id()); @@ -147,21 +139,9 @@ namespace pixelgpudetails { cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id()); } - HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const { - // needed only if hits on CPU are required... - auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules]; - - HitsOnCPU hoc(nhits); - hoc.gpu_d = gpu_d; - memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)); - cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaStreamSynchronize(stream.id())); - return hoc; + std::unique_ptr&& PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) { + cpu_->gpu_d = gpu_d; + memcpy(cpu_->hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)); + return std::move(cpu_); } } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index 7198e2875b298..21549936a4fc6 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -36,13 +36,12 @@ namespace pixelgpudetails { pixelCPEforGPU::ParamsOnGPU const * cpeParams, cuda::stream_t<>& stream); - HitsOnCPU getOutput(cuda::stream_t<>& stream) const; + std::unique_ptr&& getOutput(cuda::stream_t<>& stream); private: HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product" HitsOnGPU gpu_; - void *tempScanStorage_ = nullptr; - size_t tempScanStorageSize_ = 0; + std::unique_ptr cpu_; uint32_t *d_phase1TopologyLayerStart_ = nullptr; uint32_t hitsModuleStart_[gpuClustering::MaxNumModules+1]; uint32_t hitsLayerStart_[11]; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index bfd839295bc8f..05b6846b9aacb 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -179,7 +179,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i } void SiPixelRecHitHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) { - auto output = std::make_unique(gpuAlgo_->getOutput(cudaStream)); + auto output = gpuAlgo_->getOutput(cudaStream); // Need the CPU clusters to // - properly fill the output DetSetVector of hits diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 87a3cab0e5e98..498dc3d04b925 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -16,7 +16,7 @@ namespace siPixelRecHitsHeterogeneousProduct { struct HitsOnGPU{ float * bs_d; - uint32_t * hitsModuleStart_d; + const uint32_t * hitsModuleStart_d; // forwarded from clusters uint32_t * hitsLayerStart_d; int32_t * charge_d; uint16_t * detInd_d;