From 98ee4f45bfbd23c932619661bba7650210401ccd Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Mon, 30 Jul 2018 14:27:31 +0200
Subject: [PATCH] Move hitsModuleStart from RecHit to Raw2Cluster to avoid a
 cudaStreamSynchronize()

We need the number of hits/clusters in RecHit acquire(), but
fortunately we can do the necessary calculation already in Raw2Cluster.
---
 .../SiPixelClusterizer/plugins/BuildFile.xml  |  1 +
 .../plugins/SiPixelRawToClusterGPUKernel.cu   | 28 +++++++++++
 .../plugins/SiPixelRawToClusterGPUKernel.h    |  9 +++-
 .../siPixelRawToClusterHeterogeneousProduct.h |  4 ++
 .../SiPixelRecHits/plugins/BuildFile.xml      |  1 -
 .../SiPixelRecHits/plugins/PixelRecHits.cu    | 50 ++++++-------------
 .../SiPixelRecHits/plugins/PixelRecHits.h     |  5 +-
 .../plugins/SiPixelRecHitHeterogeneous.cc     |  2 +-
 .../siPixelRecHitsHeterogeneousProduct.h      |  2 +-
 9 files changed, 60 insertions(+), 42 deletions(-)
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
index 1dc69b4dd7b73..9db4a46f367b3 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
@@ -12,6 +12,7 @@
 <use   name="HeterogeneousCore/CUDACore"/>
 <use   name="cuda"/>
 <use   name="cuda-api-wrappers"/>
+<use   name="cub"/>
 <library   file="*.cc *.cu" name="RecoLocalTrackerSiPixelClusterizerPlugins">
   <flags   EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 13daf90da28cc..839cb6c6202a3 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -27,6 +27,9 @@
 #include <thrust/unique.h>
 #include <thrust/execution_policy.h>
 
+// cub includes
+#include <cub/cub.cuh>
+
 // CMSSW includes
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
@@ -96,6 +99,12 @@ namespace pixelgpudetails {
     assert(xx_d==gpuProduct.xx_d);
 
     cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
+
+    // originally from rechits
+    cudaCheck(cudaMalloc((void**) & clusModuleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
+    uint32_t *tmp = nullptr;
+    cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules));
+    cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize));
   }
 
   SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
@@ -118,6 +127,10 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(clusInModule_d));
     cudaCheck(cudaFree(moduleId_d));
     cudaCheck(cudaFree(gpuProduct_d));
+
+    // originally from rechits
+    cudaCheck(cudaFree(tempScanStorage_d));
+    cudaCheck(cudaFree(clusModuleStart_d));
   }
 
   void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
@@ -685,6 +698,21 @@ namespace pixelgpudetails {
           wordCounter);
       cudaCheck(cudaGetLastError());
 
+      // count the module start indices already here (instead of
+      // rechits) so that the number of clusters/hits can be made
+      // available in the rechit producer without additional points of
+      // synchronization/ExternalWork
+      //
+      // Set first the first element to 0
+      cudaCheck(cudaMemsetAsync(clusModuleStart_d, 0, sizeof(uint32_t), stream.id()));
+      // Then use inclusive_scan to get the partial sum to the rest
+      cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize,
+                                              clusInModule_d, &clusModuleStart_d[1], gpuClustering::MaxNumModules,
+                                              stream.id()));
+      // last element holds the number of all clusters
+      cudaCheck(cudaMemcpyAsync(&nClusters, clusModuleStart_d+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+
+
       // clusters
       cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
     } // end clusterizer scope
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index 00e37d82959db..2f8a51901eaab 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -182,7 +182,8 @@ namespace pixelgpudetails {
         pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
         gpuProduct_d,
         xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
-        nDigis, nModulesActive
+        clusModuleStart_d,
+        nDigis, nModulesActive, nClusters
       };
     }
 
@@ -205,6 +206,7 @@ namespace pixelgpudetails {
 
     uint32_t nDigis = 0;
     uint32_t nModulesActive = 0;
+    uint32_t nClusters = 0;
 
     // scratch memory buffers
     uint32_t * word_d;
@@ -224,6 +226,11 @@ namespace pixelgpudetails {
     int32_t *  clus_d;
     uint32_t * clusInModule_d;
     uint32_t * moduleId_d;
+
+    // originally in rechit, moved here
+    uint32_t *clusModuleStart_d = nullptr;
+    void *tempScanStorage_d = nullptr;
+    size_t tempScanStorageSize = 0;
   };
 
   // configuration and memory buffers alocated on the GPU
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
index ac4cba5e356ad..a37f780b52e5e 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
@@ -57,8 +57,12 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     uint32_t const * clusInModule_d;
     uint32_t const * moduleId_d;
 
+    // originally from rechits
+    uint32_t const * clusModuleStart_d;
+
     uint32_t nDigis;
     uint32_t nModules;
+    uint32_t nClusters;
   };
 
   using HeterogeneousDigiCluster = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
index 069f4ca05ab8d..a8af0c8a7c4f9 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
@@ -8,6 +8,5 @@
 <library file="*.cc *.cu" name="RecoLocalTrackerSiPixelRecHitsPlugins">
   <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
-  <use name="cub"/>
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 0cda5733d125f..bb1a7cc2707bd 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -5,9 +5,6 @@
 // CUDA runtime
 #include <cuda_runtime.h>
 
-// headers
-#include <cub/cub.cuh>
-
 // CMSSW headers
 #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
@@ -33,7 +30,6 @@ namespace pixelgpudetails {
   PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {
 
     cudaCheck(cudaMalloc((void**) & gpu_.bs_d,3*sizeof(float)));
-    cudaCheck(cudaMalloc((void**) & gpu_.hitsModuleStart_d,(gpuClustering::MaxNumModules+1)*sizeof(uint32_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.hitsLayerStart_d,(11)*sizeof(uint32_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.charge_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.detInd_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
@@ -54,10 +50,6 @@ namespace pixelgpudetails {
     gpu_.me_d = gpu_d;
     cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));
 
-    uint32_t *tmp = nullptr;
-    cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize_, tmp, tmp, gpuClustering::MaxNumModules));
-    cudaCheck(cudaMalloc(&tempScanStorage_, tempScanStorageSize_));
-
     // Feels a bit dumb but constexpr arrays are not supported for device code
     // TODO: should be moved to EventSetup (or better ideas?)
     // Would it be better to use "constant memory"?
@@ -66,7 +58,6 @@ namespace pixelgpudetails {
   }
 
   PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
-    cudaCheck(cudaFree(gpu_.hitsModuleStart_d));
     cudaCheck(cudaFree(gpu_.charge_d));
     cudaCheck(cudaFree(gpu_.detInd_d));
     cudaCheck(cudaFree(gpu_.xg_d));
@@ -84,8 +75,6 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(gpu_.hist_d));
     cudaCheck(cudaFree(gpu_d));
 
-    cudaCheck(cudaFree(tempScanStorage_));
-
     cudaCheck(cudaFree(d_phase1TopologyLayerStart_));
   }
 
@@ -93,15 +82,9 @@ namespace pixelgpudetails {
                                            float const * bs,
                                            pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                                            cuda::stream_t<>& stream) {
-
    cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));
-
-    // Set first the first element to 0
-    cudaCheck(cudaMemsetAsync(gpu_.hitsModuleStart_d, 0, sizeof(uint32_t), stream.id()));
-    // Then use inclusive_scan to get the partial sum to the rest
-    cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_, tempScanStorageSize_,
-                                            input.clusInModule_d, &gpu_.hitsModuleStart_d[1], gpuClustering::MaxNumModules,
-                                            stream.id()));
+   gpu_.hitsModuleStart_d = input.clusModuleStart_d;
+   cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
 
     int threadsPerBlock = 256;
     int blocks = input.nModules; // active modules (with digis)
@@ -130,6 +113,15 @@ namespace pixelgpudetails {
     // needed only if hits on CPU are required...
     cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(hitsLayerStart_, gpu_.hitsLayerStart_d, 11*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+    auto nhits = input.nClusters;
+    cpu_ = std::make_unique<HitsOnCPU>(nhits);
+    cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(cpu_->xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(cpu_->ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(cpu_->mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(cpu_->mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
 
 #ifdef GPU_DEBUG
     cudaStreamSynchronize(stream.id());
@@ -147,21 +139,9 @@ namespace pixelgpudetails {
     cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id());
   }
 
-  HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {
-    // needed only if hits on CPU are required...
-    auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];
-
-    HitsOnCPU hoc(nhits);
-    hoc.gpu_d = gpu_d;
-    memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
-    cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaStreamSynchronize(stream.id()));
-    return hoc;
+  std::unique_ptr<HitsOnCPU>&& PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) {
+    cpu_->gpu_d = gpu_d;
+    memcpy(cpu_->hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
+    return std::move(cpu_);
   }
 }
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
index 7198e2875b298..21549936a4fc6 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -36,13 +36,12 @@ namespace pixelgpudetails {
                        pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                        cuda::stream_t<>& stream);
 
-    HitsOnCPU getOutput(cuda::stream_t<>& stream) const;
+    std::unique_ptr<HitsOnCPU>&& getOutput(cuda::stream_t<>& stream);
 
   private:
     HitsOnGPU * gpu_d;  // copy of the structure on the gpu itself: this is the "Product" 
     HitsOnGPU gpu_;
-    void *tempScanStorage_ = nullptr;
-    size_t tempScanStorageSize_ = 0;
+    std::unique_ptr<HitsOnCPU> cpu_;
     uint32_t *d_phase1TopologyLayerStart_ = nullptr;
     uint32_t hitsModuleStart_[gpuClustering::MaxNumModules+1];
     uint32_t hitsLayerStart_[11];
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
index bfd839295bc8f..05b6846b9aacb 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
@@ -179,7 +179,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i
 }
 
 void SiPixelRecHitHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) {
-  auto output = std::make_unique<GPUProduct>(gpuAlgo_->getOutput(cudaStream));
+  auto output = gpuAlgo_->getOutput(cudaStream);
 
  // Need the CPU clusters to
   // - properly fill the output DetSetVector of hits
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index 87a3cab0e5e98..498dc3d04b925 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -16,7 +16,7 @@ namespace siPixelRecHitsHeterogeneousProduct {
 
   struct HitsOnGPU{
      float * bs_d;
-     uint32_t * hitsModuleStart_d;
+     const uint32_t * hitsModuleStart_d; // forwarded from clusters
      uint32_t * hitsLayerStart_d;
      int32_t  * charge_d;
      uint16_t * detInd_d;