Skip to content

Commit

Permalink
Move hitsModuleStart from RecHit to Raw2Cluster to avoid a cudaStream…
Browse files Browse the repository at this point in the history
…Synchronize()

We need the number of hits/clusters in RecHit acquire(), but
fortunately we can do the necessary calculation already in Raw2Cluster.
  • Loading branch information
makortel committed Jul 31, 2018
1 parent fd9ee77 commit 184d50c
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 46 deletions.
1 change: 1 addition & 0 deletions RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
<use name="HeterogeneousCore/CUDACore"/>
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
<use name="cub"/>
<library file="*.cc *.cu" name="RecoLocalTrackerSiPixelClusterizerPlugins">
<flags EDM_PLUGIN="1"/>
</library>
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
#include <thrust/unique.h>
#include <thrust/execution_policy.h>

// cub includes
#include <cub/cub.cuh>

// CMSSW includes
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
Expand Down Expand Up @@ -90,6 +93,12 @@ namespace pixelgpudetails {
cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) ));

// originally from rechits
cudaCheck(cudaMalloc((void**) & clusModuleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
uint32_t *tmp = nullptr;
cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules));
cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize));
}

SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
Expand All @@ -111,6 +120,10 @@ namespace pixelgpudetails {
cudaCheck(cudaFree(clus_d));
cudaCheck(cudaFree(clusInModule_d));
cudaCheck(cudaFree(moduleId_d));

// originally from rechits
cudaCheck(cudaFree(tempScanStorage_d));
cudaCheck(cudaFree(clusModuleStart_d));
}

void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
Expand Down Expand Up @@ -680,6 +693,21 @@ namespace pixelgpudetails {
wordCounter);
cudaCheck(cudaGetLastError());

// count the module start indices already here (instead of
// rechits) so that the number of clusters/hits can be made
// available in the rechit producer without additional points of
// synchronization/ExternalWork
//
// First, set the first element to 0
cudaCheck(cudaMemsetAsync(clusModuleStart_d, 0, sizeof(uint32_t), stream.id()));
// Then use inclusive_scan to get the partial sum to the rest
cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize,
clusInModule_d, &clusModuleStart_d[1], gpuClustering::MaxNumModules,
stream.id()));
// last element holds the number of all clusters
cudaCheck(cudaMemcpyAsync(&nClusters, clusModuleStart_d+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id()));


// clusters
cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
} // end clusterizer scope
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,9 @@ namespace pixelgpudetails {
error_h->set_data(data_h);
return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
nDigis, nModulesActive,
xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d
nDigis, nModulesActive, nClusters,
xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
clusModuleStart_d
};
}

Expand All @@ -189,6 +190,7 @@ namespace pixelgpudetails {

uint32_t nDigis = 0;
uint32_t nModulesActive = 0;
uint32_t nClusters = 0;

// scratch memory buffers
uint32_t * word_d;
Expand All @@ -208,6 +210,11 @@ namespace pixelgpudetails {
int32_t * clus_d;
uint32_t * clusInModule_d;
uint32_t * moduleId_d;

// originally in rechit, moved here
uint32_t *clusModuleStart_d = nullptr;
void *tempScanStorage_d = nullptr;
size_t tempScanStorageSize = 0;
};

// configuration and memory buffers allocated on the GPU
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ namespace siPixelRawToClusterHeterogeneousProduct {
// Needed for GPU rechits
uint32_t nDigis;
uint32_t nModules;
uint32_t nClusters;
uint16_t const * xx_d;
uint16_t const * yy_d;
uint16_t const * adc_d;
Expand All @@ -55,6 +56,9 @@ namespace siPixelRawToClusterHeterogeneousProduct {
int32_t const * clus_d;
uint32_t const * clusInModule_d;
uint32_t const * moduleId_d;

// originally from rechits
uint32_t const * clusModuleStart_d;
};

using HeterogeneousDigiCluster = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
Expand Down
1 change: 0 additions & 1 deletion RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,5 @@
<library file="*.cc *.cu" name="RecoLocalTrackerSiPixelRecHitsPlugins">
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
<use name="cub"/>
<flags EDM_PLUGIN="1"/>
</library>
54 changes: 16 additions & 38 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@
// CUDA runtime
#include <cuda_runtime.h>

// headers
#include <cub/cub.cuh>

// CMSSW headers
#include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
Expand All @@ -33,7 +30,6 @@ namespace pixelgpudetails {
PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {

cudaCheck(cudaMalloc((void**) & gpu_.bs_d,3*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.hitsModuleStart_d,(gpuClustering::MaxNumModules+1)*sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**) & gpu_.hitsLayerStart_d,(11)*sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**) & gpu_.charge_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.detInd_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
Expand All @@ -54,10 +50,6 @@ namespace pixelgpudetails {
cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));

uint32_t *tmp = nullptr;
cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize_, tmp, tmp, gpuClustering::MaxNumModules));
cudaCheck(cudaMalloc(&tempScanStorage_, tempScanStorageSize_));

// Feels a bit dumb but constexpr arrays are not supported for device code
// TODO: should be moved to EventSetup (or better ideas?)
// Would it be better to use "constant memory"?
Expand All @@ -66,7 +58,6 @@ namespace pixelgpudetails {
}

PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
cudaCheck(cudaFree(gpu_.hitsModuleStart_d));
cudaCheck(cudaFree(gpu_.charge_d));
cudaCheck(cudaFree(gpu_.detInd_d));
cudaCheck(cudaFree(gpu_.xg_d));
Expand All @@ -85,25 +76,15 @@ namespace pixelgpudetails {

cudaCheck(cudaFree(gpu_d));

cudaCheck(cudaFree(tempScanStorage_));

cudaCheck(cudaFree(d_phase1TopologyLayerStart_));
}

void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
float const * bs,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream) {

cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));

// First, set the first element to 0
cudaCheck(cudaMemsetAsync(gpu_.hitsModuleStart_d, 0, sizeof(uint32_t), stream.id()));
// Then use inclusive_scan to get the partial sum to the rest
cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_, tempScanStorageSize_,
input.clusInModule_d, &gpu_.hitsModuleStart_d[1], gpuClustering::MaxNumModules,
stream.id()));

int threadsPerBlock = 256;
int blocks = input.nModules; // active modules (with digis)
gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
Expand All @@ -115,7 +96,7 @@ namespace pixelgpudetails {
input.clusInModule_d, input.moduleId_d,
input.clus_d,
input.nDigis,
gpu_.hitsModuleStart_d,
input.clusModuleStart_d,
gpu_.charge_d,
gpu_.detInd_d,
gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d,
Expand All @@ -126,11 +107,20 @@ namespace pixelgpudetails {
);

// assuming full warp of threads is better than a smaller number...
setHitsLayerStart<<<1, 32, 0, stream.id()>>>(gpu_.hitsModuleStart_d, d_phase1TopologyLayerStart_, gpu_.hitsLayerStart_d);
setHitsLayerStart<<<1, 32, 0, stream.id()>>>(input.clusModuleStart_d, d_phase1TopologyLayerStart_, gpu_.hitsLayerStart_d);

// needed only if hits on CPU are required...
cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hitsModuleStart_, input.clusModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hitsLayerStart_, gpu_.hitsLayerStart_d, 11*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
auto nhits = input.nClusters;
cpu_ = std::make_unique<HitsOnCPU>(nhits);
cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));

#ifdef GPU_DEBUG
cudaStreamSynchronize(stream.id());
Expand All @@ -150,21 +140,9 @@ namespace pixelgpudetails {

}

HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {
// needed only if hits on CPU are required...
auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];

HitsOnCPU hoc(nhits);
hoc.gpu_d = gpu_d;
memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaStreamSynchronize(stream.id()));
return hoc;
// Finalize and hand off the CPU-side hit product built during makeHitsAsync().
// NOTE(review): the return type is an rvalue reference to the member cpu_, and
// the body moves out of it — after the first call cpu_ is null and a second
// call would dereference a null unique_ptr. The caller must take ownership
// immediately (returning by value would be safer — confirm call sites).
// NOTE(review): the 'stream' parameter is unused here; the async copies into
// *cpu_ were issued in makeHitsAsync(), so this assumes the stream has been
// synchronized before the data is read — TODO confirm with the producer.
std::unique_ptr<HitsOnCPU>&& PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) {
// Expose the device-resident HitsOnGPU struct pointer to downstream consumers.
cpu_->gpu_d = gpu_d;
// Copy the cached per-module hit start offsets (MaxNumModules+1 entries,
// filled by the earlier cudaMemcpyAsync into hitsModuleStart_) into the product.
memcpy(cpu_->hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
// Transfer ownership; leaves the member cpu_ empty (see NOTE above).
return std::move(cpu_);
}
}
5 changes: 2 additions & 3 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,12 @@ namespace pixelgpudetails {
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream);

HitsOnCPU getOutput(cuda::stream_t<>& stream) const;
std::unique_ptr<HitsOnCPU>&& getOutput(cuda::stream_t<>& stream);

private:
HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product"
HitsOnGPU gpu_;
void *tempScanStorage_ = nullptr;
size_t tempScanStorageSize_ = 0;
std::unique_ptr<HitsOnCPU> cpu_;
uint32_t *d_phase1TopologyLayerStart_ = nullptr;
uint32_t hitsModuleStart_[gpuClustering::MaxNumModules+1];
uint32_t hitsLayerStart_[11];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i
}

void SiPixelRecHitHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) {
auto output = std::make_unique<GPUProduct>(gpuAlgo_->getOutput(cudaStream));
auto output = gpuAlgo_->getOutput(cudaStream);

// Need the CPU clusters to
// - properly fill the output DetSetVector of hits
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace siPixelRecHitsHeterogeneousProduct {

struct HitsOnGPU{
float * bs_d;
uint32_t * hitsModuleStart_d;
//uint32_t * hitsModuleStart_d; // moved to part of clusters
uint32_t * hitsLayerStart_d;
int32_t * charge_d;
uint16_t * detInd_d;
Expand Down

0 comments on commit 184d50c

Please sign in to comment.