Skip to content

Commit

Permalink
Move hitsModuleStart from RecHit to Raw2Cluster to avoid a cudaStreamSynchronize()
Browse files Browse the repository at this point in the history

We need the number of hits/clusters in RecHit acquire(), but
fortunately we can do the necessary calculation already in Raw2Cluster.
  • Loading branch information
makortel committed Jul 31, 2018
1 parent 6713272 commit 98ee4f4
Show file tree
Hide file tree
Showing 9 changed files with 60 additions and 42 deletions.
1 change: 1 addition & 0 deletions RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
<use name="HeterogeneousCore/CUDACore"/>
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
<use name="cub"/>
<library file="*.cc *.cu" name="RecoLocalTrackerSiPixelClusterizerPlugins">
<flags EDM_PLUGIN="1"/>
</library>
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
#include <thrust/unique.h>
#include <thrust/execution_policy.h>

// cub includes
#include <cub/cub.cuh>

// CMSSW includes
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
Expand Down Expand Up @@ -96,6 +99,12 @@ namespace pixelgpudetails {
assert(xx_d==gpuProduct.xx_d);

cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));

// originally from rechits
cudaCheck(cudaMalloc((void**) & clusModuleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
uint32_t *tmp = nullptr;
cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules));
cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize));
}

SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
Expand All @@ -118,6 +127,10 @@ namespace pixelgpudetails {
cudaCheck(cudaFree(clusInModule_d));
cudaCheck(cudaFree(moduleId_d));
cudaCheck(cudaFree(gpuProduct_d));

// originally from rechits
cudaCheck(cudaFree(tempScanStorage_d));
cudaCheck(cudaFree(clusModuleStart_d));
}

void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
Expand Down Expand Up @@ -685,6 +698,21 @@ namespace pixelgpudetails {
wordCounter);
cudaCheck(cudaGetLastError());

// count the module start indices already here (instead of
// rechits) so that the number of clusters/hits can be made
// available in the rechit producer without additional points of
// synchronization/ExternalWork
//
// Set first the first element to 0
cudaCheck(cudaMemsetAsync(clusModuleStart_d, 0, sizeof(uint32_t), stream.id()));
// Then use inclusive_scan to get the partial sum to the rest
cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize,
clusInModule_d, &clusModuleStart_d[1], gpuClustering::MaxNumModules,
stream.id()));
// last element holds the number of all clusters
cudaCheck(cudaMemcpyAsync(&nClusters, clusModuleStart_d+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id()));


// clusters
cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
} // end clusterizer scope
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@ namespace pixelgpudetails {
pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
gpuProduct_d,
xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
nDigis, nModulesActive
clusModuleStart_d,
nDigis, nModulesActive, nClusters
};
}

Expand All @@ -205,6 +206,7 @@ namespace pixelgpudetails {

uint32_t nDigis = 0;
uint32_t nModulesActive = 0;
uint32_t nClusters = 0;

// scratch memory buffers
uint32_t * word_d;
Expand All @@ -224,6 +226,11 @@ namespace pixelgpudetails {
int32_t * clus_d;
uint32_t * clusInModule_d;
uint32_t * moduleId_d;

// originally in rechit, moved here
uint32_t *clusModuleStart_d = nullptr;
void *tempScanStorage_d = nullptr;
size_t tempScanStorageSize = 0;
};

// configuration and memory buffers allocated on the GPU
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,12 @@ namespace siPixelRawToClusterHeterogeneousProduct {
uint32_t const * clusInModule_d;
uint32_t const * moduleId_d;

// originally from rechits
uint32_t const * clusModuleStart_d;

uint32_t nDigis;
uint32_t nModules;
uint32_t nClusters;
};

using HeterogeneousDigiCluster = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
Expand Down
1 change: 0 additions & 1 deletion RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,5 @@
<library file="*.cc *.cu" name="RecoLocalTrackerSiPixelRecHitsPlugins">
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
<use name="cub"/>
<flags EDM_PLUGIN="1"/>
</library>
50 changes: 15 additions & 35 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@
// CUDA runtime
#include <cuda_runtime.h>

// headers
#include <cub/cub.cuh>

// CMSSW headers
#include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
Expand All @@ -33,7 +30,6 @@ namespace pixelgpudetails {
PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {

cudaCheck(cudaMalloc((void**) & gpu_.bs_d,3*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.hitsModuleStart_d,(gpuClustering::MaxNumModules+1)*sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**) & gpu_.hitsLayerStart_d,(11)*sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**) & gpu_.charge_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.detInd_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
Expand All @@ -54,10 +50,6 @@ namespace pixelgpudetails {
gpu_.me_d = gpu_d;
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));

uint32_t *tmp = nullptr;
cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize_, tmp, tmp, gpuClustering::MaxNumModules));
cudaCheck(cudaMalloc(&tempScanStorage_, tempScanStorageSize_));

// Feels a bit dumb but constexpr arrays are not supported for device code
// TODO: should be moved to EventSetup (or better ideas?)
// Would it be better to use "constant memory"?
Expand All @@ -66,7 +58,6 @@ namespace pixelgpudetails {
}

PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
cudaCheck(cudaFree(gpu_.hitsModuleStart_d));
cudaCheck(cudaFree(gpu_.charge_d));
cudaCheck(cudaFree(gpu_.detInd_d));
cudaCheck(cudaFree(gpu_.xg_d));
Expand All @@ -84,24 +75,16 @@ namespace pixelgpudetails {
cudaCheck(cudaFree(gpu_.hist_d));
cudaCheck(cudaFree(gpu_d));

cudaCheck(cudaFree(tempScanStorage_));

cudaCheck(cudaFree(d_phase1TopologyLayerStart_));
}

void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
float const * bs,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream) {

cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));

// Set first the first element to 0
cudaCheck(cudaMemsetAsync(gpu_.hitsModuleStart_d, 0, sizeof(uint32_t), stream.id()));
// Then use inclusive_scan to get the partial sum to the rest
cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_, tempScanStorageSize_,
input.clusInModule_d, &gpu_.hitsModuleStart_d[1], gpuClustering::MaxNumModules,
stream.id()));
gpu_.hitsModuleStart_d = input.clusModuleStart_d;
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));

int threadsPerBlock = 256;
int blocks = input.nModules; // active modules (with digis)
Expand Down Expand Up @@ -130,6 +113,15 @@ namespace pixelgpudetails {
// needed only if hits on CPU are required...
cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hitsLayerStart_, gpu_.hitsLayerStart_d, 11*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
auto nhits = input.nClusters;
cpu_ = std::make_unique<HitsOnCPU>(nhits);
cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(cpu_->mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));

#ifdef GPU_DEBUG
cudaStreamSynchronize(stream.id());
Expand All @@ -147,21 +139,9 @@ namespace pixelgpudetails {
cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id());
}

HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {
// needed only if hits on CPU are required...
auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];

HitsOnCPU hoc(nhits);
hoc.gpu_d = gpu_d;
memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaStreamSynchronize(stream.id()));
return hoc;
// Hand the per-event CPU-side hit product (filled asynchronously in
// makeHitsAsync) over to the caller. The GPU-side struct pointer and the
// host copy of the module-start array are patched in just before release.
//
// NOTE(review): returning std::unique_ptr&& (instead of by value) and
// std::move()-ing the member leaves this->cpu_ empty; a second call before
// the next makeHitsAsync() would dereference a null cpu_ at L245. Returning
// std::unique_ptr<HitsOnCPU> by value would be safer and equally efficient —
// confirm against the declaration in PixelRecHits.h before changing.
//
// NOTE(review): unlike the previous implementation, there is no
// cudaStreamSynchronize() here — presumably the caller guarantees the
// async copies into *cpu_ issued in makeHitsAsync() have completed on
// `stream` before consuming the product; verify at the call site.
std::unique_ptr<HitsOnCPU>&& PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) {
  // Expose the device-resident HitsOnGPU struct alongside the host data.
  cpu_->gpu_d = gpu_d;
  // Copy the cached host array of per-module hit offsets (MaxNumModules+1
  // entries; last element is the total hit count) into the product.
  memcpy(cpu_->hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
  // Transfer ownership to the caller; cpu_ is left null until the next event.
  return std::move(cpu_);
}
}
5 changes: 2 additions & 3 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,12 @@ namespace pixelgpudetails {
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream);

HitsOnCPU getOutput(cuda::stream_t<>& stream) const;
std::unique_ptr<HitsOnCPU>&& getOutput(cuda::stream_t<>& stream);

private:
HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product"
HitsOnGPU gpu_;
void *tempScanStorage_ = nullptr;
size_t tempScanStorageSize_ = 0;
std::unique_ptr<HitsOnCPU> cpu_;
uint32_t *d_phase1TopologyLayerStart_ = nullptr;
uint32_t hitsModuleStart_[gpuClustering::MaxNumModules+1];
uint32_t hitsLayerStart_[11];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i
}

void SiPixelRecHitHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) {
auto output = std::make_unique<GPUProduct>(gpuAlgo_->getOutput(cudaStream));
auto output = gpuAlgo_->getOutput(cudaStream);

// Need the CPU clusters to
// - properly fill the output DetSetVector of hits
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace siPixelRecHitsHeterogeneousProduct {

struct HitsOnGPU{
float * bs_d;
uint32_t * hitsModuleStart_d;
const uint32_t * hitsModuleStart_d; // forwarded from clusters
uint32_t * hitsLayerStart_d;
int32_t * charge_d;
uint16_t * detInd_d;
Expand Down

0 comments on commit 98ee4f4

Please sign in to comment.