Move BeamSpot transfer to GPU to its own producer (#318)
Implement a non-caching host allocator, useful for host-to-device copy buffers:
  - not bound to any CUDA stream to allow use in EDM beginStream();
  - with the possibility to pass flags to cudaHostAlloc(), e.g. cudaHostAllocWriteCombined.
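
As an illustration, a minimal sketch of such an allocator, assuming only the plain CUDA runtime API (cudaHostAlloc/cudaFreeHost); the sketch namespace and helper names below are placeholders, not the actual cudautils implementation:

// Minimal sketch (not the actual cudautils code): pinned host memory allocated
// directly with cudaHostAlloc(), freed with cudaFreeHost(), with no caching
// layer and no CUDA stream involved, so it is usable in EDM beginStream().
#include <cstddef>
#include <memory>
#include <stdexcept>
#include <type_traits>

#include <cuda_runtime.h>

namespace sketch {
  // Deleter that returns the pinned allocation to the CUDA runtime.
  struct HostFreeDeleter {
    void operator()(void* ptr) const { cudaFreeHost(ptr); }
  };

  template <typename T>
  using host_noncached_unique_ptr = std::unique_ptr<T, HostFreeDeleter>;

  // Array form, mirroring the call made in the WordFedAppender constructor below:
  //   make_host_noncached_unique<unsigned int[]>(n, cudaHostAllocWriteCombined)
  template <typename T>
  typename std::enable_if<std::is_array<T>::value, host_noncached_unique_ptr<T>>::type
  make_host_noncached_unique(std::size_t n, unsigned int flags = cudaHostAllocDefault) {
    using element_type = typename std::remove_extent<T>::type;
    void* mem = nullptr;
    // Plain cudaHostAlloc(): pinned, honours the requested flags, and is neither
    // tied to a CUDA stream nor recycled through a caching allocator.
    if (cudaHostAlloc(&mem, n * sizeof(element_type), flags) != cudaSuccess) {
      throw std::runtime_error("cudaHostAlloc failed");
    }
    return host_noncached_unique_ptr<T>(static_cast<element_type*>(mem));
  }
}  // namespace sketch

The WordFedAppender constructor in the diff below uses the real counterpart in the same way: cudautils::make_host_noncached_unique<unsigned int[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined).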

Add perfect forwarding overload for CUDAProduct constructor, enabling the use of CUDAScopedContext::emplace() in BeamSpotToCUDA::produce().
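
The forwarding overload can be pictured as follows; only the forwarding mechanics are shown, and the member list of the real CUDAProduct (device, stream, event bookkeeping) is assumed rather than taken from this diff:

// Illustration only: a wrapper whose constructor perfectly forwards its trailing
// arguments into the wrapped data product, so the product can be constructed in
// place instead of moving a fully built temporary into it.
#include <utility>

template <typename T>
class CUDAProductSketch {
public:
  template <typename... Args>
  explicit CUDAProductSketch(int device, Args&&... args)
      : device_(device), data_(std::forward<Args>(args)...) {}

  int device() const { return device_; }
  T const& data() const { return data_; }

private:
  int device_;  // the real CUDAProduct also carries stream/event bookkeeping
  T data_;
};

With such a constructor, CUDAScopedContext::emplace(iEvent, token, args...) can hand args... straight to the wrapped product's constructor, which is what BeamSpotToCUDA::produce() relies on.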

Move the BeamSpot host-to-device transfer to its own EDProducer, making use of beginStream()-allocated write-combined memory for the transfer.
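
The new producer's source is not among the hunks shown below; a rough sketch of its shape, assuming a plain BeamSpotCUDA::Data {x, y, z} payload (consistent with the gpuPixelRecHits.h hunk) and hedging on the exact token and CUDAScopedContext signatures:

// Rough sketch only: the include path for CUDAProduct and the exact framework
// signatures are assumptions, not verbatim CMSSW code.
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "DataFormats/BeamSpot/interface/BeamSpot.h"
#include "FWCore/Framework/interface/Event.h"
#include "FWCore/Framework/interface/stream/EDProducer.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"

class BeamSpotToCUDA : public edm::stream::EDProducer<> {
public:
  explicit BeamSpotToCUDA(const edm::ParameterSet& iConfig)
      : bsGetToken_{consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("src"))},
        bsPutToken_{produces<CUDAProduct<BeamSpotCUDA>>()} {}

  void beginStream(edm::StreamID) override {
    // Allocated once per EDM stream, outside any CUDA stream, in write-combined
    // pinned memory: the CPU only writes this buffer before the copy.
    bsHost_ = cudautils::make_host_noncached_unique<BeamSpotCUDA::Data>(cudaHostAllocWriteCombined);
  }

  void produce(edm::Event& iEvent, const edm::EventSetup&) override {
    const reco::BeamSpot& bs = iEvent.get(bsGetToken_);
    bsHost_->x = bs.x0();
    bsHost_->y = bs.y0();
    bsHost_->z = bs.z0();

    CUDAScopedContext ctx{iEvent.streamID()};
    // emplace() perfectly forwards (bsHost_.get(), ctx.stream()) to the
    // BeamSpotCUDA constructor, which is assumed to issue the asynchronous
    // host-to-device copy on the context's CUDA stream.
    ctx.emplace(iEvent, bsPutToken_, bsHost_.get(), ctx.stream());
  }

private:
  const edm::EDGetTokenT<reco::BeamSpot> bsGetToken_;
  const edm::EDPutTokenT<CUDAProduct<BeamSpotCUDA>> bsPutToken_;
  cudautils::host::noncached::unique_ptr<BeamSpotCUDA::Data> bsHost_;
};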
makortel authored and fwyzard committed Oct 20, 2020
1 parent 8245b38 commit a9a6ce3
Showing 7 changed files with 27 additions and 20 deletions.
@@ -24,7 +24,9 @@
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h"
#include "RecoTracker/Record/interface/CkfComponentsRecord.h"

@@ -62,6 +64,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer<edm::ExternalWork>
std::unique_ptr<PixelUnpackingRegions> regions_;

pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
+std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
PixelDataFormatter::Errors errors_;

const bool includeErrors_;
@@ -88,6 +91,11 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
}

if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)";
+
+edm::Service<CUDAService> cs;
+if(cs->enabled()) {
+wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>();
+}
}

void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -161,7 +169,6 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event

// In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData()
ErrorChecker errorcheck;
-auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream());
for(int fedId: fedIds_) {
if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data
if (regions_ && !regions_->mayUnpackFED(fedId)) continue;
@@ -209,13 +216,13 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
const cms_uint32_t * ew = (const cms_uint32_t *)(trailer);

assert(0 == (ew-bw)%2);
-wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
+wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
wordCounterGPU+=(ew-bw);

} // end of for loop

gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains,
-wordFedAppender,
+*wordFedAppender_,
std::move(errors_),
wordCounterGPU, fedCounter,
useQuality_, includeErrors_,
@@ -45,10 +45,9 @@ namespace pixelgpudetails {
// number of words for all the FEDs
constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;

-SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) {
-edm::Service<CUDAService> cs;
-word_ = cs->make_host_unique<unsigned int[]>(MAX_FED_WORDS, cudaStream);
-fedId_ = cs->make_host_unique<unsigned char[]>(MAX_FED_WORDS, cudaStream);
+SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() {
+word_ = cudautils::make_host_noncached_unique<unsigned int[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
+fedId_ = cudautils::make_host_noncached_unique<unsigned char[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
}

void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
@@ -11,6 +11,7 @@
#include "FWCore/Utilities/interface/typedefs.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
#include "DataFormats/SiPixelDigi/interface/PixelErrors.h"

struct SiPixelFedCablingMapGPU;
@@ -159,7 +160,7 @@
public:
class WordFedAppender {
public:
-WordFedAppender(cuda::stream_t<>& cudaStream);
+WordFedAppender();
~WordFedAppender() = default;

void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
@@ -168,8 +169,8 @@
const unsigned char *fedId() const { return fedId_.get(); }

private:
-cudautils::host::unique_ptr<unsigned int[]> word_;
-cudautils::host::unique_ptr<unsigned char[]> fedId_;
+cudautils::host::noncached::unique_ptr<unsigned int[]> word_;
+cudautils::host::noncached::unique_ptr<unsigned char[]> fedId_;
};

SiPixelRawToClusterGPUKernel() = default;
1 change: 1 addition & 0 deletions RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
@@ -1,3 +1,4 @@
<use name="CUDADataFormats/BeamSpot"/>
<use name="DataFormats/TrackerCommon"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="HeterogeneousCore/Producer"/>
7 changes: 2 additions & 5 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -36,7 +36,6 @@ namespace pixelgpudetails {

constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits();

-cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));

// Coalesce all 32bit and 16bit arrays to two big blobs
@@ -111,7 +110,6 @@
#endif
}
PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
-cudaCheck(cudaFree(gpu_.bs_d));
cudaCheck(cudaFree(gpu_.hitsLayerStart_d));
cudaCheck(cudaFree(gpu_.owner_32bit_));
cudaCheck(cudaFree(gpu_.owner_16bit_));
@@ -131,11 +129,10 @@

void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d,
SiPixelClustersCUDA const& clusters_d,
-float const * bs,
+BeamSpotCUDA const& bs_d,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
bool transferToCPU,
cuda::stream_t<>& stream) {
-cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
gpu_.hitsModuleStart_d = clusters_d.clusModuleStart();
gpu_.cpeParams = cpeParams; // copy it for use in clients
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
@@ -148,7 +145,7 @@
#endif
gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
cpeParams,
-gpu_.bs_d,
+bs_d.data(),
digis_d.moduleInd(),
digis_d.xx(), digis_d.yy(), digis_d.adc(),
clusters_d.moduleStart(),
3 changes: 2 additions & 1 deletion RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -1,6 +1,7 @@
#ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
#define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h

#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h"
Expand Down Expand Up @@ -34,7 +35,7 @@ namespace pixelgpudetails {

void makeHitsAsync(SiPixelDigisCUDA const& digis_d,
SiPixelClustersCUDA const& clusters_d,
-float const * bs,
+BeamSpotCUDA const& bs_d,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
bool transferToCPU,
cuda::stream_t<>& stream);
9 changes: 5 additions & 4 deletions RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -5,6 +5,7 @@
#include <cstdio>
#include <limits>

#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
@@ -15,7 +16,7 @@ namespace gpuPixelRecHits {


__global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams,
-float const * __restrict__ bs,
+BeamSpotCUDA::Data const * __restrict__ bs,
uint16_t const * __restrict__ id,
uint16_t const * __restrict__ x,
uint16_t const * __restrict__ y,
@@ -143,9 +144,9 @@ namespace gpuPixelRecHits {
// to global and compute phi...
cpeParams->detParams(me).frame.toGlobal(xl[h],yl[h], xg[h],yg[h],zg[h]);
// here correct for the beamspot...
-xg[h]-=bs[0];
-yg[h]-=bs[1];
-zg[h]-=bs[2];
+xg[h]-=bs->x;
+yg[h]-=bs->y;
+zg[h]-=bs->z;

rg[h] = std::sqrt(xg[h]*xg[h]+yg[h]*yg[h]);
iph[h] = unsafe_atan2s<7>(yg[h],xg[h]);