diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml
new file mode 100644
index 0000000000000..21c527e7b2f0d
--- /dev/null
+++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
new file mode 100644
index 0000000000000..22d9ff9d103ba
--- /dev/null
+++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
@@ -0,0 +1,73 @@
+#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
+#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
+
+#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
+
+#include
+
+class SiPixelClustersCUDA {
+public:
+ SiPixelClustersCUDA() = default;
+ explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream);
+ ~SiPixelClustersCUDA() = default;
+
+ SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete;
+ SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete;
+ SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default;
+ SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default;
+
+ uint32_t *moduleStart() { return moduleStart_d.get(); }
+ int32_t *clus() { return clus_d.get(); }
+ uint32_t *clusInModule() { return clusInModule_d.get(); }
+ uint32_t *moduleId() { return moduleId_d.get(); }
+ uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }
+
+ uint32_t const *moduleStart() const { return moduleStart_d.get(); }
+ int32_t const *clus() const { return clus_d.get(); }
+ uint32_t const *clusInModule() const { return clusInModule_d.get(); }
+ uint32_t const *moduleId() const { return moduleId_d.get(); }
+ uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }
+
+ uint32_t const *c_moduleStart() const { return moduleStart_d.get(); }
+ int32_t const *c_clus() const { return clus_d.get(); }
+ uint32_t const *c_clusInModule() const { return clusInModule_d.get(); }
+ uint32_t const *c_moduleId() const { return moduleId_d.get(); }
+ uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); }
+
+ class DeviceConstView {
+ public:
+ DeviceConstView() = default;
+
+#ifdef __CUDACC__
+ __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); }
+ __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); }
+ __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); }
+ __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); }
+ __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); }
+#endif
+
+ friend SiPixelClustersCUDA;
+
+ private:
+ uint32_t const *moduleStart_ = nullptr;
+ int32_t const *clus_ = nullptr;
+ uint32_t const *clusInModule_ = nullptr;
+ uint32_t const *moduleId_ = nullptr;
+ uint32_t const *clusModuleStart_ = nullptr;
+ };
+
+ DeviceConstView *view() const { return view_d.get(); }
+
+private:
+ edm::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module
+ edm::cuda::device::unique_ptr clus_d; // cluster id of each pixel
+ edm::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module
+ edm::cuda::device::unique_ptr moduleId_d; // module id of each module
+
+ // originally from rechits
+ edm::cuda::device::unique_ptr clusModuleStart_d;
+
+ edm::cuda::device::unique_ptr view_d; // "me" pointer
+};
+
+#endif
diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
new file mode 100644
index 0000000000000..7363c2fd364af
--- /dev/null
+++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
@@ -0,0 +1,24 @@
+#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) {
+ edm::Service cs;
+
+ moduleStart_d = cs->make_device_unique(nelements+1, stream);
+ clus_d = cs->make_device_unique< int32_t[]>(feds, stream);
+ clusInModule_d = cs->make_device_unique(nelements, stream);
+ moduleId_d = cs->make_device_unique(nelements, stream);
+ clusModuleStart_d = cs->make_device_unique(nelements+1, stream);
+
+ auto view = cs->make_host_unique(stream);
+ view->moduleStart_ = moduleStart_d.get();
+ view->clus_ = clus_d.get();
+ view->clusInModule_ = clusInModule_d.get();
+ view->moduleId_ = moduleId_d.get();
+ view->clusModuleStart_ = clusModuleStart_d.get();
+
+ view_d = cs->make_device_unique(stream);
+ cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
+}
diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml
new file mode 100644
index 0000000000000..259aa9f08d054
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
new file mode 100644
index 0000000000000..25e8b54a743c2
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
@@ -0,0 +1,65 @@
+#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
+#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
+
+#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
+#include "FWCore/Utilities/interface/propagate_const.h"
+
+#include
+
+class SiPixelDigisCUDA {
+public:
+ SiPixelDigisCUDA() = default;
+ explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream);
+ ~SiPixelDigisCUDA() = default;
+
+ SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete;
+ SiPixelDigisCUDA& operator=(const SiPixelDigisCUDA&) = delete;
+ SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default;
+ SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default;
+
+ uint16_t * xx() { return xx_d.get(); }
+ uint16_t * yy() { return yy_d.get(); }
+ uint16_t * adc() { return adc_d.get(); }
+ uint16_t * moduleInd() { return moduleInd_d.get(); }
+
+ uint16_t const *xx() const { return xx_d.get(); }
+ uint16_t const *yy() const { return yy_d.get(); }
+ uint16_t const *adc() const { return adc_d.get(); }
+ uint16_t const *moduleInd() const { return moduleInd_d.get(); }
+
+ uint16_t const *c_xx() const { return xx_d.get(); }
+ uint16_t const *c_yy() const { return yy_d.get(); }
+ uint16_t const *c_adc() const { return adc_d.get(); }
+ uint16_t const *c_moduleInd() const { return moduleInd_d.get(); }
+
+ class DeviceConstView {
+ public:
+ DeviceConstView() = default;
+
+#ifdef __CUDACC__
+ __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_+i); }
+ __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); }
+ __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); }
+ __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); }
+#endif
+
+ friend class SiPixelDigisCUDA;
+
+ private:
+ uint16_t const *xx_ = nullptr;
+ uint16_t const *yy_ = nullptr;
+ uint16_t const *adc_ = nullptr;
+ uint16_t const *moduleInd_ = nullptr;
+ };
+
+ const DeviceConstView *view() const { return view_d.get(); }
+
+private:
+ edm::cuda::device::unique_ptr xx_d; // local coordinates of each pixel
+ edm::cuda::device::unique_ptr yy_d; //
+ edm::cuda::device::unique_ptr adc_d; // ADC of each pixel
+ edm::cuda::device::unique_ptr moduleInd_d; // module id of each pixel
+ edm::cuda::device::unique_ptr view_d; // "me" pointer
+};
+
+#endif
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
new file mode 100644
index 0000000000000..5ba2e920e9b04
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
@@ -0,0 +1,24 @@
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include
+
+SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) {
+ edm::Service cs;
+
+ xx_d = cs->make_device_unique(nelements, stream);
+ yy_d = cs->make_device_unique(nelements, stream);
+ adc_d = cs->make_device_unique(nelements, stream);
+ moduleInd_d = cs->make_device_unique(nelements, stream);
+
+ auto view = cs->make_host_unique(stream);
+ view->xx_ = xx_d.get();
+ view->yy_ = yy_d.get();
+ view->adc_ = adc_d.get();
+ view->moduleInd_ = moduleInd_d.get();
+
+ view_d = cs->make_device_unique(stream);
+ cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
+}
diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h
index e4a07e55bd3c9..50e1fb51949e3 100644
--- a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h
+++ b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h
@@ -1,6 +1,8 @@
#ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h
#define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h
+#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
+#include "CUDADataFormats/Common/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h"
@@ -33,7 +35,7 @@ class SiPixelFedCablingMapGPUWrapper {
// operations on the device memory have completed.
class ModulesToUnpack {
public:
- ModulesToUnpack();
+ ModulesToUnpack(cuda::stream_t<>& cudaStream);
~ModulesToUnpack() = default;
void fillAsync(SiPixelFedCablingMap const& cablingMap, std::set const& modules, cuda::stream_t<>& cudaStream);
@@ -41,8 +43,8 @@ class SiPixelFedCablingMapGPUWrapper {
const unsigned char *get() const { return modToUnpDevice.get(); }
private:
- cuda::memory::device::unique_ptr modToUnpDevice;
- std::vector> modToUnpHost;
+ edm::cuda::device::unique_ptr modToUnpDevice;
+ edm::cuda::host::unique_ptr modToUnpHost;
};
private:
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
index 9db4a46f367b3..40a489f763397 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
@@ -7,6 +7,8 @@
+
+
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 7bd6eac473cc7..dc768ce8f643d 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -31,6 +31,8 @@
#include
// CMSSW includes
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
@@ -48,114 +50,17 @@ namespace pixelgpudetails {
// number of words for all the FEDs
constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
- constexpr uint32_t MAX_WORD08_SIZE = MAX_FED_WORDS * sizeof(uint8_t);
- constexpr uint32_t MAX_WORD32_SIZE = MAX_FED_WORDS * sizeof(uint32_t);
- constexpr uint32_t MAX_WORD16_SIZE = MAX_FED_WORDS * sizeof(uint16_t);
constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize;
- SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) {
-
- cudaCheck(cudaMallocHost(&word, MAX_FED_WORDS * sizeof(unsigned int)));
- cudaCheck(cudaMallocHost(&fedId_h, MAX_FED_WORDS * sizeof(unsigned char)));
-
- // to store the output of RawToDigi
- cudaCheck(cudaMallocHost(&pdigi_h, MAX_FED_WORDS * sizeof(uint32_t)));
- cudaCheck(cudaMallocHost(&rawIdArr_h, MAX_FED_WORDS * sizeof(uint32_t)));
-
- cudaCheck(cudaMallocHost(&adc_h, MAX_FED_WORDS * sizeof(uint16_t)));
- cudaCheck(cudaMallocHost(&clus_h, MAX_FED_WORDS * sizeof(int32_t)));
-
- cudaCheck(cudaMallocHost(&error_h, vsize));
- cudaCheck(cudaMallocHost(&error_h_tmp, vsize));
- cudaCheck(cudaMallocHost(&data_h, MAX_ERROR_SIZE));
-
- cudaCheck(cudaMalloc((void**) & word_d, MAX_WORD32_SIZE));
- cudaCheck(cudaMalloc((void**) & fedId_d, MAX_WORD08_SIZE));
- cudaCheck(cudaMalloc((void**) & pdigi_d, MAX_WORD32_SIZE)); // to store thepacked digi
- cudaCheck(cudaMalloc((void**) & xx_d, MAX_WORD16_SIZE)); // to store the x and y coordinate
- cudaCheck(cudaMalloc((void**) & yy_d, MAX_WORD16_SIZE));
- cudaCheck(cudaMalloc((void**) & adc_d, MAX_WORD16_SIZE));
-
- cudaCheck(cudaMalloc((void**) & moduleInd_d, MAX_WORD16_SIZE));
- cudaCheck(cudaMalloc((void**) & rawIdArr_d, MAX_WORD32_SIZE));
- cudaCheck(cudaMalloc((void**) & error_d, vsize));
- cudaCheck(cudaMalloc((void**) & data_d, MAX_ERROR_SIZE));
- cudaCheck(cudaMemset(data_d, 0x00, MAX_ERROR_SIZE));
-
- // for the clusterizer
- cudaCheck(cudaMalloc((void**) & clus_d, MAX_WORD32_SIZE)); // cluser index in module
-
- using namespace gpuClustering;
- cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
- cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
- cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) ));
-
- new (error_h) GPU::SimpleVector(MAX_FED_WORDS, data_h);
- new (error_h_tmp) GPU::SimpleVector(MAX_FED_WORDS, data_d);
- assert(error_h->size() == 0);
- assert(error_h->capacity() == static_cast(MAX_FED_WORDS));
- assert(error_h_tmp->size() == 0);
- assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS));
-
- // Need these in pinned memory to be truly asynchronous
- cudaCheck(cudaMallocHost(&nModulesActive, sizeof(uint32_t)));
- cudaCheck(cudaMallocHost(&nClusters, sizeof(uint32_t)));
-
- cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct)));
- gpuProduct = getProduct();
- assert(xx_d==gpuProduct.xx_d);
-
- cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
-
- // originally from rechits
- cudaCheck(cudaMalloc((void**) & clusModuleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
- uint32_t *tmp = nullptr;
- cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules));
- cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize));
+ SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) {
+ edm::Service cs;
+ word_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream);
+ fedId_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream);
}
- SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
- // free the host memory
- cudaCheck(cudaFreeHost(word));
- cudaCheck(cudaFreeHost(fedId_h));
- cudaCheck(cudaFreeHost(pdigi_h));
- cudaCheck(cudaFreeHost(rawIdArr_h));
- cudaCheck(cudaFreeHost(adc_h));
- cudaCheck(cudaFreeHost(clus_h));
- cudaCheck(cudaFreeHost(error_h));
- cudaCheck(cudaFreeHost(error_h_tmp));
- cudaCheck(cudaFreeHost(data_h));
- cudaCheck(cudaFreeHost(nModulesActive));
- cudaCheck(cudaFreeHost(nClusters));
-
- // free device memory used for RawToDigi on GPU
- // free the GPU memory
- cudaCheck(cudaFree(word_d));
- cudaCheck(cudaFree(fedId_d));
- cudaCheck(cudaFree(pdigi_d));
- cudaCheck(cudaFree(xx_d));
- cudaCheck(cudaFree(yy_d));
- cudaCheck(cudaFree(adc_d));
- cudaCheck(cudaFree(moduleInd_d));
- cudaCheck(cudaFree(rawIdArr_d));
- cudaCheck(cudaFree(error_d));
- cudaCheck(cudaFree(data_d));
-
- // these are for the clusterizer
- cudaCheck(cudaFree(moduleStart_d));
- cudaCheck(cudaFree(clus_d));
- cudaCheck(cudaFree(clusInModule_d));
- cudaCheck(cudaFree(moduleId_d));
- cudaCheck(cudaFree(gpuProduct_d));
-
- // originally from rechits
- cudaCheck(cudaFree(tempScanStorage_d));
- cudaCheck(cudaFree(clusModuleStart_d));
- }
-
- void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
- std::memcpy(word+wordCounterGPU, src, sizeof(cms_uint32_t)*length);
- std::memset(fedId_h+wordCounterGPU/2, fedId - 1200, length/2);
+ void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
+ std::memcpy(word_.get()+wordCounterGPU, src, sizeof(cms_uint32_t)*length);
+ std::memset(fedId_.get()+wordCounterGPU/2, fedId - 1200, length/2);
}
////////////////////
@@ -613,6 +518,7 @@ namespace pixelgpudetails {
const SiPixelFedCablingMapGPU *cablingMap,
const unsigned char *modToUnp,
const SiPixelGainForHLTonGPU *gains,
+ const WordFedAppender& wordFed,
const uint32_t wordCounter, const uint32_t fedCounter,
bool convertADCtoElectrons,
bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug,
@@ -620,52 +526,82 @@ namespace pixelgpudetails {
{
nDigis = wordCounter;
- const int threadsPerBlock = 512;
- const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all
-
- assert(0 == wordCounter%2);
- // wordCounter is the total no of words in each event to be trasfered on device
- cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
- cudaCheck(cudaMemcpyAsync(&fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id()));
- cudaCheck(cudaMemcpyAsync(error_d, error_h_tmp, vsize, cudaMemcpyDefault, stream.id()));
-
- // Launch rawToDigi kernel
- RawToDigi_kernel<<>>(
- cablingMap,
- modToUnp,
- wordCounter,
- word_d,
- fedId_d,
- xx_d, yy_d, adc_d,
- pdigi_d,
- rawIdArr_d,
- moduleInd_d,
- error_d,
- useQualityInfo,
- includeErrors,
- debug);
- cudaCheck(cudaGetLastError());
-
- // copy data to host variable
- if(transferToCPU) {
- cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
- cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-
- if (includeErrors) {
- cudaCheck(cudaMemcpyAsync(error_h, error_d, vsize, cudaMemcpyDefault, stream.id()));
- cudaCheck(cudaMemcpyAsync(data_h, data_d, MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id()));
- // If we want to transfer only the minimal amount of data, we
- // need a synchronization point. A single ExternalWork (of
- // SiPixelRawToClusterHeterogeneous) does not help because it is
- // already used to synchronize the data movement. So we'd need
- // two ExternalWorks (or explicit use of TBB tasks). The
- // prototype of #100 would allow this easily (as there would be
- // two ExternalWorks).
- //
- //error_h->set_data(data_h);
- //cudaCheck(cudaStreamSynchronize(stream.id()));
- //int size = error_h->size();
- //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id()));
+ constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
+ digis_d = SiPixelDigisCUDA(MAX_FED_WORDS, stream);
+ clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream);
+
+ edm::Service cs;
+ digis_clusters_h.nModules_Clusters = cs->make_host_unique(2, stream);
+
+ {
+ const int threadsPerBlock = 512;
+ const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all
+
+ assert(0 == wordCounter%2);
+ // wordCounter is the total no of words in each event to be trasfered on device
+ auto word_d = cs->make_device_unique(wordCounter, stream);
+ auto fedId_d = cs->make_device_unique(wordCounter, stream);
+
+ auto error_d = cs->make_device_unique>(stream);
+ auto data_d = cs->make_device_unique(MAX_FED_WORDS, stream);
+ cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id()));
+ auto error_h_tmp = cs->make_host_unique>(stream);
+ new (error_h_tmp.get()) GPU::SimpleVector(MAX_FED_WORDS, data_d.get()); // should make_host_unique() call the constructor as well? note that even if std::make_unique does that, we can't do that in make_device_unique
+ assert(error_h_tmp->size() == 0);
+ assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS));
+
+ cudaCheck(cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+ cudaCheck(cudaMemcpyAsync(fedId_d.get(), wordFed.fedId(), wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id()));
+ cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp.get(), vsize, cudaMemcpyDefault, stream.id()));
+
+ auto pdigi_d = cs->make_device_unique(wordCounter, stream);
+ auto rawIdArr_d = cs->make_device_unique(wordCounter, stream);
+
+ // Launch rawToDigi kernel
+ RawToDigi_kernel<<>>(
+ cablingMap,
+ modToUnp,
+ wordCounter,
+ word_d.get(),
+ fedId_d.get(),
+ digis_d.xx(), digis_d.yy(), digis_d.adc(),
+ pdigi_d.get(),
+ rawIdArr_d.get(),
+ digis_d.moduleInd(),
+ error_d.get(),
+ useQualityInfo,
+ includeErrors,
+ debug);
+ cudaCheck(cudaGetLastError());
+
+ // copy data to host variable
+ if(transferToCPU) {
+ digis_clusters_h.pdigi = cs->make_host_unique(MAX_FED_WORDS, stream);
+ digis_clusters_h.rawIdArr = cs->make_host_unique(MAX_FED_WORDS, stream);
+ cudaCheck(cudaMemcpyAsync(digis_clusters_h.pdigi.get(), pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+ cudaCheck(cudaMemcpyAsync(digis_clusters_h.rawIdArr.get(), rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+
+ if (includeErrors) {
+ digis_clusters_h.data = cs->make_host_unique(MAX_FED_WORDS, stream);
+ digis_clusters_h.error = cs->make_host_unique>(stream);
+ new (digis_clusters_h.error.get()) GPU::SimpleVector(MAX_FED_WORDS, digis_clusters_h.data.get());
+ assert(digis_clusters_h.error->size() == 0);
+ assert(digis_clusters_h.error->capacity() == static_cast(MAX_FED_WORDS));
+
+ cudaCheck(cudaMemcpyAsync(digis_clusters_h.error.get(), error_d.get(), vsize, cudaMemcpyDefault, stream.id()));
+ cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id()));
+ // If we want to transfer only the minimal amount of data, we
+ // need a synchronization point. A single ExternalWork (of
+ // SiPixelRawToClusterHeterogeneous) does not help because it is
+ // already used to synchronize the data movement. So we'd need
+ // two ExternalWorks (or explicit use of TBB tasks). The
+ // prototype of #100 would allow this easily (as there would be
+ // two ExternalWorks).
+ //
+ //cudaCheck(cudaStreamSynchronize(stream.id()));
+ //int size = digis_clusters_h.error->size();
+ //cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), size*esize, cudaMemcpyDefault, stream.id()));
+ }
}
}
// End of Raw2Digi and passing data for cluserisation
@@ -677,15 +613,16 @@ namespace pixelgpudetails {
int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;
gpuCalibPixel::calibDigis<<>>(
- moduleInd_d,
- xx_d, yy_d, adc_d,
+ digis_d.moduleInd(),
+ digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(),
gains,
wordCounter);
cudaCheck(cudaGetLastError());
// calibrated adc
if(transferToCPU) {
- cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
+ digis_clusters_h.adc = cs->make_host_unique(MAX_FED_WORDS, stream);
+ cudaCheck(cudaMemcpyAsync(digis_clusters_h.adc.get(), digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
}
#ifdef GPU_DEBUG
@@ -694,13 +631,13 @@ namespace pixelgpudetails {
<< " blocks of " << threadsPerBlock << " threads\n";
#endif
- cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id()));
+ cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id()));
- countModules<<>>(moduleInd_d, moduleStart_d, clus_d, wordCounter);
+ countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), clusters_d.clus(), wordCounter);
cudaCheck(cudaGetLastError());
// read the number of modules into a data member, used by getProduct())
- cudaCheck(cudaMemcpyAsync(nModulesActive, moduleStart_d, sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+ cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
threadsPerBlock = 256;
blocks = MaxNumModules;
@@ -708,23 +645,23 @@ namespace pixelgpudetails {
std::cout << "CUDA findClus kernel launch with " << blocks
<< " blocks of " << threadsPerBlock << " threads\n";
#endif
- cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id()));
+ cudaCheck(cudaMemsetAsync(clusters_d.clusInModule(), 0, (MaxNumModules)*sizeof(uint32_t), stream.id()));
findClus<<>>(
- moduleInd_d,
- xx_d, yy_d,
- moduleStart_d,
- clusInModule_d, moduleId_d,
- clus_d,
+ digis_d.c_moduleInd(),
+ digis_d.c_xx(), digis_d.c_yy(),
+ clusters_d.c_moduleStart(),
+ clusters_d.clusInModule(), clusters_d.moduleId(),
+ clusters_d.clus(),
wordCounter);
cudaCheck(cudaGetLastError());
// apply charge cut
clusterChargeCut<<>>(
- moduleInd_d,
- adc_d,
- moduleStart_d,
- clusInModule_d, moduleId_d,
- clus_d,
+ digis_d.moduleInd(),
+ digis_d.c_adc(),
+ clusters_d.c_moduleStart(),
+ clusters_d.clusInModule(), clusters_d.c_moduleId(),
+ clusters_d.clus(),
wordCounter);
cudaCheck(cudaGetLastError());
@@ -735,19 +672,27 @@ namespace pixelgpudetails {
// available in the rechit producer without additional points of
// synchronization/ExternalWork
//
+ // Temporary storage
+ size_t tempScanStorageSize = 0;
+ {
+ uint32_t *tmp = nullptr;
+ cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules));
+ }
+ auto tempScanStorage_d = cs->make_device_unique(tempScanStorageSize, stream);
// Set first the first element to 0
- cudaCheck(cudaMemsetAsync(clusModuleStart_d, 0, sizeof(uint32_t), stream.id()));
+ cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id()));
// Then use inclusive_scan to get the partial sum to the rest
- cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize,
- clusInModule_d, &clusModuleStart_d[1], gpuClustering::MaxNumModules,
+ cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d.get(), tempScanStorageSize,
+ clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules,
stream.id()));
// last element holds the number of all clusters
- cudaCheck(cudaMemcpyAsync(nClusters, clusModuleStart_d+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+ cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
// clusters
if(transferToCPU) {
- cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+ digis_clusters_h.clus = cs->make_host_unique(MAX_FED_WORDS, stream);
+ cudaCheck(cudaMemcpyAsync(digis_clusters_h.clus.get(), clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
}
} // end clusterizer scope
}
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index ca8bd73106c2c..a2d9cdda92573 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -5,6 +5,7 @@
#include
#include "cuda/api_wrappers.h"
+#include "CUDADataFormats/Common/interface/host_unique_ptr.h"
#include "FWCore/Utilities/interface/typedefs.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
#include "siPixelRawToClusterHeterogeneousProduct.h"
@@ -159,8 +160,43 @@ namespace pixelgpudetails {
using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct;
- SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream);
- ~SiPixelRawToClusterGPUKernel();
+ struct CPUData {
+ CPUData() = default;
+ ~CPUData() = default;
+
+ CPUData(const CPUData&) = delete;
+ CPUData& operator=(const CPUData&) = delete;
+ CPUData(CPUData&&) = default;
+ CPUData& operator=(CPUData&&) = default;
+
+ edm::cuda::host::unique_ptr nModules_Clusters; // These should really be part of the GPU product
+
+ edm::cuda::host::unique_ptr data;
+ edm::cuda::host::unique_ptr> error;
+
+ edm::cuda::host::unique_ptr pdigi;
+ edm::cuda::host::unique_ptr rawIdArr;
+ edm::cuda::host::unique_ptr adc;
+ edm::cuda::host::unique_ptr clus;
+ };
+
+ class WordFedAppender {
+ public:
+ WordFedAppender(cuda::stream_t<>& cudaStream);
+ ~WordFedAppender() = default;
+
+ void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
+
+ const unsigned int *word() const { return word_.get(); }
+ const unsigned char *fedId() const { return fedId_.get(); }
+
+ private:
+ edm::cuda::host::unique_ptr word_;
+ edm::cuda::host::unique_ptr fedId_;
+ };
+
+ SiPixelRawToClusterGPUKernel() = default;
+ ~SiPixelRawToClusterGPUKernel() = default;
SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete;
@@ -168,69 +204,35 @@ namespace pixelgpudetails {
SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete;
SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete;
- void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
-
void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp,
const SiPixelGainForHLTonGPU *gains,
+ const WordFedAppender& wordFed,
const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons,
bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug,
cuda::stream_t<>& stream);
- auto getProduct() {
- error_h->set_data(data_h);
- return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
- pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
- gpuProduct_d,
- xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
- clusModuleStart_d,
- nDigis, *nModulesActive, *nClusters
- };
+ siPixelRawToClusterHeterogeneousProduct::GPUProduct getProduct() {
+ return siPixelRawToClusterHeterogeneousProduct::GPUProduct(
+ std::move(digis_d), std::move(clusters_d),
+ nDigis,
+ digis_clusters_h.nModules_Clusters[0],
+ digis_clusters_h.nModules_Clusters[1]
+ );
}
- private:
- // input
- unsigned int *word = nullptr; // to hold input for rawtodigi
- unsigned char *fedId_h = nullptr; // to hold fed index for each word
-
- // output
- GPUProduct gpuProduct;
- GPUProduct * gpuProduct_d;
-
- // FIXME cleanup all these are in the gpuProduct above...
-
- uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output
- uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output
- pixelgpudetails::error_obj *data_h = nullptr;
- GPU::SimpleVector *error_h = nullptr;
- GPU::SimpleVector *error_h_tmp = nullptr;
+ CPUData&& getCPUData() {
+ return std::move(digis_clusters_h);
+ }
+ private:
uint32_t nDigis = 0;
- uint32_t *nModulesActive = nullptr;
- uint32_t *nClusters = nullptr;
-
- // scratch memory buffers
- uint32_t * word_d;
- uint8_t * fedId_d;
- uint32_t * pdigi_d;
- uint16_t * xx_d;
- uint16_t * yy_d;
- uint16_t * adc_d;
- uint16_t * moduleInd_d;
- uint32_t * rawIdArr_d;
- GPU::SimpleVector * error_d;
- error_obj * data_d;
-
- // these are for the clusterizer (to be moved)
- uint32_t * moduleStart_d;
- int32_t * clus_d;
- uint32_t * clusInModule_d;
- uint32_t * moduleId_d;
+ // CPU data
+ CPUData digis_clusters_h;
- // originally in rechit, moved here
- uint32_t *clusModuleStart_d = nullptr;
- void *tempScanStorage_d = nullptr;
- size_t tempScanStorageSize = 0;
+ // Data to be put in the event
+ SiPixelDigisCUDA digis_d;
+ SiPixelClustersCUDA clusters_d;
};
// configuration and memory buffers alocated on the GPU
diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
index df2f488fe488c..1f872f3cb7464 100644
--- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
@@ -14,6 +14,7 @@
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "Geometry/CommonDetUnit/interface/GeomDetType.h"
#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h"
@@ -123,10 +124,11 @@ const SiPixelFedCablingMapGPU *SiPixelFedCablingMapGPUWrapper::getGPUProductAsyn
return data.cablingMapDevice;
}
-SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::ModulesToUnpack():
- modToUnpDevice(cuda::memory::device::make_unique(cuda::device::current::get().id(), pixelgpudetails::MAX_SIZE)),
- modToUnpHost(pixelgpudetails::MAX_SIZE)
+SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::ModulesToUnpack(cuda::stream_t<>& cudaStream)
{
+ edm::Service cs;
+ modToUnpDevice = cs->make_device_unique(pixelgpudetails::MAX_SIZE, cudaStream);
+ modToUnpHost = cs->make_host_unique(pixelgpudetails::MAX_SIZE, cudaStream);
}
void SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::fillAsync(SiPixelFedCablingMap const& cablingMap, std::set const& modules, cuda::stream_t<>& cudaStream) {
@@ -154,7 +156,7 @@ void SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::fillAsync(SiPixelFedCablin
}
}
- cuda::memory::async::copy(modToUnpDevice.get(), modToUnpHost.data(), modToUnpHost.size() * sizeof(unsigned char), cudaStream.id());
+ cuda::memory::async::copy(modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaStream.id());
}
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index c63466f157a1b..333898eaa601a 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -127,10 +127,10 @@ namespace pixelgpudetails {
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
bool transferToCPU,
cuda::stream_t<>& stream) {
- cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
- gpu_.hitsModuleStart_d = input.clusModuleStart_d;
- gpu_.cpeParams = cpeParams; // copy it for use in clients
- cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
+ cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
+ gpu_.hitsModuleStart_d = input.clusters_d.clusModuleStart();
+ gpu_.cpeParams = cpeParams; // copy it for use in clients
+ cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
int threadsPerBlock = 256;
int blocks = input.nModules; // active modules (with digis)
@@ -141,11 +141,11 @@ namespace pixelgpudetails {
gpuPixelRecHits::getHits<<>>(
cpeParams,
gpu_.bs_d,
- input.moduleInd_d,
- input.xx_d, input.yy_d, input.adc_d,
- input.moduleStart_d,
- input.clusInModule_d, input.moduleId_d,
- input.clus_d,
+ input.digis_d.moduleInd(),
+ input.digis_d.xx(), input.digis_d.yy(), input.digis_d.adc(),
+ input.clusters_d.moduleStart(),
+ input.clusters_d.clusInModule(), input.clusters_d.moduleId(),
+ input.clusters_d.clus(),
input.nDigis,
gpu_.hitsModuleStart_d,
gpu_.charge_d,