Add infrastructure around cub CachingDeviceAllocator, and use it in SiPixelRawToCluster #172

Merged: 19 commits, Nov 27, 2018
16 changes: 16 additions & 0 deletions CUDADataFormats/Common/interface/device_unique_ptr.h
@@ -0,0 +1,16 @@
#ifndef CUDADataFormats_Common_interface_device_unique_ptr_h
#define CUDADataFormats_Common_interface_device_unique_ptr_h

#include <memory>
#include <functional>

namespace edm {
  namespace cuda {
    namespace device {
      template <typename T>
      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
    }
  }
}

#endif
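
A minimal sketch (not part of this PR) of why the deleter is a type-erased std::function: any callable can return the memory to whatever allocator produced it. Plain cudaFree is used here only for illustration; CUDAService instead binds a lambda that hands the block back to its caching allocator.

#include <cstddef>
#include <cuda_runtime.h>
#include "CUDADataFormats/Common/interface/device_unique_ptr.h"

edm::cuda::device::unique_ptr<float[]> makePlainDeviceBuffer(size_t n) {
  float *mem = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&mem), n * sizeof(float));  // plain allocation, illustration only
  return edm::cuda::device::unique_ptr<float[]>(mem, [](void *ptr) { cudaFree(ptr); });
}
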
16 changes: 16 additions & 0 deletions CUDADataFormats/Common/interface/host_unique_ptr.h
@@ -0,0 +1,16 @@
#ifndef CUDADataFormats_Common_interface_host_unique_ptr_h
#define CUDADataFormats_Common_interface_host_unique_ptr_h

#include <memory>
#include <functional>

namespace edm {
  namespace cuda {
    namespace host {
      template <typename T>
      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
    }
  }
}

#endif
8 changes: 8 additions & 0 deletions CUDADataFormats/SiPixelCluster/BuildFile.xml
@@ -0,0 +1,8 @@
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>

<export>
<lib name="1"/>
</export>

73 changes: 73 additions & 0 deletions CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
@@ -0,0 +1,73 @@
#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h

#include "CUDADataFormats/Common/interface/device_unique_ptr.h"

#include <cuda/api_wrappers.h>

class SiPixelClustersCUDA {
public:
SiPixelClustersCUDA() = default;
explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream);
~SiPixelClustersCUDA() = default;

SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete;
SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete;
SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default;
SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default;

uint32_t *moduleStart() { return moduleStart_d.get(); }
int32_t *clus() { return clus_d.get(); }
uint32_t *clusInModule() { return clusInModule_d.get(); }
uint32_t *moduleId() { return moduleId_d.get(); }
uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }

uint32_t const *moduleStart() const { return moduleStart_d.get(); }
int32_t const *clus() const { return clus_d.get(); }
uint32_t const *clusInModule() const { return clusInModule_d.get(); }
uint32_t const *moduleId() const { return moduleId_d.get(); }
uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }

uint32_t const *c_moduleStart() const { return moduleStart_d.get(); }
int32_t const *c_clus() const { return clus_d.get(); }
uint32_t const *c_clusInModule() const { return clusInModule_d.get(); }
uint32_t const *c_moduleId() const { return moduleId_d.get(); }
uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); }

class DeviceConstView {
public:
DeviceConstView() = default;

#ifdef __CUDACC__
__device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); }
__device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); }
__device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); }
__device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); }
__device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); }
#endif

friend SiPixelClustersCUDA;

private:
uint32_t const *moduleStart_ = nullptr;
int32_t const *clus_ = nullptr;
uint32_t const *clusInModule_ = nullptr;
uint32_t const *moduleId_ = nullptr;
uint32_t const *clusModuleStart_ = nullptr;
};

DeviceConstView *view() const { return view_d.get(); }

private:
edm::cuda::device::unique_ptr<uint32_t[]> moduleStart_d; // index of the first pixel of each module
edm::cuda::device::unique_ptr<int32_t[]> clus_d; // cluster id of each pixel
edm::cuda::device::unique_ptr<uint32_t[]> clusInModule_d; // number of clusters found in each module
edm::cuda::device::unique_ptr<uint32_t[]> moduleId_d; // module id of each module

// originally from rechits
edm::cuda::device::unique_ptr<uint32_t[]> clusModuleStart_d;

edm::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
};

#endif
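
A hypothetical kernel sketch (not in this PR) showing how DeviceConstView is meant to be consumed: a single pointer is passed to the kernel, and the __ldg accessors read the arrays through the read-only data cache.

#include <cstdint>
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"

__global__ void countClusters(SiPixelClustersCUDA::DeviceConstView const *clusters,
                              uint32_t nModules, uint32_t *nClustersTotal) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < static_cast<int>(nModules)) {
    atomicAdd(nClustersTotal, clusters->clusInModule(i));  // sum clusters over all modules
  }
}

On the host side such a kernel would be launched with something like countClusters<<<blocks, threads, 0, stream.id()>>>(clusters.view(), nModules, d_total), where view() returns the device-side copy of the struct filled in the constructor below.
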
24 changes: 24 additions & 0 deletions CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
@@ -0,0 +1,24 @@
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) {
edm::Service<CUDAService> cs;

moduleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
clus_d = cs->make_device_unique< int32_t[]>(feds, stream);
clusInModule_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
moduleId_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
clusModuleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);

auto view = cs->make_host_unique<DeviceConstView>(stream);
view->moduleStart_ = moduleStart_d.get();
view->clus_ = clus_d.get();
view->clusInModule_ = clusInModule_d.get();
view->moduleId_ = moduleId_d.get();
view->clusModuleStart_ = clusModuleStart_d.get();

view_d = cs->make_device_unique<DeviceConstView>(stream);
cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
}
7 changes: 7 additions & 0 deletions CUDADataFormats/SiPixelDigi/BuildFile.xml
@@ -0,0 +1,7 @@
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>

<export>
<lib name="1"/>
</export>
65 changes: 65 additions & 0 deletions CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
@@ -0,0 +1,65 @@
#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h

#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
#include "FWCore/Utilities/interface/propagate_const.h"

#include <cuda/api_wrappers.h>

class SiPixelDigisCUDA {
public:
SiPixelDigisCUDA() = default;
explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream);
~SiPixelDigisCUDA() = default;

SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete;
SiPixelDigisCUDA& operator=(const SiPixelDigisCUDA&) = delete;
SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default;
SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default;

uint16_t * xx() { return xx_d.get(); }
uint16_t * yy() { return yy_d.get(); }
uint16_t * adc() { return adc_d.get(); }
uint16_t * moduleInd() { return moduleInd_d.get(); }

uint16_t const *xx() const { return xx_d.get(); }
uint16_t const *yy() const { return yy_d.get(); }
uint16_t const *adc() const { return adc_d.get(); }
uint16_t const *moduleInd() const { return moduleInd_d.get(); }

uint16_t const *c_xx() const { return xx_d.get(); }
uint16_t const *c_yy() const { return yy_d.get(); }
uint16_t const *c_adc() const { return adc_d.get(); }
uint16_t const *c_moduleInd() const { return moduleInd_d.get(); }

class DeviceConstView {
public:
DeviceConstView() = default;

#ifdef __CUDACC__
__device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_+i); }
__device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); }
__device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); }
__device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); }
#endif

friend class SiPixelDigisCUDA;

private:
uint16_t const *xx_ = nullptr;
uint16_t const *yy_ = nullptr;
uint16_t const *adc_ = nullptr;
uint16_t const *moduleInd_ = nullptr;
};

const DeviceConstView *view() const { return view_d.get(); }

private:
edm::cuda::device::unique_ptr<uint16_t[]> xx_d; // local coordinates of each pixel
edm::cuda::device::unique_ptr<uint16_t[]> yy_d; //
edm::cuda::device::unique_ptr<uint16_t[]> adc_d; // ADC of each pixel
edm::cuda::device::unique_ptr<uint16_t[]> moduleInd_d; // module id of each pixel
edm::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
};

#endif
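
A hypothetical host-side snippet (not in this PR): the non-const accessors expose the raw device pointers for filling via asynchronous copies, while kernels consume the data through view() or the c_* const accessors.

#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>
#include <cuda/api_wrappers.h>
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"

void fillDigiCoordinates(SiPixelDigisCUDA& digis, uint16_t const *xx_host,
                         uint16_t const *yy_host, size_t nDigis, cuda::stream_t<>& stream) {
  // stage the local coordinates onto the device, ordered on the product's stream
  cudaMemcpyAsync(digis.xx(), xx_host, nDigis * sizeof(uint16_t),
                  cudaMemcpyHostToDevice, stream.id());
  cudaMemcpyAsync(digis.yy(), yy_host, nDigis * sizeof(uint16_t),
                  cudaMemcpyHostToDevice, stream.id());
}
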
24 changes: 24 additions & 0 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
@@ -0,0 +1,24 @@
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

#include <cuda_runtime.h>

SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) {
edm::Service<CUDAService> cs;

xx_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
yy_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
adc_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
moduleInd_d = cs->make_device_unique<uint16_t[]>(nelements, stream);

auto view = cs->make_host_unique<DeviceConstView>(stream);
view->xx_ = xx_d.get();
view->yy_ = yy_d.get();
view->adc_ = adc_d.get();
view->moduleInd_ = moduleInd_d.get();

view_d = cs->make_device_unique<DeviceConstView>(stream);
cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
}
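
A hypothetical helper (not in this PR) sketching how a producer could create both event products on the same CUDA stream, so that their allocations, the staging copies of the "me" pointers, and any later kernels are all ordered on that stream. The parameter names are illustrative.

#include <cstddef>
#include <utility>
#include <cuda/api_wrappers.h>
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"

std::pair<SiPixelDigisCUDA, SiPixelClustersCUDA>
makeDeviceProducts(size_t maxDigis, size_t nModules, cuda::stream_t<>& stream) {
  SiPixelDigisCUDA digis(maxDigis, stream);                  // per-pixel arrays
  SiPixelClustersCUDA clusters(maxDigis, nModules, stream);  // (feds, nelements) as in the constructor above
  return {std::move(digis), std::move(clusters)};            // both classes are move-only
}
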
16 changes: 10 additions & 6 deletions HeterogeneousCore/CUDACore/src/GPUCuda.cc
@@ -74,15 +74,19 @@ namespace heterogeneous {
waitingTaskHolder, // copy needed for the catch block
locationSetter = iEvent.locationSetter()
](cuda::stream::id_t streamId, cuda::status_t status) mutable {
if(status == cudaSuccess) {
if (status == cudaSuccess) {
locationSetter(HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, deviceId));
LogTrace("GPUCuda") << " GPU kernel finished (in callback) device " << deviceId << " CUDA stream " << streamId;
waitingTaskHolder.doneWaiting(nullptr);
}
else {
auto error = cudaGetErrorName(status);
auto message = cudaGetErrorString(status);
waitingTaskHolder.doneWaiting(std::make_exception_ptr(cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId << " error " << error << ": " << message));
} else {
// wrap the exception in a try-catch block to let GDB "catch throw" break on it
try {
auto error = cudaGetErrorName(status);
auto message = cudaGetErrorString(status);
throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId << " error " << error << ": " << message;
} catch(...) {
waitingTaskHolder.doneWaiting(std::current_exception());
}
}
});
} catch(...) {
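
A standalone illustration (not framework code) of the pattern above: throwing inside an explicit try block lets a debugger breakpoint on "catch throw" stop at the failure site, while the exception is still handed to the consumer via std::current_exception() instead of unwinding out of the CUDA callback.

#include <exception>
#include <stdexcept>
#include <string>

void reportCudaError(const char *error, const char *message, std::exception_ptr &out) {
  try {
    throw std::runtime_error(std::string("CUDA error ") + error + ": " + message);
  } catch (...) {
    out = std::current_exception();  // delivered to the waiting consumer by the caller
  }
}
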
2 changes: 2 additions & 0 deletions HeterogeneousCore/CUDAServices/BuildFile.xml
@@ -2,8 +2,10 @@
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/MessageLogger"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
<use name="cub"/>

<export>
<lib name="1"/>
87 changes: 87 additions & 0 deletions HeterogeneousCore/CUDAServices/interface/CUDAService.h
@@ -4,14 +4,35 @@
#include <utility>
#include <vector>

#include <cuda/api_wrappers.h>

#include "FWCore/Utilities/interface/StreamID.h"

#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
#include "CUDADataFormats/Common/interface/host_unique_ptr.h"

namespace edm {
class ParameterSet;
class ActivityRegistry;
class ConfigurationDescriptions;
}

namespace cudaserviceimpl {
template <typename T>
struct make_device_unique_selector { using non_array = edm::cuda::device::unique_ptr<T>; };
template <typename T>
struct make_device_unique_selector<T[]> { using unbounded_array = edm::cuda::device::unique_ptr<T[]>; };
template <typename T, size_t N>
struct make_device_unique_selector<T[N]> { struct bounded_array {}; };

template <typename T>
struct make_host_unique_selector { using non_array = edm::cuda::host::unique_ptr<T>; };
template <typename T>
struct make_host_unique_selector<T[]> { using unbounded_array = edm::cuda::host::unique_ptr<T[]>; };
template <typename T, size_t N>
struct make_host_unique_selector<T[N]> { struct bounded_array {}; };
}

/**
* TODO:
* - CUDA stream management?
@@ -47,7 +68,73 @@ class CUDAService {
// Get the current device
int getCurrentDevice() const;

// Allocate device memory
template <typename T>
typename cudaserviceimpl::make_device_unique_selector<T>::non_array
make_device_unique(cuda::stream_t<>& stream) {
int dev = getCurrentDevice();
void *mem = allocate_device(dev, sizeof(T), stream);
return typename cudaserviceimpl::make_device_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
[this, dev](void *ptr) {
this->free_device(dev, ptr);
});
}

template <typename T>
typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array
make_device_unique(size_t n, cuda::stream_t<>& stream) {
int dev = getCurrentDevice();
using element_type = typename std::remove_extent<T>::type;
void *mem = allocate_device(dev, n*sizeof(element_type), stream);
return typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
[this, dev](void *ptr) {
this->free_device(dev, ptr);
});
}

template <typename T, typename ...Args>
typename cudaserviceimpl::make_device_unique_selector<T>::bounded_array
make_device_unique(Args&&...) = delete;

// Allocate pinned host memory
template <typename T>
typename cudaserviceimpl::make_host_unique_selector<T>::non_array
make_host_unique(cuda::stream_t<>& stream) {
void *mem = allocate_host(sizeof(T), stream);
return typename cudaserviceimpl::make_host_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
[this](void *ptr) {
this->free_host(ptr);
});
}

template <typename T>
typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array
make_host_unique(size_t n, cuda::stream_t<>& stream) {
using element_type = typename std::remove_extent<T>::type;
void *mem = allocate_host(n*sizeof(element_type), stream);
return typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
[this](void *ptr) {
this->free_host(ptr);
});
}

template <typename T, typename ...Args>
typename cudaserviceimpl::make_host_unique_selector<T>::bounded_array
make_host_unique(Args&&...) = delete;

// Free device memory (to be called from unique_ptr)
void free_device(int device, void *ptr);

// Free pinned host memory (to be called from unique_ptr)
void free_host(void *ptr);

private:
// PIMPL to hide details of allocator
struct Allocator;
std::unique_ptr<Allocator> allocator_;
void *allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream);
void *allocate_host(size_t nbytes, cuda::stream_t<>& stream);

int numberOfDevices_ = 0;
unsigned int numberOfStreamsTotal_ = 0;
std::vector<std::pair<int, int>> computeCapabilities_;
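
A hypothetical usage sketch (not in this PR) of the new CUDAService API: a pinned host staging buffer and a device buffer are obtained through the service, which routes the allocations through the cub-based caching allocator, and the returned unique_ptrs give the blocks back to the cache on destruction. The function name and the final stream.synchronize() are illustrative assumptions, not framework code.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>
#include <cuda/api_wrappers.h>
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

edm::cuda::device::unique_ptr<uint32_t[]>
stageWords(uint32_t const *src, size_t nWords, cuda::stream_t<>& stream) {
  edm::Service<CUDAService> cs;

  auto words_h = cs->make_host_unique<uint32_t[]>(nWords, stream);    // pinned host staging buffer
  auto words_d = cs->make_device_unique<uint32_t[]>(nWords, stream);  // device buffer on the current device

  std::copy(src, src + nWords, words_h.get());
  cudaMemcpyAsync(words_d.get(), words_h.get(), nWords * sizeof(uint32_t),
                  cudaMemcpyHostToDevice, stream.id());

  // synchronize only to keep this sketch self-contained; real code would keep
  // words_h alive (or rely on the caching allocator) until the copy has completed
  stream.synchronize();
  return words_d;
}
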