Require allocated type to have only a trivial constructor for make_device_unique and make_host_unique #204

Merged
10 changes: 5 additions & 5 deletions CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
@@ -49,11 +49,11 @@ class SiPixelClustersCUDA {
friend SiPixelClustersCUDA;

private:
uint32_t const *moduleStart_ = nullptr;
int32_t const *clus_ = nullptr;
uint32_t const *clusInModule_ = nullptr;
uint32_t const *moduleId_ = nullptr;
uint32_t const *clusModuleStart_ = nullptr;
uint32_t const *moduleStart_;

why do you remove the nullptr initialisation ?

Author

Because those lead to the constructor being non-trivial (since it would do some initialization).

Author

#include <iostream>
#include <type_traits>
class Xyz {
  int bar = 0;
};
int main(void) {
  std::cout << "Xyz " << std::is_trivially_constructible_v<Xyz> << std::endl;
  return 0;
}

gives

Xyz 0


Yes, of course, sorry (trivial really means it does nothing...).
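
For completeness, a minimal counterpart sketch (hypothetical Abc class, not from this thread): dropping the in-class initializer is exactly what makes the trait true again, which is what removing the = nullptr initialisers above achieves.

// Counterpart to the Xyz example above (hypothetical Abc, not PR code): without
// the in-class initializer the default constructor is trivial again.
#include <iostream>
#include <type_traits>

class Abc {
  int bar;  // no "= 0" here
};

int main() {
  std::cout << "Abc " << std::is_trivially_constructible_v<Abc> << std::endl;  // prints "Abc 1"
  return 0;
}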

int32_t const *clus_;
uint32_t const *clusInModule_;
uint32_t const *moduleId_;
uint32_t const *clusModuleStart_;
};

DeviceConstView *view() const { return view_d.get(); }
8 changes: 4 additions & 4 deletions CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
@@ -46,10 +46,10 @@ class SiPixelDigisCUDA {
friend class SiPixelDigisCUDA;

private:
uint16_t const *xx_ = nullptr;
uint16_t const *yy_ = nullptr;
uint16_t const *adc_ = nullptr;
uint16_t const *moduleInd_ = nullptr;
uint16_t const *xx_;
uint16_t const *yy_;
uint16_t const *adc_;
uint16_t const *moduleInd_;
};

const DeviceConstView *view() const { return view_d.get(); }
6 changes: 5 additions & 1 deletion HeterogeneousCore/CUDAServices/interface/CUDAService.h
@@ -72,6 +72,7 @@ class CUDAService {
template <typename T>
typename cudaserviceimpl::make_device_unique_selector<T>::non_array
make_device_unique(cuda::stream_t<>& stream) {
static_assert(std::is_trivially_constructible<T>::value, "Allocating with non-trivial constructor on the device memory is not supported");

Would it make sense to fail the instantiation using std::enable_if_t<std::is_trivially_constructible<T>::value, typename ...> as the return type, instead of using a static_assert?

Pro: the templates can be used with SFINAE.
Con: the error message may be harder to understand

Author

I'm probably not woken up enough, but what would the benefit of SFINAE be with these?


Right, no, I don't think there is any advantage...
I guess we could start playing with concepts (later).
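
A standalone sketch (toy allocator returning std::unique_ptr, not the CUDAService API) contrasting the two options discussed here: the static_assert used in this PR gives a readable hard error, while an enable_if_t return type would drop the overload via SFINAE, so a non-trivially-constructible T would simply find no matching function.

#include <memory>
#include <type_traits>

struct Trivial    { int x; };
struct NonTrivial { int x = 0; };  // in-class initializer -> non-trivial default constructor

// Variant chosen in this PR: hard error with an explicit message.
template <typename T>
std::unique_ptr<T> make_checked_assert() {
  static_assert(std::is_trivially_constructible<T>::value,
                "Allocating with non-trivial constructor is not supported");
  return std::unique_ptr<T>(new T);
}

// Reviewer's alternative: the overload does not participate for non-trivial T.
template <typename T>
std::enable_if_t<std::is_trivially_constructible<T>::value, std::unique_ptr<T>>
make_checked_sfinae() {
  return std::unique_ptr<T>(new T);
}

int main() {
  auto a = make_checked_assert<Trivial>();  // OK
  auto b = make_checked_sfinae<Trivial>();  // OK
  // make_checked_assert<NonTrivial>();     // error: static assertion failed
  // make_checked_sfinae<NonTrivial>();     // error: no matching function (SFINAE)
  return (a && b) ? 0 : 1;
}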

int dev = getCurrentDevice();
void *mem = allocate_device(dev, sizeof(T), stream);
return typename cudaserviceimpl::make_device_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
@@ -83,8 +84,9 @@
template <typename T>
typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array
make_device_unique(size_t n, cuda::stream_t<>& stream) {
int dev = getCurrentDevice();
using element_type = typename std::remove_extent<T>::type;
static_assert(std::is_trivially_constructible<element_type>::value, "Allocating with non-trivial constructor on the device memory is not supported");
int dev = getCurrentDevice();
void *mem = allocate_device(dev, n*sizeof(element_type), stream);
return typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
[this, dev](void *ptr) {
@@ -100,6 +102,7 @@
template <typename T>
typename cudaserviceimpl::make_host_unique_selector<T>::non_array
make_host_unique(cuda::stream_t<>& stream) {
static_assert(std::is_trivially_constructible<T>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
void *mem = allocate_host(sizeof(T), stream);
return typename cudaserviceimpl::make_host_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
[this](void *ptr) {
@@ -111,6 +114,7 @@
typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array
make_host_unique(size_t n, cuda::stream_t<>& stream) {
using element_type = typename std::remove_extent<T>::type;
static_assert(std::is_trivially_constructible<element_type>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
void *mem = allocate_host(n*sizeof(element_type), stream);
return typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
[this](void *ptr) {
31 changes: 23 additions & 8 deletions HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h
@@ -9,16 +9,15 @@
#include <cuda.h>

namespace GPU {

template <class T> struct SimpleVector {
// Constructors
constexpr SimpleVector(int capacity, T *data) // ownership of m_data stays within the caller
: m_size(0), m_capacity(capacity), m_data(data)
{
static_assert(std::is_trivially_destructible<T>::value);
}
constexpr SimpleVector() = default;

constexpr SimpleVector() : SimpleVector(0) {}
// ownership of m_data stays within the caller
constexpr void construct(int capacity, T *data) {
m_size = 0;
m_capacity = capacity;
m_data = data;
}

inline constexpr int push_back_unsafe(const T &element) {
auto previousSize = m_size;
@@ -98,6 +97,22 @@ template <class T> struct SimpleVector {
T *m_data;
};

// ownership of m_data stays within the caller
template <class T>
SimpleVector<T> make_SimpleVector(int capacity, T *data) {
SimpleVector<T> ret;
ret.construct(capacity, data);
return ret;
}

// ownership of m_data stays within the caller
template <class T>
SimpleVector<T> *make_SimpleVector(SimpleVector<T> *mem, int capacity, T *data) {
auto ret = new(mem) SimpleVector<T>();
ret->construct(capacity, data);
return ret;
}

} // namespace GPU

#endif // HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h
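
A standalone sketch (toy TinyVector, not the real GPU::SimpleVector) of the pattern introduced above: a defaulted constructor plus a separate construct() keeps the type trivially constructible, so it passes the static_asserts added to make_device_unique and make_host_unique, while the make_* helpers restore one-call construction.

#include <cassert>
#include <new>
#include <type_traits>

template <class T>
struct TinyVector {
  TinyVector() = default;                  // no member initializers -> trivial
  void construct(int capacity, T *data) {  // ownership of data stays with the caller
    m_size = 0;
    m_capacity = capacity;
    m_data = data;
  }
  int m_size;
  int m_capacity;
  T *m_data;
};

static_assert(std::is_trivially_constructible<TinyVector<int>>::value,
              "would pass the new static_asserts in CUDAService");

template <class T>
TinyVector<T> make_TinyVector(int capacity, T *data) {
  TinyVector<T> ret;
  ret.construct(capacity, data);
  return ret;
}

template <class T>
TinyVector<T> *make_TinyVector(TinyVector<T> *mem, int capacity, T *data) {
  auto ret = new (mem) TinyVector<T>();    // placement new into caller-owned memory
  ret->construct(capacity, data);
  return ret;
}

int main() {
  int buffer[16];
  auto v = make_TinyVector(16, buffer);                                              // by value
  alignas(TinyVector<int>) unsigned char raw[sizeof(TinyVector<int>)];
  auto vp = make_TinyVector(reinterpret_cast<TinyVector<int> *>(raw), 16, buffer);   // in place
  assert(v.m_capacity == 16 && vp->m_capacity == 16);
  return 0;
}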
4 changes: 2 additions & 2 deletions HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu
@@ -36,10 +36,10 @@ int main() {
cudaCheck(cudaMallocHost(&data_ptr, maxN * sizeof(int)));
cudaCheck(cudaMalloc(&d_data_ptr, maxN * sizeof(int)));

auto v = new (obj_ptr) GPU::SimpleVector<int>(maxN, data_ptr);
auto v = GPU::make_SimpleVector(obj_ptr, maxN, data_ptr);

cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(GPU::SimpleVector<int>)));
new (tmp_obj_ptr) GPU::SimpleVector<int>(maxN, d_data_ptr);
GPU::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr);
assert(tmp_obj_ptr->size() == 0);
assert(tmp_obj_ptr->capacity() == static_cast<int>(maxN));

@@ -432,7 +432,7 @@ namespace pixelgpudetails {
if (includeErrors and skipROC)
{
uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug);
err->emplace_back(rID, ww, errorType, fedId);
err->push_back(pixelgpudetails::error_obj{rID, ww, errorType, fedId});
continue;
}

@@ -476,7 +476,7 @@ namespace pixelgpudetails {
if (includeErrors) {
if (not rocRowColIsValid(row, col)) {
uint32_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays
err->emplace_back(rawId, ww, error, fedId);
err->push_back(pixelgpudetails::error_obj{rawId, ww, error, fedId});
if(debug) printf("BPIX1 Error status: %i\n", error);
continue;
}
@@ -491,7 +491,7 @@ namespace pixelgpudetails {
localPix.col = col;
if (includeErrors and not dcolIsValid(dcol, pxid)) {
uint32_t error = conversionError(fedId, 3, debug);
err->emplace_back(rawId, ww, error, fedId);
err->push_back(pixelgpudetails::error_obj{rawId, ww, error, fedId});
if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
continue;
}
@@ -541,7 +541,7 @@ namespace pixelgpudetails {
auto data_d = cs->make_device_unique<pixelgpudetails::error_obj[]>(MAX_FED_WORDS, stream);
cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id()));
auto error_h_tmp = cs->make_host_unique<GPU::SimpleVector<pixelgpudetails::error_obj>>(stream);
new (error_h_tmp.get()) GPU::SimpleVector<pixelgpudetails::error_obj>(MAX_FED_WORDS, data_d.get()); // should make_host_unique() call the constructor as well? note that even if std::make_unique does that, we can't do that in make_device_unique
GPU::make_SimpleVector(error_h_tmp.get(), MAX_FED_WORDS, data_d.get());
assert(error_h_tmp->size() == 0);
assert(error_h_tmp->capacity() == static_cast<int>(MAX_FED_WORDS));

@@ -579,7 +579,7 @@ namespace pixelgpudetails {
if (includeErrors) {
digis_clusters_h.data = cs->make_host_unique<pixelgpudetails::error_obj[]>(MAX_FED_WORDS, stream);
digis_clusters_h.error = cs->make_host_unique<GPU::SimpleVector<pixelgpudetails::error_obj>>(stream);
new (digis_clusters_h.error.get()) GPU::SimpleVector<pixelgpudetails::error_obj>(MAX_FED_WORDS, digis_clusters_h.data.get());
GPU::make_SimpleVector(digis_clusters_h.error.get(), MAX_FED_WORDS, digis_clusters_h.data.get());
assert(digis_clusters_h.error->size() == 0);
assert(digis_clusters_h.error->capacity() == static_cast<int>(MAX_FED_WORDS));

@@ -15,14 +15,6 @@ namespace siPixelRawToClusterHeterogeneousProduct {
uint32_t word;
unsigned char errorType;
unsigned char fedId;

constexpr
error_obj(uint32_t a, uint32_t b, unsigned char c, unsigned char d):
rawId(a),
word(b),
errorType(c),
fedId(d)
{ }
};

// FIXME split in two
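
A standalone sketch (hypothetical error_obj_sketch, not CMSSW code) of why dropping the constructor above pairs with the emplace_back to push_back changes earlier in this PR: without a user-provided constructor, error_obj is a trivially constructible aggregate, so it can be brace-initialized and now satisfies the static_asserts in make_device_unique and make_host_unique.

#include <cstdint>
#include <type_traits>

struct error_obj_sketch {
  uint32_t rawId;
  uint32_t word;
  unsigned char errorType;
  unsigned char fedId;
};

static_assert(std::is_trivially_constructible<error_obj_sketch>::value,
              "now allocatable via make_device_unique / make_host_unique");

int main() {
  // Brace (aggregate) initialization replaces the removed constructor, matching
  // the err->push_back(pixelgpudetails::error_obj{...}) calls in the kernel diff.
  error_obj_sketch e{42u, 7u, 3, 1};
  return e.errorType == 3 ? 0 : 1;
}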
@@ -303,11 +303,11 @@ void CAHitQuadrupletGeneratorGPU::allocateOnGPU()
for (int i = 0; i < maxNumberOfRegions_; ++i) {
cudaCheck(cudaMallocHost(&h_foundNtupletsData_[i], sizeof(Quadruplet) * maxNumberOfQuadruplets_));
cudaCheck(cudaMallocHost(&h_foundNtupletsVec_[i], sizeof(GPU::SimpleVector<Quadruplet>)));
new(h_foundNtupletsVec_[i]) GPU::SimpleVector<Quadruplet>(maxNumberOfQuadruplets_, h_foundNtupletsData_[i]);
GPU::make_SimpleVector(h_foundNtupletsVec_[i], maxNumberOfQuadruplets_, h_foundNtupletsData_[i]);
cudaCheck(cudaMalloc(&d_foundNtupletsData_[i], sizeof(Quadruplet) * maxNumberOfQuadruplets_));
cudaCheck(cudaMemset(d_foundNtupletsData_[i], 0x00, sizeof(Quadruplet) * maxNumberOfQuadruplets_));
cudaCheck(cudaMalloc(&d_foundNtupletsVec_[i], sizeof(GPU::SimpleVector<Quadruplet>)));
GPU::SimpleVector<Quadruplet> tmp_foundNtuplets(maxNumberOfQuadruplets_, d_foundNtupletsData_[i]);
auto tmp_foundNtuplets = GPU::make_SimpleVector<Quadruplet>(maxNumberOfQuadruplets_, d_foundNtupletsData_[i]);
cudaCheck(cudaMemcpy(d_foundNtupletsVec_[i], & tmp_foundNtuplets, sizeof(GPU::SimpleVector<Quadruplet>), cudaMemcpyDefault));
}
