-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add AlpakaCore/HistoContainer.h + HistoContainer_t, OneHistoContainer_t and OneToManyAssoc_t tests #165
Add AlpakaCore/HistoContainer.h + HistoContainer_t, OneHistoContainer_t and OneToManyAssoc_t tests #165
Changes from 31 commits
faaede0
adb66eb
f548e27
c4878d5
d8c9ad6
5bacd9c
46805ad
5b7b5ab
0cd5c71
e8c66b6
b465663
c6e30b8
995af06
82ffd14
c6477b6
94f94e6
2ff31f9
8e9e776
8aaa1f3
b6e9adc
94105db
c08d205
88545ea
ba641ee
5c9a8b9
94176e9
cbb609f
d759b1f
c1824b8
52ccf49
788556d
548b4b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,354 @@ | ||||||
#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h | ||||||
#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h | ||||||
|
||||||
#include <algorithm> | ||||||
#include <cstddef> | ||||||
#include <cstdint> | ||||||
#include <type_traits> | ||||||
|
||||||
#include "AlpakaCore/alpakaConfig.h" | ||||||
#include "AlpakaCore/alpakaWorkDivHelper.h" | ||||||
#include "AlpakaCore/AtomicPairCounter.h" | ||||||
#include "AlpakaCore/alpakastdAlgorithm.h" | ||||||
#include "AlpakaCore/prefixScan.h" | ||||||
|
||||||
namespace cms { | ||||||
namespace alpakatools { | ||||||
|
||||||
struct countFromVector { | ||||||
template <typename T_Acc, typename Histo, typename T> | ||||||
ALPAKA_FN_ACC void operator()(const T_Acc &acc, | ||||||
Histo *__restrict__ h, | ||||||
uint32_t nh, | ||||||
T const *__restrict__ v, | ||||||
uint32_t const *__restrict__ offsets) const { | ||||||
const uint32_t nt = offsets[nh]; | ||||||
const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]); | ||||||
const auto &[firstElementIdxNoStride, endElementIdxNoStride] = | ||||||
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt)); | ||||||
for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; | ||||||
threadIdx < nt; | ||||||
threadIdx += gridDimension, endElementIdx += gridDimension) { | ||||||
for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { | ||||||
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); | ||||||
assert((*off) > 0); | ||||||
int32_t ih = off - offsets - 1; | ||||||
assert(ih >= 0); | ||||||
assert(ih < int(nh)); | ||||||
h->count(acc, v[i], ih); | ||||||
} | ||||||
} | ||||||
} | ||||||
}; | ||||||
|
||||||
struct fillFromVector { | ||||||
template <typename T_Acc, typename Histo, typename T> | ||||||
ALPAKA_FN_ACC void operator()(const T_Acc &acc, | ||||||
Histo *__restrict__ h, | ||||||
uint32_t nh, | ||||||
T const *__restrict__ v, | ||||||
uint32_t const *__restrict__ offsets) const { | ||||||
const uint32_t nt = offsets[nh]; | ||||||
const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]); | ||||||
const auto &[firstElementIdxNoStride, endElementIdxNoStride] = | ||||||
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt)); | ||||||
|
||||||
for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; | ||||||
threadIdx < nt; | ||||||
threadIdx += gridDimension, endElementIdx += gridDimension) { | ||||||
for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { | ||||||
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); | ||||||
assert((*off) > 0); | ||||||
int32_t ih = off - offsets - 1; | ||||||
assert(ih >= 0); | ||||||
assert(ih < int(nh)); | ||||||
h->fill(acc, v[i], i, ih); | ||||||
} | ||||||
} | ||||||
} | ||||||
}; | ||||||
|
||||||
struct launchZero { | ||||||
template <typename T_Acc, typename Histo> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc, | ||||||
Histo *__restrict__ h) const { | ||||||
const auto &[firstElementIdxGlobal, endElementIdxGlobal] = | ||||||
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(Histo::totbins())); | ||||||
|
||||||
for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { | ||||||
h->off[i] = 0; | ||||||
} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The CUDA implementation uses memset. I suppose we could use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes at first I had just tried a |
||||||
} | ||||||
}; | ||||||
|
||||||
template <typename Histo> | ||||||
ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize( | ||||||
Histo *__restrict__ h, | ||||||
const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After removing the memory allocation, the
Suggested change
|
||||||
ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { | ||||||
uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); | ||||||
|
||||||
const int num_items = Histo::totbins(); | ||||||
|
||||||
const unsigned int nthreads = 1024; | ||||||
const Vec1 threadsPerBlockOrElementsPerThread(nthreads); | ||||||
const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; | ||||||
const Vec1 blocksPerGrid(nblocks); | ||||||
|
||||||
const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); | ||||||
alpaka::queue::enqueue(queue, | ||||||
alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>( | ||||||
workDiv, multiBlockPrefixScanFirstStep<uint32_t>(), poff, poff, num_items)); | ||||||
|
||||||
const WorkDiv1 &workDivWith1Block = | ||||||
cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread); | ||||||
alpaka::queue::enqueue( | ||||||
queue, | ||||||
alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>( | ||||||
workDivWith1Block, multiBlockPrefixScanSecondStep<uint32_t>(), poff, poff, num_items, nblocks)); | ||||||
} | ||||||
|
||||||
template <typename Histo, typename T> | ||||||
ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector( | ||||||
Histo *__restrict__ h, | ||||||
uint32_t nh, | ||||||
T const *__restrict__ v, | ||||||
uint32_t const *__restrict__ offsets, | ||||||
uint32_t totSize, | ||||||
unsigned int nthreads, | ||||||
const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { | ||||||
const unsigned int nblocks = (totSize + nthreads - 1) / nthreads; | ||||||
const Vec1 blocksPerGrid(nblocks); | ||||||
const Vec1 threadsPerBlockOrElementsPerThread(nthreads); | ||||||
const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); | ||||||
|
||||||
alpaka::queue::enqueue( | ||||||
queue, alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>(workDiv, launchZero(), h)); | ||||||
|
||||||
alpaka::queue::enqueue(queue, | ||||||
alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>( | ||||||
workDiv, countFromVector(), h, nh, v, offsets)); | ||||||
launchFinalize(h, device, queue); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Following removal of
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ha yes true, this argument is not necessary anymore now, thanks. I just removed it. |
||||||
|
||||||
alpaka::queue::enqueue(queue, | ||||||
alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>( | ||||||
workDiv, fillFromVector(), h, nh, v, offsets)); | ||||||
} | ||||||
|
||||||
struct finalizeBulk { | ||||||
template <typename T_Acc, typename Assoc> | ||||||
ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) const { | ||||||
assoc->bulkFinalizeFill(acc, *apc); | ||||||
} | ||||||
}; | ||||||
|
||||||
// iteratate over N bins left and right of the one containing "v" | ||||||
template <typename Hist, typename V, typename Func> | ||||||
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { | ||||||
int bs = Hist::bin(value); | ||||||
int be = std::min(int(Hist::nbins() - 1), bs + n); | ||||||
bs = std::max(0, bs - n); | ||||||
assert(be >= bs); | ||||||
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { | ||||||
func(*pj); | ||||||
} | ||||||
} | ||||||
|
||||||
// iteratate over bins containing all values in window wmin, wmax | ||||||
template <typename Hist, typename V, typename Func> | ||||||
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { | ||||||
auto bs = Hist::bin(wmin); | ||||||
auto be = Hist::bin(wmax); | ||||||
assert(be >= bs); | ||||||
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { | ||||||
func(*pj); | ||||||
} | ||||||
} | ||||||
|
||||||
template <typename T, // the type of the discretized input values | ||||||
uint32_t NBINS, // number of bins | ||||||
uint32_t SIZE, // max number of element | ||||||
uint32_t S = sizeof(T) * 8, // number of significant bits in T | ||||||
typename I = uint32_t, // type stored in the container (usually an index in a vector of the input values) | ||||||
uint32_t NHISTS = 1 // number of histos stored | ||||||
> | ||||||
class HistoContainer { | ||||||
public: | ||||||
using Counter = uint32_t; | ||||||
|
||||||
using CountersOnly = HistoContainer<T, NBINS, 0, S, I, NHISTS>; | ||||||
|
||||||
using index_type = I; | ||||||
using UT = typename std::make_unsigned<T>::type; | ||||||
|
||||||
static constexpr uint32_t ilog2(uint32_t v) { | ||||||
constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; | ||||||
constexpr uint32_t s[] = {1, 2, 4, 8, 16}; | ||||||
|
||||||
uint32_t r = 0; // result of log2(v) will go here | ||||||
for (auto i = 4; i >= 0; i--) | ||||||
if (v & b[i]) { | ||||||
v >>= s[i]; | ||||||
r |= s[i]; | ||||||
} | ||||||
return r; | ||||||
} | ||||||
|
||||||
static constexpr uint32_t sizeT() { return S; } | ||||||
static constexpr uint32_t nbins() { return NBINS; } | ||||||
static constexpr uint32_t nhists() { return NHISTS; } | ||||||
static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } | ||||||
static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } | ||||||
static constexpr uint32_t capacity() { return SIZE; } | ||||||
|
||||||
static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } | ||||||
|
||||||
static constexpr UT bin(T t) { | ||||||
constexpr uint32_t shift = sizeT() - nbits(); | ||||||
constexpr uint32_t mask = (1 << nbits()) - 1; | ||||||
return (t >> shift) & mask; | ||||||
} | ||||||
|
||||||
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void zero() { | ||||||
for (auto &i : off) | ||||||
i = 0; | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const T_Acc &acc, CountersOnly const &co) { | ||||||
for (uint32_t i = 0; i < totbins(); ++i) { | ||||||
alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, off + i, co.off[i]); | ||||||
} | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
static ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(const T_Acc &acc, Counter &x) { | ||||||
return alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, &x, 1u); | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicDecrement(const T_Acc &acc, Counter &x) { | ||||||
return alpaka::atomic::atomicOp<alpaka::atomic::op::Sub>(acc, &x, 1u); | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void countDirect(const T_Acc &acc, T b) { | ||||||
assert(b < nbins()); | ||||||
atomicIncrement(acc, off[b]); | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void fillDirect(const T_Acc &acc, T b, index_type j) { | ||||||
assert(b < nbins()); | ||||||
auto w = atomicDecrement(acc, off[b]); | ||||||
assert(w > 0); | ||||||
bins[w - 1] = j; | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t | ||||||
bulkFill(const T_Acc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) { | ||||||
auto c = apc.add(acc, n); | ||||||
if (c.m >= nbins()) | ||||||
return -int32_t(c.m); | ||||||
off[c.m] = c.n; | ||||||
for (uint32_t j = 0; j < n; ++j) | ||||||
bins[c.n + j] = v[j]; | ||||||
return c.m; | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalize(const T_Acc &acc, AtomicPairCounter const &apc) { | ||||||
off[apc.get().m] = apc.get().n; | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, AtomicPairCounter const &apc) { | ||||||
auto m = apc.get().m; | ||||||
auto n = apc.get().n; | ||||||
|
||||||
if (m >= nbins()) { // overflow! | ||||||
off[nbins()] = uint32_t(off[nbins() - 1]); | ||||||
return; | ||||||
} | ||||||
|
||||||
const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]); | ||||||
const auto &[firstElementIdxNoStride, endElementIdxNoStride] = | ||||||
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(totbins())); | ||||||
|
||||||
for (uint32_t threadIdx = m + firstElementIdxNoStride[0u], endElementIdx = m + endElementIdxNoStride[0u]; | ||||||
threadIdx < totbins(); | ||||||
threadIdx += gridDimension, endElementIdx += gridDimension) { | ||||||
for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins()); ++i) { | ||||||
off[i] = n; | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t) { | ||||||
uint32_t b = bin(t); | ||||||
assert(b < nbins()); | ||||||
atomicIncrement(acc, off[b]); | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j) { | ||||||
uint32_t b = bin(t); | ||||||
assert(b < nbins()); | ||||||
auto w = atomicDecrement(acc, off[b]); | ||||||
assert(w > 0); | ||||||
bins[w - 1] = j; | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t, uint32_t nh) { | ||||||
uint32_t b = bin(t); | ||||||
assert(b < nbins()); | ||||||
b += histOff(nh); | ||||||
assert(b < totbins()); | ||||||
atomicIncrement(acc, off[b]); | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j, uint32_t nh) { | ||||||
uint32_t b = bin(t); | ||||||
assert(b < nbins()); | ||||||
b += histOff(nh); | ||||||
assert(b < totbins()); | ||||||
auto w = atomicDecrement(acc, off[b]); | ||||||
assert(w > 0); | ||||||
bins[w - 1] = j; | ||||||
} | ||||||
|
||||||
template <typename T_Acc> | ||||||
ALPAKA_FN_ACC ALPAKA_FN_INLINE void finalize(const T_Acc &acc, Counter *ws = nullptr) { | ||||||
assert(off[totbins() - 1] == 0); | ||||||
blockPrefixScan(acc, off, totbins(), ws); | ||||||
assert(off[totbins() - 1] == off[totbins() - 2]); | ||||||
} | ||||||
|
||||||
constexpr auto size() const { return uint32_t(off[totbins() - 1]); } | ||||||
constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } | ||||||
|
||||||
constexpr index_type const *begin() const { return bins; } | ||||||
constexpr index_type const *end() const { return begin() + size(); } | ||||||
|
||||||
constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } | ||||||
constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } | ||||||
|
||||||
Counter off[totbins()]; | ||||||
index_type bins[capacity()]; | ||||||
}; | ||||||
|
||||||
template <typename I, // type stored in the container (usually an index in a vector of the input values) | ||||||
uint32_t MAXONES, // max number of "ones" | ||||||
uint32_t MAXMANYS // max number of "manys" | ||||||
> | ||||||
using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>; | ||||||
|
||||||
} // namespace alpakatools | ||||||
} // namespace cms | ||||||
|
||||||
#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see this pattern
repeats in almost(?) every kernel in this PR. That's 4+ lines of repetitive and error prone code. I'm wondering if it would be worth to abstract that along
that could be called here along
? I know this starts to look like we would be building our own abstraction layer on top of Alpaka, but to me the boilerplate calls for something.
That said, I'm fine if this is left for a future PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, absolutely; I was thinking of marking this as a to-do comment for this PR.
Maybe doing it in an additional PR is indeed better.