Add AlpakaCore/HistoContainer.h + HistoContainer_t, OneHistoContainer_t and OneToManyAssoc_t tests #165

Merged
merged 32 commits on Feb 2, 2021

Changes from 31 commits

Commits (32)
faaede0
Start porting HistoContainer to AlpakaCore
ghugo83 Jan 19, 2021
adb66eb
Finish first-try porting of HistoContainer and its test
ghugo83 Jan 19, 2021
f548e27
HistoContainer and its test now compiles properly. Still segfaults is…
ghugo83 Jan 20, 2021
c4878d5
Tests now run smoothly and pass all assertions in serial and CUDA tes…
ghugo83 Jan 21, 2021
d8c9ad6
Seems to need to add alpaka::wait::wait(queue) around host function c…
ghugo83 Jan 21, 2021
5bacd9c
Can also place them inside the host function.
ghugo83 Jan 21, 2021
46805ad
Remove commented code
ghugo83 Jan 21, 2021
5b7b5ab
[alpaka] Also offload h->off initialization, as is done for Kokkos. h…
ghugo83 Jan 25, 2021
0cd5c71
minor cleaning
ghugo83 Jan 25, 2021
e8c66b6
[alpaka] Add OneToManyAssoc_t test
ghugo83 Jan 26, 2021
b465663
Change index variables names for strided access.
ghugo83 Jan 26, 2021
c6e30b8
Minor fixes
ghugo83 Jan 26, 2021
995af06
[alpaka] Add OneHistoContainer_t test
ghugo83 Jan 26, 2021
82ffd14
Fixes in OneHistoContainer_t
ghugo83 Jan 26, 2021
c6477b6
[alpaka] Important to initialize pointers properly, especially when loc…
ghugo83 Jan 26, 2021
94f94e6
Important fix in OneHistoContainer_t test: correct strided access, no…
ghugo83 Jan 27, 2021
2ff31f9
Also correct strided access in serial and TBB cases in HistoContainer…
ghugo83 Jan 27, 2021
8e9e776
Simplify Vec construction
ghugo83 Jan 27, 2021
8aaa1f3
clang-format
ghugo83 Jan 27, 2021
b6e9adc
Remove alpaka::wait::wait(queue); before host function end of scope. …
ghugo83 Jan 28, 2021
94105db
Remove using namespace ALPAKA_ACCELERATOR_NAMESPACE, and directly pre…
ghugo83 Jan 28, 2021
c08d205
OneHistoContainer: Add 1-kernel version
ghugo83 Jan 29, 2021
88545ea
Add cms::alpakatools::element_global_index_range_uncut to avoid havin…
ghugo83 Jan 29, 2021
ba641ee
Include endElementIdx within loop. NB: Will need to add a dedicated h…
ghugo83 Jan 29, 2021
5c9a8b9
Remove psws from HistoContainer class, never used. It corresponds to …
ghugo83 Jan 29, 2021
94176e9
Indices with += gridDimension stride: add endElementIdx within loop, …
ghugo83 Jan 29, 2021
cbb609f
clang-format
ghugo83 Jan 29, 2021
d759b1f
[alpaka] PrefixScan: use dynamic shared memory (as in CUDA version) i…
ghugo83 Feb 1, 2021
c1824b8
Renaming: element_global_index_range to compute non-truncated range, …
ghugo83 Feb 1, 2021
52ccf49
clang-format
ghugo83 Feb 1, 2021
788556d
minor cleaning
ghugo83 Feb 1, 2021
548b4b9
Forgot to remove device, now that memory allocation is removed
ghugo83 Feb 2, 2021
354 changes: 354 additions & 0 deletions src/alpaka/AlpakaCore/HistoContainer.h
@@ -0,0 +1,354 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <type_traits>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"
#include "AlpakaCore/AtomicPairCounter.h"
#include "AlpakaCore/alpakastdAlgorithm.h"
#include "AlpakaCore/prefixScan.h"

namespace cms {
namespace alpakatools {

struct countFromVector {
template <typename T_Acc, typename Histo, typename T>
ALPAKA_FN_ACC void operator()(const T_Acc &acc,
Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets) const {
const uint32_t nt = offsets[nh];
const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]);
const auto &[firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt));
for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u];
threadIdx < nt;
threadIdx += gridDimension, endElementIdx += gridDimension) {
for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) {
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < int(nh));
h->count(acc, v[i], ih);
}
}
}
};

struct fillFromVector {
template <typename T_Acc, typename Histo, typename T>
ALPAKA_FN_ACC void operator()(const T_Acc &acc,
Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets) const {
const uint32_t nt = offsets[nh];
const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]);
const auto &[firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt));

for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u];
threadIdx < nt;
threadIdx += gridDimension, endElementIdx += gridDimension) {
for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) {
Collaborator:

I see this pattern

const auto gridDimension = ...;
const auto [firstNoStride, endNoStride] = ...;
for (auto threadIdx = firstNoStride, endElementIdx = endNoStride; threadIdx < MAX; threadIdx += gridDimension, endElementIdx += gridDimension) {
  for (auto i = threadIdx; i < std::min(endElementIdx, MAX); ++i ) {
    <body>
  }
}

repeats in almost(?) every kernel in this PR. That's 4+ lines of repetitive and error-prone code. I'm wondering whether it would be worth abstracting it along the lines of

// the name should be more descriptive...
template <typename T_Acc, typename N, typename Func>
void for_each_element(const T_Acc& acc, const N nitems, Func func) {
  const auto gridDimension = ...;
  const auto [firstNoStride, endNoStride] = ...;
  for (auto threadIdx = firstNoStride, endElementIdx = endNoStride; threadIdx < nitems; threadIdx += gridDimension, endElementIdx += gridDimension) {
    for (auto i = threadIdx; i < std::min(endElementIdx, nitems); ++i ) {
      func(i);
    }
  }
}

which could then be called here along the lines of

const uint32_t nt = offsets[nh];
for_each_element(acc, nt, [&](uint32_t i) {
  auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
  ...
});

I know this starts to look like we would be building our own abstraction layer on top of Alpaka, but to me the boilerplate calls for something.

That written, I'm fine if this is left for a future PR.

Contributor Author:

Yes, absolutely; I was thinking of marking this as a to-do comment in this PR.
Doing it in a separate PR is indeed probably better.
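
For reference, a concrete 1D form of the helper proposed above could look roughly like the sketch below; the name for_each_element_1D is illustrative, and it only reuses the getWorkDiv call and the element_global_index_range_truncated helper already used in this file.

template <typename T_Acc, typename Func>
ALPAKA_FN_ACC void for_each_element_1D(const T_Acc &acc, uint32_t nitems, Func func) {
  const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]);
  const auto &[firstElementIdxNoStride, endElementIdxNoStride] =
      cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nitems));
  for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u];
       threadIdx < nitems;
       threadIdx += gridDimension, endElementIdx += gridDimension) {
    for (uint32_t i = threadIdx; i < std::min(endElementIdx, nitems); ++i) {
      func(i);
    }
  }
}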

auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < int(nh));
h->fill(acc, v[i], i, ih);
}
}
}
};

struct launchZero {
template <typename T_Acc, typename Histo>
ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc,
Histo *__restrict__ h) const {
const auto &[firstElementIdxGlobal, endElementIdxGlobal] =
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(Histo::totbins()));

for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) {
h->off[i] = 0;
}
Collaborator:

The CUDA implementation uses memset. I suppose we could use alpaka::mem::view::set, but would that work only with buffers from alpaka::mem::buf::alloc() (i.e. not with bare pointers)?

Contributor Author (@ghugo83, Jan 27, 2021):

Yes, at first I had just tried alpaka::mem::view::set on the raw pointer (the equivalent of poff in the CUDA version), but that was not working. There is also the additional complication that we do not want to zero all of h, only h->off.
Since ideally we want to keep the same interface as for CUDA (passing around a pointer to Histo as an argument of fillManyFromVector), this was the only way to access h->off: on the device, which in a way makes sense.

}
};

template <typename Histo>
ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(
Histo *__restrict__ h,
const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device,
Collaborator:

After removing the memory allocation, the device parameter is not needed anymore

Suggested change:
- const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device,

ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) {
uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
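// poff points at the off[] array inside the device-side Histo; the two kernels
// enqueued below run a multi-block prefix scan over it, turning the per-bin counts
// into cumulative offsets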

const int num_items = Histo::totbins();

const unsigned int nthreads = 1024;
const Vec1 threadsPerBlockOrElementsPerThread(nthreads);
const unsigned int nblocks = (num_items + nthreads - 1) / nthreads;
const Vec1 blocksPerGrid(nblocks);

const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread);
alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>(
workDiv, multiBlockPrefixScanFirstStep<uint32_t>(), poff, poff, num_items));

const WorkDiv1 &workDivWith1Block =
cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread);
alpaka::queue::enqueue(
queue,
alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>(
workDivWith1Block, multiBlockPrefixScanSecondStep<uint32_t>(), poff, poff, num_items, nblocks));
}

template <typename Histo, typename T>
ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector(
Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
uint32_t totSize,
unsigned int nthreads,
const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device,
Collaborator:

device parameter can be removed from here as well.

Suggested change:
- const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device,

ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) {
const unsigned int nblocks = (totSize + nthreads - 1) / nthreads;
const Vec1 blocksPerGrid(nblocks);
const Vec1 threadsPerBlockOrElementsPerThread(nthreads);
const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread);
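// Four steps on the same queue: zero the offsets (launchZero), count the entries
// per bin (countFromVector), prefix-scan the offsets (launchFinalize), then fill
// the bins (fillFromVector)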

alpaka::queue::enqueue(
queue, alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>(workDiv, launchZero(), h));

alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>(
workDiv, countFromVector(), h, nh, v, offsets));
launchFinalize(h, device, queue);
Collaborator:

Following removal of device.

Suggested change:
- launchFinalize(h, device, queue);
+ launchFinalize(h, queue);

Contributor Author:

Ah yes, true, this argument is not necessary anymore, thanks. I just removed it.


alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>(
workDiv, fillFromVector(), h, nh, v, offsets));
}

struct finalizeBulk {
template <typename T_Acc, typename Assoc>
ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) const {
assoc->bulkFinalizeFill(acc, *apc);
}
};

// iterate over N bins left and right of the one containing "v"
template <typename Hist, typename V, typename Func>
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) {
int bs = Hist::bin(value);
int be = std::min(int(Hist::nbins() - 1), bs + n);
bs = std::max(0, bs - n);
assert(be >= bs);
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
func(*pj);
}
}

// iterate over bins containing all values in window wmin, wmax
template <typename Hist, typename V, typename Func>
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
auto bs = Hist::bin(wmin);
auto be = Hist::bin(wmax);
assert(be >= bs);
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
func(*pj);
}
}
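// Hedged usage sketch for forEachInWindow above (hist, wmin, wmax and the counter
// are illustrative): visit every stored index in the bins covering [wmin, wmax]:
//   uint32_t n = 0;
//   forEachInWindow(hist, wmin, wmax, [&](uint32_t index) { ++n; });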

template <typename T, // the type of the discretized input values
uint32_t NBINS, // number of bins
uint32_t SIZE, // max number of elements
uint32_t S = sizeof(T) * 8, // number of significant bits in T
typename I = uint32_t, // type stored in the container (usually an index in a vector of the input values)
uint32_t NHISTS = 1 // number of histos stored
>
class HistoContainer {
public:
using Counter = uint32_t;

using CountersOnly = HistoContainer<T, NBINS, 0, S, I, NHISTS>;

using index_type = I;
using UT = typename std::make_unsigned<T>::type;

static constexpr uint32_t ilog2(uint32_t v) {
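// integer log2 computed with bit masks; e.g. ilog2(128) == 7 and ilog2(127) == 6,
// so nbits() below returns ilog2(127) + 1 == 7 for NBINS == 128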
constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
constexpr uint32_t s[] = {1, 2, 4, 8, 16};

uint32_t r = 0; // result of log2(v) will go here
for (auto i = 4; i >= 0; i--)
if (v & b[i]) {
v >>= s[i];
r |= s[i];
}
return r;
}

static constexpr uint32_t sizeT() { return S; }
static constexpr uint32_t nbins() { return NBINS; }
static constexpr uint32_t nhists() { return NHISTS; }
static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
static constexpr uint32_t capacity() { return SIZE; }

static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }

static constexpr UT bin(T t) {
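// keep the nbits() most significant bits of t; e.g. with T = uint16_t (S = 16) and
// NBINS = 128, shift = 9 and mask = 0x7f, so bin() returns the top 7 bits of t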
constexpr uint32_t shift = sizeT() - nbits();
constexpr uint32_t mask = (1 << nbits()) - 1;
return (t >> shift) & mask;
}

ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void zero() {
for (auto &i : off)
i = 0;
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const T_Acc &acc, CountersOnly const &co) {
for (uint32_t i = 0; i < totbins(); ++i) {
alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, off + i, co.off[i]);
}
}

template <typename T_Acc>
static ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(const T_Acc &acc, Counter &x) {
return alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, &x, 1u);
}

template <typename T_Acc>
static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicDecrement(const T_Acc &acc, Counter &x) {
return alpaka::atomic::atomicOp<alpaka::atomic::op::Sub>(acc, &x, 1u);
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void countDirect(const T_Acc &acc, T b) {
assert(b < nbins());
atomicIncrement(acc, off[b]);
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void fillDirect(const T_Acc &acc, T b, index_type j) {
assert(b < nbins());
auto w = atomicDecrement(acc, off[b]);
assert(w > 0);
bins[w - 1] = j;
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t
bulkFill(const T_Acc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) {
auto c = apc.add(acc, n);
if (c.m >= nbins())
return -int32_t(c.m);
off[c.m] = c.n;
for (uint32_t j = 0; j < n; ++j)
bins[c.n + j] = v[j];
return c.m;
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalize(const T_Acc &acc, AtomicPairCounter const &apc) {
off[apc.get().m] = apc.get().n;
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, AtomicPairCounter const &apc) {
auto m = apc.get().m;
auto n = apc.get().n;

if (m >= nbins()) { // overflow!
off[nbins()] = uint32_t(off[nbins() - 1]);
return;
}

const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]);
const auto &[firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(totbins()));

for (uint32_t threadIdx = m + firstElementIdxNoStride[0u], endElementIdx = m + endElementIdxNoStride[0u];
threadIdx < totbins();
threadIdx += gridDimension, endElementIdx += gridDimension) {
for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins()); ++i) {
off[i] = n;
}
}
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t) {
uint32_t b = bin(t);
assert(b < nbins());
atomicIncrement(acc, off[b]);
}
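
// The fill() overloads below assume the prefix scan (finalize() / launchFinalize())
// has already run: off[b] then holds the end offset of bin b in bins[], each atomic
// decrement claims the slot just before it, and once bin b is full off[b] is back to
// the bin's start offset, which is what begin(b) / end(b) rely on.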

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j) {
uint32_t b = bin(t);
assert(b < nbins());
auto w = atomicDecrement(acc, off[b]);
assert(w > 0);
bins[w - 1] = j;
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t, uint32_t nh) {
uint32_t b = bin(t);
assert(b < nbins());
b += histOff(nh);
assert(b < totbins());
atomicIncrement(acc, off[b]);
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j, uint32_t nh) {
uint32_t b = bin(t);
assert(b < nbins());
b += histOff(nh);
assert(b < totbins());
auto w = atomicDecrement(acc, off[b]);
assert(w > 0);
bins[w - 1] = j;
}

template <typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void finalize(const T_Acc &acc, Counter *ws = nullptr) {
assert(off[totbins() - 1] == 0);
blockPrefixScan(acc, off, totbins(), ws);
assert(off[totbins() - 1] == off[totbins() - 2]);
}

constexpr auto size() const { return uint32_t(off[totbins() - 1]); }
constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }

constexpr index_type const *begin() const { return bins; }
constexpr index_type const *end() const { return begin() + size(); }

constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; }
constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; }

Counter off[totbins()];
index_type bins[capacity()];
};

template <typename I, // type stored in the container (usually an index in a vector of the input values)
uint32_t MAXONES, // max number of "ones"
uint32_t MAXMANYS // max number of "manys"
>
using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;
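
// Hedged host-side usage sketch (types, sizes and names are illustrative; the call
// matches the fillManyFromVector signature declared above):
//   using Hist = HistoContainer<uint16_t, 128, 8192>;
//   Hist *h;  // allocated in device memory, as are v and offsets
//   fillManyFromVector(h, nh, v, offsets, totSize, 256, device, queue);
//   // afterwards, on the device, h->begin(b) / h->end(b) delimit the indices of the
//   // values that fell into bin b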

} // namespace alpakatools
} // namespace cms

#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h