From faaede06c04619d55302cc1de6f08614bd0b7e4a Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Tue, 19 Jan 2021 16:01:22 +0100 Subject: [PATCH 01/32] Start porting HistoContainer to AlpakaCore --- src/alpaka/AlpakaCore/HistoContainer.h | 379 ++++++++++++++++++++ src/alpaka/test/alpaka/clustering_alpaka.cc | 2 + 2 files changed, 381 insertions(+) create mode 100644 src/alpaka/AlpakaCore/HistoContainer.h diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h new file mode 100644 index 000000000..b83296ea3 --- /dev/null +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -0,0 +1,379 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h +#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h + +#include +#ifndef __CUDA_ARCH__ /// TO DO!!!!!!!!!!!!!!!!!! +#include +#endif // __CUDA_ARCH__ +#include +#include +#include + + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" +#include "AlpakaCore/AtomicPairCounter.h" +#include "AlpakaCore/alpakastdAlgorithm.h" +#include "AlpakaCore/prefixScan.h" + +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +namespace cms { + namespace alpakatools { + + struct countFromVector { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int nt = offsets[nh]; + const uint32_t gridDimensionGlobal(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdxGlobal, endElementIdxGlobal] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + for (int threadIdxGlobal = firstElementIdxGlobal[0u]; threadIdxGlobal < nt; threadIdxGlobal += gridDimensionGlobal) { + for (int i = threadIdxGlobal; i < endElementIdxGlobal[0u]; ++i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).count(v[i], ih); 
+ } + endElementIdxGlobal[0u] += gridDimensionGlobal; + } + } + }; + + struct fillFromVector { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int nt = offsets[nh]; + const uint32_t gridDimensionGlobal(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdxGlobal, endElementIdxGlobal] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + for (int threadIdxGlobal = firstElementIdxGlobal[0u]; threadIdxGlobal < nt; threadIdxGlobal += gridDimensionGlobal) { + for (int i = threadIdxGlobal; i < endElementIdxGlobal[0u]; ++i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).fill(v[i], i, ih); + } + endElementIdxGlobal[0u] += gridDimensionGlobal; + } + } + }; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, + Queue& queue) { + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); + assert(size >= int(sizeof(uint32_t) * Histo::totbins())); + alpaka::mem::view::set(queue, poff, 0, Vec1::all(size)); + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, + const DevAcc1& device, + Queue& queue) { +#ifdef __CUDACC__ + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + //int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); // now unused??? 
+ + + auto nthreads = 1024; + auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads; + int num_items = Histo::totbins(); + + multiBlockPrefixScan<<>>( + poff, poff, num_items, ppsws); + + + + + + multiBlockPrefixScan<<>>(d_in, d_out1, num_items, d_pc); + + + auto psum_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items * sizeof(uint32_t))); + uint32_t* psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(nblocks), Vec1::all(nthreads), Vec1::all(nelements)}, + multiBlockPrefixScanFirstStep(), + poff, + poff, + psum_d, + num_items)); + + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1), Vec1::all(nthreads), Vec1::all(nelements)}, + multiBlockPrefixScanSecondStep(), + poff, + poff, + psum_d, + num_items, + nblocks)); + + + + + + + + + + + + + + cudaCheck(cudaGetLastError()); +#else + h->finalize(); +#endif + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + int nthreads, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + launchZero(h, stream); +#ifdef __CUDACC__ + auto nblocks = (totSize + nthreads - 1) / nthreads; + countFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + launchFinalize(h, stream); + fillFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); +#else + countFromVector(h, nh, v, offsets); + h->finalize(); + fillFromVector(h, nh, v, offsets); +#endif + } + + struct finalizeBulk { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { + assoc->bulkFinalizeFill(acc, *apc); + } + }; + + // iteratate over N bins left and right of the one containing "v" + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void 
forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + template + class HistoContainer { + public: + using Counter = uint32_t; + + using CountersOnly = HistoContainer; + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr uint32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr uint32_t capacity() { return SIZE; } + + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } + + ALPAKA_FN_HOST_ACC void zero() { + for (auto &i : off) + i = 0; + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void add(CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { +#ifdef __CUDA_ARCH__ + atomicAdd(off + i, co.off[i]); +#else + auto &a = (std::atomic 
&)(off[i]); + a += co.off[i]; +#endif + } + } + + static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicAdd(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a++; +#endif + } + + static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicDecrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicSub(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a--; +#endif + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void countDirect(T b) { + assert(b < nbins()); + atomicIncrement(off[b]); + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void fillDirect(T b, index_type j) { + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! 
+ off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void count(T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(off[b]); + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void fill(T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(off[b]); + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void fill(T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void finalize(Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + int32_t psws; // prefix-scan working space + index_type bins[capacity()]; + }; + + template + using OneToManyAssoc = HistoContainer; + + } // namespace alpakatools +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/src/alpaka/test/alpaka/clustering_alpaka.cc 
b/src/alpaka/test/alpaka/clustering_alpaka.cc index 4f4982afa..0907744c9 100644 --- a/src/alpaka/test/alpaka/clustering_alpaka.cc +++ b/src/alpaka/test/alpaka/clustering_alpaka.cc @@ -1,6 +1,8 @@ #include #include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/HistoContainer.h" + int main() { std::cout << "Exit success" << std::endl; From adb66eb989982cbae0a9a7f19749eac6859e9c02 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Tue, 19 Jan 2021 20:03:31 +0100 Subject: [PATCH 02/32] Finish first-try porting of HistoContainer and its test --- src/alpaka/AlpakaCore/HistoContainer.h | 327 ++++++++++---------- src/alpaka/test/alpaka/HistoContainer_t.cc | 169 ++++++++++ src/alpaka/test/alpaka/clustering_alpaka.cc | 2 - 3 files changed, 331 insertions(+), 167 deletions(-) create mode 100644 src/alpaka/test/alpaka/HistoContainer_t.cc diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index b83296ea3..4c0708a01 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -2,9 +2,6 @@ #define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h #include -#ifndef __CUDA_ARCH__ /// TO DO!!!!!!!!!!!!!!!!!! 
-#include -#endif // __CUDA_ARCH__ #include #include #include @@ -22,7 +19,7 @@ namespace cms { namespace alpakatools { struct countFromVector { - template + template ALPAKA_FN_ACC void operator()(const T_Acc &acc, Histo *__restrict__ h, uint32_t nh, @@ -46,7 +43,7 @@ namespace cms { }; struct fillFromVector { - template + template ALPAKA_FN_ACC void operator()(const T_Acc &acc, Histo *__restrict__ h, uint32_t nh, @@ -75,100 +72,80 @@ namespace cms { uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); assert(size >= int(sizeof(uint32_t) * Histo::totbins())); - alpaka::mem::view::set(queue, poff, 0, Vec1::all(size)); + + //auto c_dbuf = alpaka::mem::buf::alloc(device, sizeC); + //alpaka::mem::view::set(queue, poff, 0, Vec1::all(size)); // TO DOOOOOOO: this was removed!!! } template ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, const DevAcc1& device, Queue& queue) { -#ifdef __CUDACC__ + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + // NB: Why are we not interested in poff on device memory (cuda version as well, different from test). ?? //int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); // now unused??? 
+ const int num_items = Histo::totbins(); + auto psum_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items)); + uint32_t* psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); - auto nthreads = 1024; - auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads; - int num_items = Histo::totbins(); - - multiBlockPrefixScan<<>>( - poff, poff, num_items, ppsws); - - - - - - multiBlockPrefixScan<<>>(d_in, d_out1, num_items, d_pc); - + const unsigned int nthreads = 1024; + const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; + const Vec1 &blocksPerGrid(Vec1::all(nblocks)); + const Vec1 &threadsPerBlockOrElementsPerThread(Vec1::all(nthreads)); - auto psum_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items * sizeof(uint32_t))); - uint32_t* psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); - alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(nblocks), Vec1::all(nthreads), Vec1::all(nelements)}, - multiBlockPrefixScanFirstStep(), + const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + multiBlockPrefixScanFirstStep(), poff, poff, psum_d, num_items)); - alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1), Vec1::all(nthreads), Vec1::all(nelements)}, + const WorkDiv1 &workDivWith1Block = cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDivWith1Block, multiBlockPrefixScanSecondStep(), poff, poff, psum_d, num_items, nblocks)); - - - - - - - - - - - - - - cudaCheck(cudaGetLastError()); -#else - h->finalize(); -#endif } template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - uint32_t totSize, - 
int nthreads, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - launchZero(h, stream); -#ifdef __CUDACC__ - auto nblocks = (totSize + nthreads - 1) / nthreads; - countFromVector<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); - launchFinalize(h, stream); - fillFromVector<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); -#else - countFromVector(h, nh, v, offsets); - h->finalize(); - fillFromVector(h, nh, v, offsets); -#endif + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + unsigned int nthreads, + const DevAcc1& device, + Queue& queue) { + launchZero(h, queue); + + unsigned int nblocks = (totSize + nthreads - 1) / nthreads; + const Vec1 &blocksPerGrid(Vec1::all(nblocks)); + const Vec1 &threadsPerBlockOrElementsPerThread(Vec1::all(nthreads)); + const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + countFromVector(), + h, nh, v, offsets)); + + + + launchFinalize(h, device, queue); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + fillFromVector(), + h, nh, v, offsets)); } struct finalizeBulk { - template + template ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { assoc->bulkFinalizeFill(acc, *apc); } @@ -176,7 +153,7 @@ namespace cms { // iteratate over N bins left and right of the one containing "v" template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { + ALPAKA_FN_HOST ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { int bs = Hist::bin(value); int be = std::min(int(Hist::nbins() - 1), bs + n); bs = std::max(0, bs - n); @@ -241,117 +218,137 @@ 
namespace cms { return (t >> shift) & mask; } - ALPAKA_FN_HOST_ACC void zero() { - for (auto &i : off) - i = 0; - } + ALPAKA_FN_HOST ALPAKA_FN_INLINE void zero() { + for (auto &i : off) + i = 0; + } - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void add(CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { -#ifdef __CUDA_ARCH__ - atomicAdd(off + i, co.off[i]); -#else - auto &a = (std::atomic &)(off[i]); - a += co.off[i]; -#endif - } + /* + ALPAKA_FN_HOST ALPAKA_FN_INLINE void add(CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; } + }*/ - static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(Counter &x) { -#ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); -#else - auto &a = (std::atomic &)(x); - return a++; -#endif + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const T_Acc& acc, CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { + alpaka::atomic::atomicOp(acc, off + i, co.off[i]); } + } - static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicDecrement(Counter &x) { -#ifdef __CUDA_ARCH__ - return atomicSub(&x, 1); -#else - auto &a = (std::atomic &)(x); - return a--; -#endif - } + /* + static ALPAKA_FN_HOST ALPAKA_FN_INLINE uint32_t atomicIncrement(Counter &x) { + auto &a = (std::atomic &)(x); + return a++; + } + */ - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void countDirect(T b) { - assert(b < nbins()); - atomicIncrement(off[b]); - } + template + static ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(const T_Acc& acc, Counter &x) { + return alpaka::atomic::atomicOp(acc, &x, 1); + } - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void fillDirect(T b, index_type j) { - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } + template + static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicDecrement(const T_Acc& acc, Counter &x) { + return alpaka::atomic::atomicOp(acc, &x, 1); + } - ALPAKA_FN_HOST_ACC 
ALPAKA_FN_INLINE int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void countDirect(const T_Acc& acc, T b) { + assert(b < nbins()); + atomicIncrement(acc, off[b]); + } - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void bulkFinalize(AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fillDirect(const T_Acc& acc, T b, index_type j) { + assert(b < nbins()); + auto w = atomicDecrement(acc, off[b]); + assert(w > 0); + bins[w - 1] = j; + } template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; - if (m >= nbins()) { // overflow! - off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } - auto first = m + blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { - off[i] = n; - } - } + ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t bulkFill(const T_Acc& acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(acc, n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void count(T t) { - uint32_t b = bin(t); - assert(b < nbins()); - atomicIncrement(off[b]); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalize(const T_Acc& acc, AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void fill(T t, index_type j) { - uint32_t b = bin(t); - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, 
AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! + off[nbins()] = uint32_t(off[nbins() - 1]); + return; } - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void count(T t, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - atomicIncrement(off[b]); - } + const uint32_t gridDimensionGlobal(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdxGlobal, endElementIdxGlobal] = cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); + firstElementIdxGlobal[0u] += m; + endElementIdxGlobal[0u] += m; - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void fill(T t, index_type j, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; + for (int threadIdxGlobal = firstElementIdxGlobal[0u]; threadIdxGlobal < totbins(); threadIdxGlobal += gridDimensionGlobal) { + for (int i = threadIdxGlobal; i < endElementIdxGlobal[0u]; ++i) { + off[i] = n; + } + endElementIdxGlobal[0u] += gridDimensionGlobal; } + } - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void finalize(Counter *ws = nullptr) { - assert(off[totbins() - 1] == 0); - blockPrefixScan(off, totbins(), ws); - assert(off[totbins() - 1] == off[totbins() - 2]); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc& acc, T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(acc, off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc& acc, T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(acc, off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc& acc, T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(acc, off[b]); + } + + template + ALPAKA_FN_ACC 
ALPAKA_FN_INLINE void fill(const T_Acc& acc, T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(acc, off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void finalize(const T_Acc& acc, Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(acc, off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } constexpr auto size() const { return uint32_t(off[totbins() - 1]); } constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc new file mode 100644 index 000000000..a98a4542a --- /dev/null +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" +#include "AlpakaCore/HistoContainer.h" + +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +template +void go(const DevHost& host, const DevAcc1& device, Queue& queue) { + std::mt19937 eng; + std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + + constexpr unsigned int N = 12000; + auto v_buf = alpaka::mem::buf::alloc(host, N); + auto v = alpaka::mem::view::getPtrNative(v_buf); + auto v_d = alpaka::mem::buf::alloc(device, N); + alpaka::mem::view::copy(queue, v_d, v_buf, N); + + constexpr uint32_t nParts = 10; + constexpr uint32_t partSize = N / nParts; + + using Hist = cms::alpakatools::HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::capacity() << ' ' << offsetof(Hist, bins) - offsetof(Hist, off) << ' ' + << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + + + auto offsets_buf = alpaka::mem::buf::alloc(host, 
nParts + 1); + auto offsets = alpaka::mem::view::getPtrNative(offsets_buf); + auto off_d = alpaka::mem::buf::alloc(device, nParts + 1); + + auto h_buf = alpaka::mem::buf::alloc(host, 1u); + auto h = alpaka::mem::view::getPtrNative(h_buf); + auto h_d = alpaka::mem::buf::alloc(device, 1u); + alpaka::mem::view::set(queue, h_d, 0, Vec1::all(1u)); // TO DO: this was added!!!! + + for (int it = 0; it < 5; ++it) { + + offsets[0] = 0; + for (uint32_t j = 1; j < nParts + 1; ++j) { + offsets[j] = offsets[j - 1] + partSize - 3 * j; + assert(offsets[j] <= N); + } + + if (it == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + alpaka::mem::view::copy(queue, off_d, offsets_buf, nParts + 1); + + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + + if (it == 2) { // big bin + for (long long j = 1000; j < 2000; j++) + v[j] = sizeof(T) == 1 ? 22 : 3456; + } + + alpaka::mem::view::copy(queue, v_d, v_buf, N); + + fillManyFromVector(alpaka::mem::view::getPtrNative(h_d), + nParts, + alpaka::mem::view::getPtrNative(v_d), + alpaka::mem::view::getPtrNative(off_d), + offsets[10], + 256, + device, + queue); + alpaka::mem::view::copy(queue, h_buf, h_d, 1u); + alpaka::wait::wait(queue); + assert(0 == h->off[0]); + assert(offsets[10] == h->size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + assert(t1 < N); + assert(t2 < N); + if (T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + + // make sure it spans 3 bins... 
+ auto window = T(1300); + + for (uint32_t j = 0; j < nParts; ++j) { + auto off = Hist::histOff(j); + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + auto ii = i + off; + if (0 == h->size(ii)) + continue; + auto k = *h->begin(ii); + if (j % 2) + k = *(h->begin(ii) + (h->end(ii) - h->begin(ii)) / 2); + auto bk = h->bin(v[k]); + assert(bk == i); + assert(k < offsets[j + 1]); + auto kl = h->bin(v[k] - window); + auto kh = h->bin(v[k] + window); + assert(kl != i); + assert(kh != i); + // std::cout << kl << ' ' << kh << std::endl; + + auto me = v[k]; + auto tot = 0; + auto nm = 0; + bool l = true; + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nm += h->size(kk + off); + for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) { + if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) { + } else { + ++tot; + } + } + if (kk == i) { + l = false; + continue; + } + if (l) + for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) + verify(i, k, k, (*p)); + else + for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) + verify(i, k, (*p), k); + } + if (!(tot >= nm)) { + std::cout << "too bad " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + } + if (l) + std::cout << "what? 
" << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + assert(!l); + } + } + } +} + +int main() { + const DevHost host(alpaka::pltf::getDevByIdx(0u)); + const DevAcc1 device(alpaka::pltf::getDevByIdx(0u)); + Queue queue(device); + + go(host, device, queue); + go(host, device, queue); + + return 0; +} diff --git a/src/alpaka/test/alpaka/clustering_alpaka.cc b/src/alpaka/test/alpaka/clustering_alpaka.cc index 0907744c9..4f4982afa 100644 --- a/src/alpaka/test/alpaka/clustering_alpaka.cc +++ b/src/alpaka/test/alpaka/clustering_alpaka.cc @@ -1,8 +1,6 @@ #include #include "AlpakaCore/alpakaConfig.h" -#include "AlpakaCore/HistoContainer.h" - int main() { std::cout << "Exit success" << std::endl; From f548e2783f99814aad3cb4b129a59b00d66aabd5 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Wed, 20 Jan 2021 16:20:10 +0100 Subject: [PATCH 03/32] HistoContainer and its test now compiles properly. 
Still segfaults issues --- src/alpaka/AlpakaCore/HistoContainer.h | 64 +++++++++++----------- src/alpaka/test/alpaka/HistoContainer_t.cc | 3 +- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 4c0708a01..2d5f42d49 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -24,20 +24,21 @@ namespace cms { Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int nt = offsets[nh]; - const uint32_t gridDimensionGlobal(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxGlobal, endElementIdxGlobal] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); - for (int threadIdxGlobal = firstElementIdxGlobal[0u]; threadIdxGlobal < nt; threadIdxGlobal += gridDimensionGlobal) { - for (int i = threadIdxGlobal; i < endElementIdxGlobal[0u]; ++i) { + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdx, endElementIdx] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + uint32_t endElementIdxStrided = endElementIdx[0u]; + for (uint32_t threadIndexStrided = firstElementIdx[0u]; threadIndexStrided < nt; threadIndexStrided += gridDimension) { + for (uint32_t i = threadIndexStrided; i < endElementIdxStrided; ++i) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; assert(ih >= 0); assert(ih < int(nh)); - (*h).count(v[i], ih); + (*h).count(acc, v[i], ih); } - endElementIdxGlobal[0u] += gridDimensionGlobal; + endElementIdxStrided += gridDimension; } } }; @@ -48,20 +49,22 @@ namespace cms { Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int nt = offsets[nh]; - const uint32_t 
gridDimensionGlobal(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxGlobal, endElementIdxGlobal] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); - for (int threadIdxGlobal = firstElementIdxGlobal[0u]; threadIdxGlobal < nt; threadIdxGlobal += gridDimensionGlobal) { - for (int i = threadIdxGlobal; i < endElementIdxGlobal[0u]; ++i) { + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdx, endElementIdx] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + + uint32_t endElementIdxStrided = endElementIdx[0u]; + for (uint32_t threadIdxStrided = firstElementIdx[0u]; threadIdxStrided < nt; threadIdxStrided += gridDimension) { + for (uint32_t i = threadIdxStrided; i < endElementIdxStrided; ++i) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; assert(ih >= 0); assert(ih < int(nh)); - (*h).fill(v[i], i, ih); + (*h).fill(acc, v[i], i, ih); } - endElementIdxGlobal[0u] += gridDimensionGlobal; + endElementIdxStrided += gridDimension; } } }; @@ -146,7 +149,7 @@ namespace cms { struct finalizeBulk { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { + ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) const { assoc->bulkFinalizeFill(acc, *apc); } }; @@ -174,6 +177,8 @@ namespace cms { } } + + template &)(x); - return a++; - } - */ - template static ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(const T_Acc& acc, Counter &x) { - return alpaka::atomic::atomicOp(acc, &x, 1); + return alpaka::atomic::atomicOp(acc, &x, 1u); } template static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicDecrement(const T_Acc& acc, Counter &x) { - return alpaka::atomic::atomicOp(acc, &x, 1); + return 
alpaka::atomic::atomicOp(acc, &x, 1u); } template @@ -289,21 +287,21 @@ namespace cms { ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, AtomicPairCounter const &apc) { auto m = apc.get().m; auto n = apc.get().n; + if (m >= nbins()) { // overflow! off[nbins()] = uint32_t(off[nbins() - 1]); return; } - const uint32_t gridDimensionGlobal(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxGlobal, endElementIdxGlobal] = cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); - firstElementIdxGlobal[0u] += m; - endElementIdxGlobal[0u] += m; + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdx, endElementIdx] = cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); - for (int threadIdxGlobal = firstElementIdxGlobal[0u]; threadIdxGlobal < totbins(); threadIdxGlobal += gridDimensionGlobal) { - for (int i = threadIdxGlobal; i < endElementIdxGlobal[0u]; ++i) { + uint32_t endElementIdxStrided = m + endElementIdx[0u]; + for (uint32_t threadIdxStrided = m + firstElementIdx[0u]; threadIdxStrided < totbins(); threadIdxStrided += gridDimension) { + for (uint32_t i = threadIdxStrided; i < endElementIdxStrided; ++i) { off[i] = n; } - endElementIdxGlobal[0u] += gridDimensionGlobal; + endElementIdxStrided += gridDimension; } } diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index a98a4542a..4d3a77b29 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -28,7 +28,6 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << offsetof(Hist, bins) - offsetof(Hist, off) << ' ' << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; - auto offsets_buf = 
alpaka::mem::buf::alloc(host, nParts + 1); auto offsets = alpaka::mem::view::getPtrNative(offsets_buf); @@ -37,7 +36,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { auto h_buf = alpaka::mem::buf::alloc(host, 1u); auto h = alpaka::mem::view::getPtrNative(h_buf); auto h_d = alpaka::mem::buf::alloc(device, 1u); - alpaka::mem::view::set(queue, h_d, 0, Vec1::all(1u)); // TO DO: this was added!!!! + //alpaka::mem::view::set(queue, h_d, Hist(), Vec1::all(1u)); // TO DO: this was added!!!! for (int it = 0; it < 5; ++it) { From c4878d5acafc172ce3ac187758328440b9a1de3e Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Thu, 21 Jan 2021 15:06:16 +0100 Subject: [PATCH 04/32] Tests now run smoothly and pass all assertions in serial and CUDA test. Still issue in TBB case. --- src/alpaka/AlpakaCore/HistoContainer.h | 129 +++++++++++++-------- src/alpaka/test/alpaka/HistoContainer_t.cc | 8 +- 2 files changed, 84 insertions(+), 53 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 2d5f42d49..67bff9513 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -36,7 +36,7 @@ namespace cms { int32_t ih = off - offsets - 1; assert(ih >= 0); assert(ih < int(nh)); - (*h).count(acc, v[i], ih); + h->count(acc, v[i], ih); } endElementIdxStrided += gridDimension; } @@ -62,72 +62,97 @@ namespace cms { int32_t ih = off - offsets - 1; assert(ih >= 0); assert(ih < int(nh)); - (*h).fill(acc, v[i], i, ih); + h->fill(acc, v[i], i, ih); } endElementIdxStrided += gridDimension; } } }; - template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, - Queue& queue) { - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); - assert(size >= int(sizeof(uint32_t) * Histo::totbins())); - - //auto c_dbuf = alpaka::mem::buf::alloc(device, sizeC); - 
//alpaka::mem::view::set(queue, poff, 0, Vec1::all(size)); // TO DOOOOOOO: this was removed!!! - } - - template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, - const DevAcc1& device, - Queue& queue) { + struct launchZero { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc, + Histo *__restrict__ h) const { + //uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + //int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); + //assert(size >= int(sizeof(uint32_t) * Histo::totbins())); + + // TO DO: USE A WORKDIV?????????????? + for (uint32_t i = 0; i < Histo::totbins(); ++i) { + h->off[i] = 0; + } + } + }; - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - // NB: Why are we not interested in poff on device memory (cuda version as well, different from test). ?? + /* + struct multiBlockPrefixScanFirstStepHisto { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Histo *__restrict__ h, T* psum_d, int32_t size) const { + multiBlockPrefixScanFirstStepHisto( + h->sum, // TO DO: GetPointerNative?? + h->sum, // TO DO: ppws?? + psum_d, + size)); + };*/ + + + template + ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, + const DevAcc1& device, + Queue& queue) { + + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + // NB: Why are we not interested in poff on device memory (cuda version as well, different from test). ?? + //int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); // now unused??? 
- const int num_items = Histo::totbins(); - - auto psum_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items)); - uint32_t* psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); - - const unsigned int nthreads = 1024; - const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; - const Vec1 &blocksPerGrid(Vec1::all(nblocks)); - const Vec1 &threadsPerBlockOrElementsPerThread(Vec1::all(nthreads)); - - const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - multiBlockPrefixScanFirstStep(), - poff, - poff, - psum_d, - num_items)); - - const WorkDiv1 &workDivWith1Block = cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivWith1Block, - multiBlockPrefixScanSecondStep(), - poff, - poff, - psum_d, - num_items, - nblocks)); + // ppsws ????????????????????????????????????????????????????????????????????????????????? + + + + const int num_items = Histo::totbins(); + + auto psum_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items)); + uint32_t* psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); + + const unsigned int nthreads = 1024; + const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; + const Vec1 &blocksPerGrid(Vec1::all(nblocks)); + const Vec1 &threadsPerBlockOrElementsPerThread(Vec1::all(nthreads)); + + const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + multiBlockPrefixScanFirstStep(), + poff, // TO DO: GetPointerNative?? + poff, // TO DO: ppws?? 
+ psum_d, + num_items)); + + const WorkDiv1 &workDivWith1Block = cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDivWith1Block, + multiBlockPrefixScanSecondStep(), + poff, + poff, + psum_d, + num_items, + nblocks)); } template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets, uint32_t totSize, unsigned int nthreads, const DevAcc1& device, - Queue& queue) { - launchZero(h, queue); + Queue& queue) { + std::cout << "Start within fillManyFromVector" << std::endl; + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, + launchZero(), + h)); unsigned int nblocks = (totSize + nthreads - 1) / nthreads; const Vec1 &blocksPerGrid(Vec1::all(nblocks)); @@ -168,7 +193,7 @@ namespace cms { // iteratate over bins containing all values in window wmin, wmax template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + ALPAKA_FN_HOST ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { auto bs = Hist::bin(wmin); auto be = Hist::bin(wmax); assert(be >= bs); @@ -187,7 +212,9 @@ namespace cms { uint32_t NHISTS = 1 // number of histos stored > class HistoContainer { + ALPAKA_FN_HOST_ACC HistoContainer() {}; // TO DO: not neeeded?????????? 
public: + using Counter = uint32_t; using CountersOnly = HistoContainer; diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index 4d3a77b29..872817a1c 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -34,9 +34,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { auto off_d = alpaka::mem::buf::alloc(device, nParts + 1); auto h_buf = alpaka::mem::buf::alloc(host, 1u); - auto h = alpaka::mem::view::getPtrNative(h_buf); auto h_d = alpaka::mem::buf::alloc(device, 1u); - //alpaka::mem::view::set(queue, h_d, Hist(), Vec1::all(1u)); // TO DO: this was added!!!! for (int it = 0; it < 5; ++it) { @@ -71,6 +69,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { } alpaka::mem::view::copy(queue, v_d, v_buf, N); + std::cout << "Calling fillManyFromVector" << std::endl; fillManyFromVector(alpaka::mem::view::getPtrNative(h_d), nParts, @@ -80,8 +79,13 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { 256, device, queue); + alpaka::wait::wait(queue); + std::cout << "Prepare to copy results" << std::endl; alpaka::mem::view::copy(queue, h_buf, h_d, 1u); alpaka::wait::wait(queue); + std::cout << "Copied results" << std::endl; + + auto h = alpaka::mem::view::getPtrNative(h_buf); assert(0 == h->off[0]); assert(offsets[10] == h->size()); From d8c9ad6f1e612f9eaa94974a4e402fa6c5fd92cd Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Thu, 21 Jan 2021 19:19:51 +0100 Subject: [PATCH 05/32] Seems to need to add alpaka::wait::wait(queue) around host function calls when queue is passed as an argument (even as a reference) --- src/alpaka/AlpakaCore/HistoContainer.h | 8 +++++++- src/alpaka/test/alpaka/HistoContainer_t.cc | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 67bff9513..8793142cc 100644 --- 
a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -100,6 +100,7 @@ namespace cms { ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, const DevAcc1& device, Queue& queue) { + //alpaka::wait::wait(queue); uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); // NB: Why are we not interested in poff on device memory (cuda version as well, different from test). ?? @@ -137,6 +138,7 @@ namespace cms { psum_d, num_items, nblocks)); + //alpaka::wait::wait(queue); } template @@ -149,6 +151,7 @@ namespace cms { const DevAcc1& device, Queue& queue) { std::cout << "Start within fillManyFromVector" << std::endl; + //alpaka::wait::wait(queue); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, launchZero(), @@ -164,12 +167,15 @@ namespace cms { h, nh, v, offsets)); - + alpaka::wait::wait(queue); launchFinalize(h, device, queue); + alpaka::wait::wait(queue); + alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, fillFromVector(), h, nh, v, offsets)); + //alpaka::wait::wait(queue); } struct finalizeBulk { diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index 872817a1c..9301bc1de 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -71,6 +71,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { alpaka::mem::view::copy(queue, v_d, v_buf, N); std::cout << "Calling fillManyFromVector" << std::endl; + alpaka::wait::wait(queue); fillManyFromVector(alpaka::mem::view::getPtrNative(h_d), nParts, alpaka::mem::view::getPtrNative(v_d), From 5bacd9c48770935709a1832e2d83e09d51509406 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Thu, 21 Jan 2021 19:22:05 +0100 Subject: [PATCH 06/32] Can also place them inside the host function. 
--- src/alpaka/AlpakaCore/HistoContainer.h | 12 ++++++------ src/alpaka/test/alpaka/HistoContainer_t.cc | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 8793142cc..22526e24f 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -100,7 +100,7 @@ namespace cms { ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, const DevAcc1& device, Queue& queue) { - //alpaka::wait::wait(queue); + alpaka::wait::wait(queue); uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); // NB: Why are we not interested in poff on device memory (cuda version as well, different from test). ?? @@ -138,7 +138,7 @@ namespace cms { psum_d, num_items, nblocks)); - //alpaka::wait::wait(queue); + alpaka::wait::wait(queue); } template @@ -151,7 +151,7 @@ namespace cms { const DevAcc1& device, Queue& queue) { std::cout << "Start within fillManyFromVector" << std::endl; - //alpaka::wait::wait(queue); + alpaka::wait::wait(queue); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, launchZero(), @@ -167,15 +167,15 @@ namespace cms { h, nh, v, offsets)); - alpaka::wait::wait(queue); + //alpaka::wait::wait(queue); launchFinalize(h, device, queue); - alpaka::wait::wait(queue); + //alpaka::wait::wait(queue); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, fillFromVector(), h, nh, v, offsets)); - //alpaka::wait::wait(queue); + alpaka::wait::wait(queue); } struct finalizeBulk { diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index 9301bc1de..f3e3dd1c8 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -71,7 +71,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { 
alpaka::mem::view::copy(queue, v_d, v_buf, N); std::cout << "Calling fillManyFromVector" << std::endl; - alpaka::wait::wait(queue); + //alpaka::wait::wait(queue); fillManyFromVector(alpaka::mem::view::getPtrNative(h_d), nParts, alpaka::mem::view::getPtrNative(v_d), @@ -80,7 +80,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { 256, device, queue); - alpaka::wait::wait(queue); + //alpaka::wait::wait(queue); std::cout << "Prepare to copy results" << std::endl; alpaka::mem::view::copy(queue, h_buf, h_d, 1u); alpaka::wait::wait(queue); From 46805addfd37acf030dd96d6679849de4379dce9 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Thu, 21 Jan 2021 19:37:10 +0100 Subject: [PATCH 07/32] Remove commented code --- src/alpaka/AlpakaCore/HistoContainer.h | 22 ---------------------- src/alpaka/test/alpaka/HistoContainer_t.cc | 2 -- 2 files changed, 24 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 22526e24f..853eb26e6 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -84,18 +84,6 @@ namespace cms { } }; - /* - struct multiBlockPrefixScanFirstStepHisto { - template - ALPAKA_FN_ACC void operator()(const T_Acc& acc, Histo *__restrict__ h, T* psum_d, int32_t size) const { - multiBlockPrefixScanFirstStepHisto( - h->sum, // TO DO: GetPointerNative?? - h->sum, // TO DO: ppws?? - psum_d, - size)); - };*/ - - template ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, const DevAcc1& device, @@ -103,12 +91,6 @@ namespace cms { alpaka::wait::wait(queue); uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - // NB: Why are we not interested in poff on device memory (cuda version as well, different from test). ?? - - //int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); // now unused??? 
- // ppsws ????????????????????????????????????????????????????????????????????????????????? - - const int num_items = Histo::totbins(); @@ -165,11 +147,7 @@ namespace cms { alpaka::kernel::createTaskKernel(workDiv, countFromVector(), h, nh, v, offsets)); - - - //alpaka::wait::wait(queue); launchFinalize(h, device, queue); - //alpaka::wait::wait(queue); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index f3e3dd1c8..9959c87af 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -71,7 +71,6 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { alpaka::mem::view::copy(queue, v_d, v_buf, N); std::cout << "Calling fillManyFromVector" << std::endl; - //alpaka::wait::wait(queue); fillManyFromVector(alpaka::mem::view::getPtrNative(h_d), nParts, alpaka::mem::view::getPtrNative(v_d), @@ -80,7 +79,6 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { 256, device, queue); - //alpaka::wait::wait(queue); std::cout << "Prepare to copy results" << std::endl; alpaka::mem::view::copy(queue, h_buf, h_d, 1u); alpaka::wait::wait(queue); From 5b7b5ab3120e253cec916ccc0db8731e27d0eeca Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Mon, 25 Jan 2021 11:29:15 +0100 Subject: [PATCH 08/32] [alpaka] Also offload h->off initialization, as is done for Kokkos. 
h->psws also needs to be set --- src/alpaka/AlpakaCore/HistoContainer.h | 49 ++++++++++++++++---------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 853eb26e6..92c5ed425 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -71,25 +71,29 @@ namespace cms { struct launchZero { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc, - Histo *__restrict__ h) const { - //uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - //int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); - //assert(size >= int(sizeof(uint32_t) * Histo::totbins())); - - // TO DO: USE A WORKDIV?????????????? - for (uint32_t i = 0; i < Histo::totbins(); ++i) { + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc, + Histo *__restrict__ h) const { + const auto &[firstElementIdxGlobal, endElementIdxGlobal] = cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); + + for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { h->off[i] = 0; } } }; - template + struct storePrefixScanWorkingSpace { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, + Histo *__restrict__ h, + const unsigned int nblocks) const { + h->psws = nblocks; + } + }; + + template ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, const DevAcc1& device, Queue& queue) { - alpaka::wait::wait(queue); - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); const int num_items = Histo::totbins(); @@ -102,12 +106,18 @@ namespace cms { const Vec1 &blocksPerGrid(Vec1::all(nblocks)); const Vec1 &threadsPerBlockOrElementsPerThread(Vec1::all(nthreads)); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), 
Vec1::all(1u)}, + storePrefixScanWorkingSpace(), + h, + nblocks)); + const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, multiBlockPrefixScanFirstStep(), - poff, // TO DO: GetPointerNative?? - poff, // TO DO: ppws?? + poff, + poff, psum_d, num_items)); @@ -121,7 +131,7 @@ namespace cms { num_items, nblocks)); alpaka::wait::wait(queue); - } + } template ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, @@ -133,16 +143,17 @@ namespace cms { const DevAcc1& device, Queue& queue) { std::cout << "Start within fillManyFromVector" << std::endl; - alpaka::wait::wait(queue); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, - launchZero(), - h)); unsigned int nblocks = (totSize + nthreads - 1) / nthreads; const Vec1 &blocksPerGrid(Vec1::all(nblocks)); const Vec1 &threadsPerBlockOrElementsPerThread(Vec1::all(nthreads)); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + launchZero(), + h)); + alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, countFromVector(), From 0cd5c71195664370f391d4a8d1af4cf4213a1d53 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Mon, 25 Jan 2021 15:39:57 +0100 Subject: [PATCH 09/32] minor cleaning --- src/alpaka/AlpakaCore/HistoContainer.h | 15 ++------------- src/alpaka/test/alpaka/HistoContainer_t.cc | 4 ++-- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 92c5ed425..e23b2430b 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -142,9 +142,7 @@ namespace cms { unsigned int nthreads, 
const DevAcc1& device, Queue& queue) { - std::cout << "Start within fillManyFromVector" << std::endl; - - unsigned int nblocks = (totSize + nthreads - 1) / nthreads; + const unsigned int nblocks = (totSize + nthreads - 1) / nthreads; const Vec1 &blocksPerGrid(Vec1::all(nblocks)); const Vec1 &threadsPerBlockOrElementsPerThread(Vec1::all(nthreads)); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); @@ -207,7 +205,6 @@ namespace cms { uint32_t NHISTS = 1 // number of histos stored > class HistoContainer { - ALPAKA_FN_HOST_ACC HistoContainer() {}; // TO DO: not neeeded?????????? public: using Counter = uint32_t; @@ -245,19 +242,11 @@ namespace cms { return (t >> shift) & mask; } - ALPAKA_FN_HOST ALPAKA_FN_INLINE void zero() { + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void zero() { for (auto &i : off) i = 0; } - /* - ALPAKA_FN_HOST ALPAKA_FN_INLINE void add(CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { - auto &a = (std::atomic &)(off[i]); - a += co.off[i]; - } - }*/ - template ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const T_Acc& acc, CountersOnly const &co) { for (uint32_t i = 0; i < totbins(); ++i) { diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index 9959c87af..2bf1325d3 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -69,8 +69,8 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { } alpaka::mem::view::copy(queue, v_d, v_buf, N); - std::cout << "Calling fillManyFromVector" << std::endl; + std::cout << "Calling fillManyFromVector" << std::endl; fillManyFromVector(alpaka::mem::view::getPtrNative(h_d), nParts, alpaka::mem::view::getPtrNative(v_d), @@ -79,7 +79,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { 256, device, queue); - std::cout << "Prepare to copy results" << std::endl; + alpaka::mem::view::copy(queue, h_buf, h_d, 1u); 
alpaka::wait::wait(queue); std::cout << "Copied results" << std::endl; From e8c66b6d9d6518725913cfe00e0175a4070345c0 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Tue, 26 Jan 2021 13:18:08 +0100 Subject: [PATCH 10/32] [alpaka] Add OneToManyAssoc_t test --- src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 412 +++++++++++++++++++++ 1 file changed, 412 insertions(+) create mode 100644 src/alpaka/test/alpaka/OneToManyAssoc_t.cc diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc new file mode 100644 index 000000000..e46fc17ba --- /dev/null +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -0,0 +1,412 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "AlpakaCore/HistoContainer.h" + +constexpr uint32_t MaxElem = 64000; +constexpr uint32_t MaxTk = 8000; +constexpr uint32_t MaxAssocs = 4 * MaxTk; + +using Assoc = cms::alpakatools::OneToManyAssoc; +using SmallAssoc = cms::alpakatools::OneToManyAssoc; +using Multiplicity = cms::alpakatools::OneToManyAssoc; +using TK = std::array; + +struct countMultiLocal { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < n; firstElementIdx += gridDimension) { + for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + auto&& local = alpaka::block::shared::st::allocVar(acc); + if (threadIdxLocal == 0) { // TO DO: use launchZero????? 
+ local.zero(); + } + alpaka::block::sync::syncBlockThreads(acc); + local.countDirect(acc, 2 + i % 4); + alpaka::block::sync::syncBlockThreads(acc); + if (threadIdxLocal == 0) { + assoc->add(acc, local); + } + } + endElementIdx += gridDimension; + } + } +}; + +struct countMulti { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < n; firstElementIdx += gridDimension) { + for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + assoc->countDirect(acc, 2 + i % 4); + } + endElementIdx += gridDimension; + } + } +}; + +struct verifyMulti { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const { + const uint32_t maxNumberOfElements = Multiplicity::totbins(); + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < maxNumberOfElements; firstElementIdx += gridDimension) { + for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + assert(m1->off[i] == m2->off[i]); + } + endElementIdx += gridDimension; + } + } +}; + +struct count { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { + const uint32_t maxNumberOfElements = 4 * n; + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + 
const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < maxNumberOfElements; firstElementIdx += gridDimension) { + for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) { + return; + } + if (tk[k][j] < MaxElem) { + assoc->countDirect(acc, tk[k][j]); + } + } + endElementIdx += gridDimension; + } + } +}; + +struct fill { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { + const uint32_t maxNumberOfElements = 4 * n; + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < maxNumberOfElements; firstElementIdx += gridDimension) { + for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) { + return; + } + if (tk[k][j] < MaxElem) { + assoc->fillDirect(acc, tk[k][j], k); + } + } + endElementIdx += gridDimension; + } + } +}; + +struct verify { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, Assoc* __restrict__ assoc) const { + assert(assoc->size() < Assoc::capacity()); + } +}; + +struct fillBulk { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, cms::alpakatools::AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = 
cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < n; firstElementIdx += gridDimension) { + for (uint32_t k = firstElementIdx; k < endElementIdx; ++k) { + auto m = tk[k][3] < MaxElem ? 4 : 3; + assoc->bulkFill(acc, *apc, &tk[k][0], m); + } + endElementIdx += gridDimension; + } + } +}; + +struct verifyBulk { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, Assoc const* __restrict__ assoc, cms::alpakatools::AtomicPairCounter const* apc) const { + if (apc->get().m >= Assoc::nbins()) { + printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); + } + assert(assoc->size() < Assoc::capacity()); + } +}; + +int main() { + const DevHost host(alpaka::pltf::getDevByIdx(0u)); + const DevAcc1 device(alpaka::pltf::getDevByIdx(0u)); + Queue queue(device); + + std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl; + std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' ' + << SmallAssoc::capacity() << std::endl; + + std::mt19937 eng; + std::geometric_distribution rdm(0.8); + + constexpr uint32_t N = 4000; + + auto tr_buf = alpaka::mem::buf::alloc, Idx>(host, N); + auto tr = alpaka::mem::view::getPtrNative(tr_buf); + // fill with "index" to element + long long ave = 0; + int imax = 0; + auto n = 0U; + auto z = 0U; + auto nz = 0U; + for (auto i = 0U; i < 4U; ++i) { + auto j = 0U; + while (j < N && n < MaxElem) { + if (z == 11) { + ++n; + z = 0; + ++nz; + continue; + } // a bit of not assoc + auto x = rdm(eng); + auto k = std::min(j + x + 1, N); + if (i == 3 && z == 3) { // some triplets time to time + for (; j < k; ++j) + tr[j][i] = MaxElem + 1; + } else { + ave += x + 1; + imax = std::max(imax, x); + for (; j < k; ++j) + tr[j][i] = n; + ++n; + } + ++z; + } + assert(n <= MaxElem); + assert(j <= N); + } + std::cout << "filled 
with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + + + auto v_dbuf = alpaka::mem::buf::alloc, Idx>(device, N); + alpaka::mem::view::copy(queue, v_dbuf, tr_buf, N); + + auto a_dbuf = alpaka::mem::buf::alloc(device, 1u); + + const unsigned int nThreads = 256; + const Vec1 threadsPerBlockOrElementsPerThread(nThreads); + const unsigned int nBlocks4N = (4 * N + nThreads - 1) / nThreads; + const Vec1 blocksPerGrid4N(nBlocks4N); + const WorkDiv1 &workDiv4N = cms::alpakatools::make_workdiv(blocksPerGrid4N, threadsPerBlockOrElementsPerThread); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv4N, + cms::alpakatools::launchZero(), + alpaka::mem::view::getPtrNative(a_dbuf) + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv4N, + count(), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(a_dbuf), + N + )); + + cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(a_dbuf), device, queue); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, + verify(), + alpaka::mem::view::getPtrNative(a_dbuf) + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv4N, + fill(), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(a_dbuf), + N + )); + + auto la_hbuf = alpaka::mem::buf::alloc(host, 1u); + alpaka::mem::view::copy(queue, la_hbuf, a_dbuf, 1u); + alpaka::wait::wait(queue); + + auto la = alpaka::mem::view::getPtrNative(la_hbuf); + + std::cout << la->size() << std::endl; + imax = 0; + ave = 0; + z = 0; + for (auto i = 0U; i < n; ++i) { + auto x = la->size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la->size(n)); + std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; + + // now the inverse map (actually this is the 
direct....) + auto dc_dbuf = alpaka::mem::buf::alloc(device, 1u); + alpaka::mem::view::set(queue, dc_dbuf, 0, 1u); + + const unsigned int nBlocks = (N + nThreads - 1) / nThreads; + const Vec1 blocksPerGrid(nBlocks); + const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + fillBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(a_dbuf), + N + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + cms::alpakatools::finalizeBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(a_dbuf) + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, + verifyBulk(), + alpaka::mem::view::getPtrNative(a_dbuf), + alpaka::mem::view::getPtrNative(dc_dbuf) + )); + + alpaka::mem::view::copy(queue, la_hbuf, a_dbuf, 1u); + alpaka::wait::wait(queue); // TO DO: Needed??? 
+ + auto dc_hbuf = alpaka::mem::buf::alloc(host, 1u); + alpaka::mem::view::copy(queue, dc_hbuf, dc_dbuf, 1u); + alpaka::wait::wait(queue); + auto dc = alpaka::mem::view::getPtrNative(dc_hbuf); + + + alpaka::mem::view::set(queue, dc_dbuf, 0, 1u); + auto sa_dbuf = alpaka::mem::buf::alloc(device, 1u); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + fillBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(sa_dbuf), + N + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + cms::alpakatools::finalizeBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(sa_dbuf) + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, + verifyBulk(), + alpaka::mem::view::getPtrNative(sa_dbuf), + alpaka::mem::view::getPtrNative(dc_dbuf) + )); + + std::cout << "final counter value " << dc->get().n << ' ' << dc->get().m << std::endl; + + std::cout << la->size() << std::endl; + imax = 0; + ave = 0; + for (auto i = 0U; i < N; ++i) { + auto x = la->size(i); + if (!(x == 4 || x == 3)) { + std::cout << i << ' ' << x << std::endl; +} + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); +} + assert(0 == la->size(N)); + std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; + + // here verify use of block local counters + auto m1_dbuf = alpaka::mem::buf::alloc(device, 1u); + auto m2_dbuf = alpaka::mem::buf::alloc(device, 1u); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv4N, + cms::alpakatools::launchZero(), + alpaka::mem::view::getPtrNative(m1_dbuf) + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv4N, + cms::alpakatools::launchZero(), + alpaka::mem::view::getPtrNative(m2_dbuf) + )); + + alpaka::queue::enqueue(queue, + 
alpaka::kernel::createTaskKernel(workDiv4N, + countMulti(), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(m1_dbuf), + N + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv4N, + countMultiLocal(), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(m2_dbuf), + N + )); + + const unsigned int nBlocksTotBins = 1; + const Vec1 blocksPerGridTotBins(nBlocksTotBins); + const unsigned int nThreadsTotBins = Multiplicity::totbins(); + const Vec1 threadsPerBlockOrElementsPerThreadTotBins(nThreadsTotBins); + const WorkDiv1 &workDivTotBins = cms::alpakatools::make_workdiv(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDivTotBins, + verifyMulti(), + alpaka::mem::view::getPtrNative(m1_dbuf), + alpaka::mem::view::getPtrNative(m2_dbuf) + )); + + cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m1_dbuf), device, queue); + cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m2_dbuf), device, queue); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDivTotBins, + verifyMulti(), + alpaka::mem::view::getPtrNative(m1_dbuf), + alpaka::mem::view::getPtrNative(m2_dbuf) + )); + + alpaka::wait::wait(queue); + + return 0; +} From b4656631ab2e4c50b644943504a6477c2594a590 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Tue, 26 Jan 2021 13:32:03 +0100 Subject: [PATCH 11/32] Change index variables names for strided access. 
--- src/alpaka/AlpakaCore/HistoContainer.h | 30 +++++++++++----------- src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 24 ++++++++--------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index e23b2430b..95546c288 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -27,10 +27,10 @@ namespace cms { uint32_t const *__restrict__ offsets) const { const uint32_t nt = offsets[nh]; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdx, endElementIdx] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); - uint32_t endElementIdxStrided = endElementIdx[0u]; - for (uint32_t threadIndexStrided = firstElementIdx[0u]; threadIndexStrided < nt; threadIndexStrided += gridDimension) { - for (uint32_t i = threadIndexStrided; i < endElementIdxStrided; ++i) { + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; @@ -38,7 +38,7 @@ namespace cms { assert(ih < int(nh)); h->count(acc, v[i], ih); } - endElementIdxStrided += gridDimension; + endElementIdx += gridDimension; } } }; @@ -52,11 +52,11 @@ namespace cms { uint32_t const *__restrict__ offsets) const { const uint32_t nt = offsets[nh]; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdx, endElementIdx] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, 
Vec1::all(nt)); - uint32_t endElementIdxStrided = endElementIdx[0u]; - for (uint32_t threadIdxStrided = firstElementIdx[0u]; threadIdxStrided < nt; threadIdxStrided += gridDimension) { - for (uint32_t i = threadIdxStrided; i < endElementIdxStrided; ++i) { + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; @@ -64,7 +64,7 @@ namespace cms { assert(ih < int(nh)); h->fill(acc, v[i], i, ih); } - endElementIdxStrided += gridDimension; + endElementIdx += gridDimension; } } }; @@ -305,14 +305,14 @@ namespace cms { } const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdx, endElementIdx] = cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); - uint32_t endElementIdxStrided = m + endElementIdx[0u]; - for (uint32_t threadIdxStrided = m + firstElementIdx[0u]; threadIdxStrided < totbins(); threadIdxStrided += gridDimension) { - for (uint32_t i = threadIdxStrided; i < endElementIdxStrided; ++i) { + uint32_t endElementIdx = m + endElementIdxNoStride[0u]; + for (uint32_t threadIdx = m + firstElementIdxNoStride[0u]; threadIdx < totbins(); threadIdx += gridDimension) { + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { off[i] = n; } - endElementIdxStrided += gridDimension; + endElementIdx += gridDimension; } } diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index e46fc17ba..6b2154a05 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -24,8 +24,8 @@ struct countMultiLocal { const uint32_t 
threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < n; firstElementIdx += gridDimension) { - for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { auto&& local = alpaka::block::shared::st::allocVar(acc); if (threadIdxLocal == 0) { // TO DO: use launchZero????? local.zero(); @@ -48,8 +48,8 @@ struct countMulti { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < n; firstElementIdx += gridDimension) { - for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { assoc->countDirect(acc, 2 + i % 4); } endElementIdx += gridDimension; @@ -64,8 +64,8 @@ struct verifyMulti { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < maxNumberOfElements; firstElementIdx += gridDimension) { - for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx 
+= gridDimension) { + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { assert(m1->off[i] == m2->off[i]); } endElementIdx += gridDimension; @@ -80,8 +80,8 @@ struct count { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < maxNumberOfElements; firstElementIdx += gridDimension) { - for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension) { + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { auto k = i / 4; auto j = i - 4 * k; assert(j < 4); @@ -104,8 +104,8 @@ struct fill { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < maxNumberOfElements; firstElementIdx += gridDimension) { - for (uint32_t i = firstElementIdx; i < endElementIdx; ++i) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension) { + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { auto k = i / 4; auto j = i - 4 * k; assert(j < 4); @@ -134,8 +134,8 @@ struct fillBulk { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t firstElementIdx = firstElementIdxNoStride[0u]; firstElementIdx < n; firstElementIdx += 
gridDimension) { - for (uint32_t k = firstElementIdx; k < endElementIdx; ++k) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { + for (uint32_t k = threadIdx; k < endElementIdx; ++k) { auto m = tk[k][3] < MaxElem ? 4 : 3; assoc->bulkFill(acc, *apc, &tk[k][0], m); } From c6e30b84ec052c3b00bbfc98719ab2df9553eb32 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Tue, 26 Jan 2021 16:27:34 +0100 Subject: [PATCH 12/32] Minor fixes --- src/alpaka/AlpakaCore/HistoContainer.h | 4 ++-- src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 95546c288..6ab094abc 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -174,7 +174,7 @@ namespace cms { // iteratate over N bins left and right of the one containing "v" template - ALPAKA_FN_HOST ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { int bs = Hist::bin(value); int be = std::min(int(Hist::nbins() - 1), bs + n); bs = std::max(0, bs - n); @@ -186,7 +186,7 @@ namespace cms { // iteratate over bins containing all values in window wmin, wmax template - ALPAKA_FN_HOST ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { auto bs = Hist::bin(wmin); auto be = Hist::bin(wmax); assert(be >= bs); diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index 6b2154a05..18376381b 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -27,7 +27,7 @@ struct countMultiLocal { for (uint32_t threadIdx = 
firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { for (uint32_t i = threadIdx; i < endElementIdx; ++i) { auto&& local = alpaka::block::shared::st::allocVar(acc); - if (threadIdxLocal == 0) { // TO DO: use launchZero????? + if (threadIdxLocal == 0) { local.zero(); } alpaka::block::sync::syncBlockThreads(acc); @@ -168,8 +168,8 @@ int main() { constexpr uint32_t N = 4000; - auto tr_buf = alpaka::mem::buf::alloc, Idx>(host, N); - auto tr = alpaka::mem::view::getPtrNative(tr_buf); + auto tr_hbuf = alpaka::mem::buf::alloc, Idx>(host, N); + auto tr = alpaka::mem::view::getPtrNative(tr_hbuf); // fill with "index" to element long long ave = 0; int imax = 0; @@ -206,7 +206,7 @@ int main() { auto v_dbuf = alpaka::mem::buf::alloc, Idx>(device, N); - alpaka::mem::view::copy(queue, v_dbuf, tr_buf, N); + alpaka::mem::view::copy(queue, v_dbuf, tr_hbuf, N); auto a_dbuf = alpaka::mem::buf::alloc(device, 1u); @@ -300,7 +300,6 @@ int main() { )); alpaka::mem::view::copy(queue, la_hbuf, a_dbuf, 1u); - alpaka::wait::wait(queue); // TO DO: Needed??? 
auto dc_hbuf = alpaka::mem::buf::alloc(host, 1u); alpaka::mem::view::copy(queue, dc_hbuf, dc_dbuf, 1u); From 995af062e207e7a50fcea3a538d511068766a01d Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Tue, 26 Jan 2021 16:33:10 +0100 Subject: [PATCH 13/32] [alpaka] Add OneHistoContainer_t test --- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 src/alpaka/test/alpaka/OneHistoContainer_t.cc diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc new file mode 100644 index 000000000..d8efa063e --- /dev/null +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -0,0 +1,177 @@ +#include +#include +#include +#include +#include + +#include "AlpakaCore/HistoContainer.h" + +struct mykernel { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, T const* __restrict__ v, uint32_t N) const { + assert(v); + assert(N == 12000); + + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); + if (threadIdxLocal == 0) { + printf("start kernel for %d data\n", N); + } + + using Hist = cms::alpakatools::HistoContainer; + + auto&& hist = alpaka::block::shared::st::allocVar(acc); + auto&& ws = alpaka::block::shared::st::allocVar(acc); + + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + + const auto& [firstElementIdxTotBins, endElementIdxTotBins] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Hist::totbins())); + for (uint32_t j = firstElementIdxTotBins[0u]; j < endElementIdxTotBins[0u]; j += blockDimension) { + hist.off[j] = 0; + } + alpaka::block::sync::syncBlockThreads(acc); + + const auto& [firstElementIdxCapacity, endElementIdxCapacity] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Hist::capacity())); + for (uint32_t j = firstElementIdxCapacity[0u]; j < endElementIdxCapacity[0u]; j += blockDimension) { + hist.off[j] = 0; + } + alpaka::block::sync::syncBlockThreads(acc); + + const auto& 
[firstElementIdxN, endElementIdxN] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); + for (uint32_t j = firstElementIdxN[0u]; j < endElementIdxN[0u]; j += blockDimension) { + hist.count(acc, v[j]); + } + alpaka::block::sync::syncBlockThreads(acc); + + assert(0 == hist.size()); + alpaka::block::sync::syncBlockThreads(acc); + + hist.finalize(acc, ws); + alpaka::block::sync::syncBlockThreads(acc); + + if (threadIdxLocal == 0) { + printf("hist.size() = %u.\n", hist.size()); + } + //assert(N == hist.size()); + const auto& [firstElementIdxNBins, endElementIdxNBins] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Hist::nbins())); + for (uint32_t j = firstElementIdxNBins[0u]; j < endElementIdxNBins[0u]; j += blockDimension) { + assert(hist.off[j] <= hist.off[j + 1]); + } + alpaka::block::sync::syncBlockThreads(acc); + + if (threadIdxLocal < 32) { + ws[threadIdxLocal] = 0; // used by prefix scan... + } + alpaka::block::sync::syncBlockThreads(acc); + + for (uint32_t j = firstElementIdxN[0u]; j < endElementIdxN[0u]; j += blockDimension) { + hist.fill(acc, v[j], j); + } + alpaka::block::sync::syncBlockThreads(acc); + assert(0 == hist.off[0]); + //assert(N == hist.size()); + + const auto& [firstElementIdxSizeMinus1, endElementIdxSizeMinus1] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(hist.size() - 1)); + for (uint32_t j = firstElementIdxSizeMinus1[0u]; j < endElementIdxSizeMinus1[0u]; j += blockDimension) { + auto p = hist.begin() + j; + assert((*p) < N); + auto k1 = Hist::bin(v[*p]); + auto k2 = Hist::bin(v[*(p + 1)]); + assert(k2 >= k1); + } + + const auto& [firstElementIdxSize, endElementIdxSize] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(hist.size())); + for (uint32_t i = firstElementIdxSize[0u]; i < endElementIdxSize[0u]; i += blockDimension) { + auto p = hist.begin() + i; + auto j = *p; + auto b0 = Hist::bin(v[j]); + int tot = 0; + auto ftest = [&](unsigned int k) { + assert(k < N); + ++tot; + }; 
+ cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); + int rtot = hist.size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + cms::alpakatools::forEachInWindow(hist, vm, vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); + assert(tot == rtot); + } + + } +}; + +template +void go(const DevHost& host, const DevAcc1& device, Queue& queue) { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + constexpr unsigned int N = 12000; + + using Hist = cms::alpakatools::HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' + << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + auto v_hbuf = alpaka::mem::buf::alloc(host, N); + auto v = alpaka::mem::view::getPtrNative(v_hbuf); + + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + + + auto v_dbuf = alpaka::mem::buf::alloc(device, N); + alpaka::mem::view::copy(queue, v_dbuf, v_hbuf, N); + + const Vec1& threadsPerBlockOrElementsPerThread(Vec1::all(256)); + const Vec1& blocksPerGrid(Vec1::all(1)); + const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + mykernel(), + alpaka::mem::view::getPtrNative(v_dbuf), + N + )); + 
alpaka::wait::wait(queue); + //launch(mykernel, {1, 256}, v_d.get(), N); + } +} + +int main() { + const DevHost host(alpaka::pltf::getDevByIdx(0u)); + const DevAcc1 device(alpaka::pltf::getDevByIdx(0u)); + Queue queue(device); + + go(host, device, queue); + //go(host, device, queue); + //go(host, device, queue); + + return 0; +} From 82ffd1443a801357206f47564e0b2b6c324e2f82 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Tue, 26 Jan 2021 18:33:48 +0100 Subject: [PATCH 14/32] Fixes in OneHistoContainer_t --- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 278 ++++++++++++------ 1 file changed, 196 insertions(+), 82 deletions(-) diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index d8efa063e..840e64ed3 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -6,117 +6,153 @@ #include "AlpakaCore/HistoContainer.h" -struct mykernel { - template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, T const* __restrict__ v, uint32_t N) const { - assert(v); - assert(N == 12000); - - const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); - if (threadIdxLocal == 0) { - printf("start kernel for %d data\n", N); - } +struct setZero { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, + Histo *__restrict__ hist) const { - using Hist = cms::alpakatools::HistoContainer; + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdx, endElementIdx] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); + for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { + hist->off[j] = 0; + } + } +}; - auto&& hist = alpaka::block::shared::st::allocVar(acc); - auto&& ws = alpaka::block::shared::st::allocVar(acc); +struct setZeroBins { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, + Histo *__restrict__ hist) const { 
const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - - const auto& [firstElementIdxTotBins, endElementIdxTotBins] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Hist::totbins())); - for (uint32_t j = firstElementIdxTotBins[0u]; j < endElementIdxTotBins[0u]; j += blockDimension) { - hist.off[j] = 0; + const auto& [firstElementIdx, endElementIdx] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::capacity())); + for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { + hist->bins[j] = 0; } - alpaka::block::sync::syncBlockThreads(acc); + } +}; - const auto& [firstElementIdxCapacity, endElementIdxCapacity] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Hist::capacity())); - for (uint32_t j = firstElementIdxCapacity[0u]; j < endElementIdxCapacity[0u]; j += blockDimension) { - hist.off[j] = 0; - } - alpaka::block::sync::syncBlockThreads(acc); +struct count { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, + Histo *__restrict__ hist, + T* v, + uint32_t N) const { - const auto& [firstElementIdxN, endElementIdxN] = + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdx, endElementIdx] = cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); - for (uint32_t j = firstElementIdxN[0u]; j < endElementIdxN[0u]; j += blockDimension) { - hist.count(acc, v[j]); + for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { + hist->count(acc, v[j]); } - alpaka::block::sync::syncBlockThreads(acc); + } +}; - assert(0 == hist.size()); - alpaka::block::sync::syncBlockThreads(acc); +struct finalize { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, + Histo *__restrict__ hist) const { + auto&& ws = alpaka::block::shared::st::allocVar(acc); + hist->finalize(acc, ws); + } +}; - hist.finalize(acc, ws); - alpaka::block::sync::syncBlockThreads(acc); +struct 
verify { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, + Histo *__restrict__ hist) const { - if (threadIdxLocal == 0) { - printf("hist.size() = %u.\n", hist.size()); - } - //assert(N == hist.size()); - const auto& [firstElementIdxNBins, endElementIdxNBins] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Hist::nbins())); - for (uint32_t j = firstElementIdxNBins[0u]; j < endElementIdxNBins[0u]; j += blockDimension) { - assert(hist.off[j] <= hist.off[j + 1]); - } - alpaka::block::sync::syncBlockThreads(acc); + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdx, endElementIdx] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::nbins())); + for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { + assert(hist->off[j] <= hist->off[j + 1]); + } + } +}; - if (threadIdxLocal < 32) { - ws[threadIdxLocal] = 0; // used by prefix scan... - } - alpaka::block::sync::syncBlockThreads(acc); +struct fill { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, + Histo *__restrict__ hist, + T* v, + uint32_t N) const { - for (uint32_t j = firstElementIdxN[0u]; j < endElementIdxN[0u]; j += blockDimension) { - hist.fill(acc, v[j], j); + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdx, endElementIdx] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); + for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { + hist->fill(acc, v[j], j); } - alpaka::block::sync::syncBlockThreads(acc); - assert(0 == hist.off[0]); - //assert(N == hist.size()); - - const auto& [firstElementIdxSizeMinus1, endElementIdxSizeMinus1] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(hist.size() - 1)); - for (uint32_t j = firstElementIdxSizeMinus1[0u]; j < endElementIdxSizeMinus1[0u]; j += blockDimension) { - auto p = hist.begin() + j; + } 
+}; + +struct bin { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, + Histo *__restrict__ hist, + T* v, + uint32_t N) const { + + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdx, endElementIdx] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size() - 1)); + for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { + auto p = hist->begin() + j; assert((*p) < N); - auto k1 = Hist::bin(v[*p]); - auto k2 = Hist::bin(v[*(p + 1)]); + auto k1 = Histo::bin(v[*p]); + auto k2 = Histo::bin(v[*(p + 1)]); assert(k2 >= k1); } + } +}; + +struct forEachInWindow { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, + Histo *__restrict__ hist, + const T* v, + uint32_t N, + const int NBINS, + const int DELTA) const { - const auto& [firstElementIdxSize, endElementIdxSize] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(hist.size())); - for (uint32_t i = firstElementIdxSize[0u]; i < endElementIdxSize[0u]; i += blockDimension) { - auto p = hist.begin() + i; + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto& [firstElementIdx, endElementIdx] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size())); + for (uint32_t i = firstElementIdx[0u]; i < endElementIdx[0u]; i += blockDimension) { + auto p = hist->begin() + i; auto j = *p; - auto b0 = Hist::bin(v[j]); + auto b0 = Histo::bin(v[j]); int tot = 0; auto ftest = [&](unsigned int k) { assert(k < N); ++tot; }; - cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); - int rtot = hist.size(b0); + cms::alpakatools::forEachInWindow(*hist, v[j], v[j], ftest); + int rtot = hist->size(b0); assert(tot == rtot); tot = 0; auto vm = int(v[j]) - DELTA; auto vp = int(v[j]) + DELTA; - constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); + const int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); vm = std::max(vm, 0); vm = std::min(vm, vmax); vp = std::min(vp, vmax); vp = std::max(vp, 0); assert(vp >= vm); - cms::alpakatools::forEachInWindow(hist, vm, vp, ftest); - int bp = Hist::bin(vp); - int bm = Hist::bin(vm); - rtot = hist.end(bp) - hist.begin(bm); + cms::alpakatools::forEachInWindow(*hist, vm, vp, ftest); + int bp = Histo::bin(vp); + int bm = Histo::bin(vm); + rtot = hist->end(bp) - hist->begin(bm); assert(tot == rtot); - } - + } } }; + + template void go(const DevHost& host, const DevAcc1& device, Queue& queue) { std::mt19937 eng; @@ -140,27 +176,105 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { auto v = alpaka::mem::view::getPtrNative(v_hbuf); for (int it = 0; it < 5; ++it) { - for (long long j = 0; j < N; j++) + + for (long long j = 0; j < N; j++) { v[j] = rgen(eng); - if (it == 2) - for (long long j = N / 2; j < N / 2 + N / 4; j++) + } + if (it == 2) { + for (long long j = N / 2; j < N / 2 + N / 4; j++) { v[j] = 4; - + } + } auto v_dbuf = alpaka::mem::buf::alloc(device, N); alpaka::mem::view::copy(queue, v_dbuf, v_hbuf, N); - + + + printf("start kernel for %d data\n", N); + + using HistTeam = cms::alpakatools::HistoContainer; + auto hist_hbuf = alpaka::mem::buf::alloc(host, 1u); + auto hist = alpaka::mem::view::getPtrNative(hist_hbuf); + auto hist_dbuf = alpaka::mem::buf::alloc(device, 1u); + alpaka::mem::view::set(queue, hist_dbuf, 0, 1u); + const Vec1& threadsPerBlockOrElementsPerThread(Vec1::all(256)); const Vec1& blocksPerGrid(Vec1::all(1)); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + setZero(), + alpaka::mem::view::getPtrNative(hist_dbuf) + )); + alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, - mykernel(), + setZeroBins(), + alpaka::mem::view::getPtrNative(hist_dbuf) + )); + + alpaka::queue::enqueue(queue, + 
alpaka::kernel::createTaskKernel(workDiv, + count(), + alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N )); + + alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); + alpaka::wait::wait(queue); + assert(0 == hist->size()); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + finalize(), + alpaka::mem::view::getPtrNative(hist_dbuf) + )); + + printf("hist->size() = %u.\n", hist->size()); + alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); + alpaka::wait::wait(queue); + //assert(N == hist->size()); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + verify(), + alpaka::mem::view::getPtrNative(hist_dbuf) + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + fill(), + alpaka::mem::view::getPtrNative(hist_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + N + )); + + alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); + alpaka::wait::wait(queue); + assert(0 == hist->off[0]); + //assert(N == hist->size()); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + bin(), + alpaka::mem::view::getPtrNative(hist_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + N + )); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDiv, + forEachInWindow(), + alpaka::mem::view::getPtrNative(hist_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + N, + NBINS, + DELTA + )); + alpaka::wait::wait(queue); - //launch(mykernel, {1, 256}, v_d.get(), N); } } @@ -170,8 +284,8 @@ int main() { Queue queue(device); go(host, device, queue); - //go(host, device, queue); - //go(host, device, queue); + go(host, device, queue); + go(host, device, queue); return 0; } From c6477b6e6434e64df8d19ee07ac4fb67d8deac0a Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Tue, 26 Jan 2021 19:46:38 +0100 Subject: [PATCH 15/32] [alpaka] Important to initlize pointers properly, especially when located within a loop. 
--- src/alpaka/test/alpaka/HistoContainer_t.cc | 2 ++ src/alpaka/test/alpaka/OneHistoContainer_t.cc | 2 +- src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index 2bf1325d3..b4b5b7b2f 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -70,6 +70,8 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { alpaka::mem::view::copy(queue, v_d, v_buf, N); + alpaka::mem::view::set(queue, h_d, 0, 1u); + std::cout << "Calling fillManyFromVector" << std::endl; fillManyFromVector(alpaka::mem::view::getPtrNative(h_d), nParts, diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index 840e64ed3..ee95282c3 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -235,7 +235,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { printf("hist->size() = %u.\n", hist->size()); alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); alpaka::wait::wait(queue); - //assert(N == hist->size()); + //assert(N == hist->size()); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index 18376381b..2d0ef8807 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -209,6 +209,7 @@ int main() { alpaka::mem::view::copy(queue, v_dbuf, tr_hbuf, N); auto a_dbuf = alpaka::mem::buf::alloc(device, 1u); + alpaka::mem::view::set(queue, a_dbuf, 0, 1u); const unsigned int nThreads = 256; const Vec1 threadsPerBlockOrElementsPerThread(nThreads); @@ -309,6 +310,7 @@ int main() { alpaka::mem::view::set(queue, dc_dbuf, 0, 1u); auto sa_dbuf = alpaka::mem::buf::alloc(device, 1u); + 
alpaka::mem::view::set(queue, sa_dbuf, 0, 1u); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, @@ -352,7 +354,9 @@ int main() { // here verify use of block local counters auto m1_dbuf = alpaka::mem::buf::alloc(device, 1u); + alpaka::mem::view::set(queue, m1_dbuf, 0, 1u); auto m2_dbuf = alpaka::mem::buf::alloc(device, 1u); + alpaka::mem::view::set(queue, m2_dbuf, 0, 1u); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv4N, From 94f94e68337d7530d55d864f0ae0240a581b67f6 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Wed, 27 Jan 2021 11:57:43 +0100 Subject: [PATCH 16/32] Important fix in OneHistoContainer_t test: correct strided access, now all assert pass and histo values are identical as with CUDA (for same input matrix v). --- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 135 +++++++++++------- 1 file changed, 81 insertions(+), 54 deletions(-) diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index ee95282c3..bda063409 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -12,10 +12,14 @@ struct setZero { Histo *__restrict__ hist) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdx, endElementIdx] = + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); - for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { - hist->off[j] = 0; + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Histo::totbins(); threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(Histo::totbins(), endElementIdx); ++j) { + hist->off[j] = 0; + } + endElementIdx += blockDimension; } } }; @@ -26,10 +30,14 @@ struct setZeroBins { Histo *__restrict__ hist) const { const 
uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdx, endElementIdx] = + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::capacity())); - for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { - hist->bins[j] = 0; + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Histo::capacity(); threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, Histo::capacity()); ++j) { + hist->bins[j] = 0; + } + endElementIdx += blockDimension; } } }; @@ -42,10 +50,14 @@ struct count { uint32_t N) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdx, endElementIdx] = + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); - for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { - hist->count(acc, v[j]); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { + hist->count(acc, v[j]); + } + endElementIdx += blockDimension; } } }; @@ -65,11 +77,15 @@ struct verify { Histo *__restrict__ hist) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdx, endElementIdx] = + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::nbins())); - for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { - assert(hist->off[j] <= hist->off[j + 1]); - } + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < 
Histo::nbins(); threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, Histo::nbins()); ++j) { + assert(hist->off[j] <= hist->off[j + 1]); + } + endElementIdx += blockDimension; + } } }; @@ -81,10 +97,14 @@ struct fill { uint32_t N) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdx, endElementIdx] = + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); - for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { - hist->fill(acc, v[j], j); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { + hist->fill(acc, v[j], j); + } + endElementIdx += blockDimension; } } }; @@ -97,14 +117,18 @@ struct bin { uint32_t N) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdx, endElementIdx] = + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size() - 1)); - for (uint32_t j = firstElementIdx[0u]; j < endElementIdx[0u]; j += blockDimension) { - auto p = hist->begin() + j; - assert((*p) < N); - auto k1 = Histo::bin(v[*p]); - auto k2 = Histo::bin(v[*(p + 1)]); - assert(k2 >= k1); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist->size() - 1; threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, hist->size() - 1); ++j) { + auto p = hist->begin() + j; + assert((*p) < N); + auto k1 = Histo::bin(v[*p]); + auto k2 = Histo::bin(v[*(p + 1)]); + assert(k2 >= k1); + } + endElementIdx += blockDimension; } } }; @@ -119,35 +143,39 @@ struct forEachInWindow { const int DELTA) const { 
const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdx, endElementIdx] = + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size())); - for (uint32_t i = firstElementIdx[0u]; i < endElementIdx[0u]; i += blockDimension) { - auto p = hist->begin() + i; - auto j = *p; - auto b0 = Histo::bin(v[j]); - int tot = 0; - auto ftest = [&](unsigned int k) { - assert(k < N); - ++tot; - }; - cms::alpakatools::forEachInWindow(*hist, v[j], v[j], ftest); - int rtot = hist->size(b0); - assert(tot == rtot); - tot = 0; - auto vm = int(v[j]) - DELTA; - auto vp = int(v[j]) + DELTA; - const int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); - vm = std::max(vm, 0); - vm = std::min(vm, vmax); - vp = std::min(vp, vmax); - vp = std::max(vp, 0); - assert(vp >= vm); - cms::alpakatools::forEachInWindow(*hist, vm, vp, ftest); - int bp = Histo::bin(vp); - int bm = Histo::bin(vm); - rtot = hist->end(bp) - hist->begin(bm); - assert(tot == rtot); - } + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist->size(); threadIdx += blockDimension) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, hist->size()); ++i) { + auto p = hist->begin() + i; + auto j = *p; + auto b0 = Histo::bin(v[j]); + int tot = 0; + auto ftest = [&](unsigned int k) { + assert(k < N); + ++tot; + }; + cms::alpakatools::forEachInWindow(*hist, v[j], v[j], ftest); + int rtot = hist->size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + const int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + cms::alpakatools::forEachInWindow(*hist, vm, vp, ftest); + int bp = Histo::bin(vp); + int bm = Histo::bin(vm); + rtot = hist->end(bp) - hist->begin(bm); + assert(tot == rtot); + } + endElementIdx += blockDimension; + } } }; @@ -232,10 +260,9 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { alpaka::mem::view::getPtrNative(hist_dbuf) )); - printf("hist->size() = %u.\n", hist->size()); alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); alpaka::wait::wait(queue); - //assert(N == hist->size()); + assert(N == hist->size()); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, @@ -254,7 +281,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); alpaka::wait::wait(queue); assert(0 == hist->off[0]); - //assert(N == hist->size()); + assert(N == hist->size()); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, From 2ff31f9cef66d6437aa90078be9c7975ae0a53ff Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Wed, 27 Jan 2021 12:12:58 +0100 Subject: [PATCH 17/32] Also correct strided access in serial and TBB cases in HistoContainer and OneToManyAssoc_t --- src/alpaka/AlpakaCore/HistoContainer.h | 6 +++--- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 2 +- src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 6ab094abc..a892363a9 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -30,7 +30,7 @@ namespace cms { const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); uint32_t endElementIdx = 
endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < endElementIdx; ++i) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; @@ -56,7 +56,7 @@ namespace cms { uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < endElementIdx; ++i) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; @@ -309,7 +309,7 @@ namespace cms { uint32_t endElementIdx = m + endElementIdxNoStride[0u]; for (uint32_t threadIdx = m + firstElementIdxNoStride[0u]; threadIdx < totbins(); threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < endElementIdx; ++i) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins());; ++i) { off[i] = n; } endElementIdx += gridDimension; diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index bda063409..4116e2fff 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -16,7 +16,7 @@ struct setZero { cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Histo::totbins(); threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(Histo::totbins(), endElementIdx); ++j) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, Histo::totbins()); ++j) { hist->off[j] = 0; } endElementIdx += blockDimension; diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc 
b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index 2d0ef8807..c9f6110e2 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -25,7 +25,7 @@ struct countMultiLocal { const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < endElementIdx; ++i) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { auto&& local = alpaka::block::shared::st::allocVar(acc); if (threadIdxLocal == 0) { local.zero(); @@ -49,7 +49,7 @@ struct countMulti { const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < endElementIdx; ++i) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { assoc->countDirect(acc, 2 + i % 4); } endElementIdx += gridDimension; @@ -65,7 +65,7 @@ struct verifyMulti { const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < endElementIdx; ++i) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { assert(m1->off[i] == m2->off[i]); } endElementIdx += gridDimension; @@ -81,7 +81,7 @@ struct count { const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); uint32_t 
endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < endElementIdx; ++i) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { auto k = i / 4; auto j = i - 4 * k; assert(j < 4); @@ -105,7 +105,7 @@ struct fill { const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < endElementIdx; ++i) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { auto k = i / 4; auto j = i - 4 * k; assert(j < 4); @@ -135,7 +135,7 @@ struct fillBulk { const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { - for (uint32_t k = threadIdx; k < endElementIdx; ++k) { + for (uint32_t k = threadIdx; k < std::min(endElementIdx, n); ++k) { auto m = tk[k][3] < MaxElem ? 
4 : 3; assoc->bulkFill(acc, *apc, &tk[k][0], m); } From 8e9e776e99c7b8f27e4cb20555029db5cc6f3846 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Wed, 27 Jan 2021 12:30:07 +0100 Subject: [PATCH 18/32] Simplify Vec construction --- src/alpaka/AlpakaCore/HistoContainer.h | 10 +++++----- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 4 ++-- src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 6 ++---- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index a892363a9..a9ec8d98c 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -102,9 +102,9 @@ namespace cms { uint32_t* psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); const unsigned int nthreads = 1024; + const Vec1 threadsPerBlockOrElementsPerThread(nthreads); const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; - const Vec1 &blocksPerGrid(Vec1::all(nblocks)); - const Vec1 &threadsPerBlockOrElementsPerThread(Vec1::all(nthreads)); + const Vec1 blocksPerGrid(nblocks); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, @@ -143,8 +143,8 @@ namespace cms { const DevAcc1& device, Queue& queue) { const unsigned int nblocks = (totSize + nthreads - 1) / nthreads; - const Vec1 &blocksPerGrid(Vec1::all(nblocks)); - const Vec1 &threadsPerBlockOrElementsPerThread(Vec1::all(nthreads)); + const Vec1 blocksPerGrid(nblocks); + const Vec1 threadsPerBlockOrElementsPerThread(nthreads); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, @@ -309,7 +309,7 @@ namespace cms { uint32_t endElementIdx = m + endElementIdxNoStride[0u]; for (uint32_t threadIdx = m + firstElementIdxNoStride[0u]; threadIdx < totbins(); threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins());; ++i) { + for (uint32_t i 
= threadIdx; i < std::min(endElementIdx, totbins()); ++i) { off[i] = n; } endElementIdx += gridDimension; diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index 4116e2fff..d062cf64e 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -226,8 +226,8 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { auto hist_dbuf = alpaka::mem::buf::alloc(device, 1u); alpaka::mem::view::set(queue, hist_dbuf, 0, 1u); - const Vec1& threadsPerBlockOrElementsPerThread(Vec1::all(256)); - const Vec1& blocksPerGrid(Vec1::all(1)); + const Vec1 threadsPerBlockOrElementsPerThread(256u); + const Vec1 blocksPerGrid(1u); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index c9f6110e2..fe1c10f01 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -386,10 +386,8 @@ int main() { N )); - const unsigned int nBlocksTotBins = 1; - const Vec1 blocksPerGridTotBins(nBlocksTotBins); - const unsigned int nThreadsTotBins = Multiplicity::totbins(); - const Vec1 threadsPerBlockOrElementsPerThreadTotBins(nThreadsTotBins); + const Vec1 blocksPerGridTotBins(1u); + const Vec1 threadsPerBlockOrElementsPerThreadTotBins(Multiplicity::totbins()); const WorkDiv1 &workDivTotBins = cms::alpakatools::make_workdiv(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins); alpaka::queue::enqueue(queue, From 8aaa1f3a13b265a8e0676e98ca202a63cbe3f0d1 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Wed, 27 Jan 2021 12:50:33 +0100 Subject: [PATCH 19/32] clang-format --- src/alpaka/AlpakaCore/HistoContainer.h | 421 +++++++++--------- src/alpaka/test/alpaka/HistoContainer_t.cc | 21 +- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 229 
++++------ src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 311 ++++++------- 4 files changed, 460 insertions(+), 522 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index a9ec8d98c..670e7a82b 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -6,7 +6,6 @@ #include #include - #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/alpakaWorkDivHelper.h" #include "AlpakaCore/AtomicPairCounter.h" @@ -20,161 +19,148 @@ namespace cms { struct countFromVector { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, - Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) const { - const uint32_t nt = offsets[nh]; - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { - auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - h->count(acc, v[i], ih); - } - endElementIdx += gridDimension; - } + ALPAKA_FN_ACC void operator()(const T_Acc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { 
+ for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + h->count(acc, v[i], ih); + } + endElementIdx += gridDimension; + } } }; struct fillFromVector { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, - Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) const { - const uint32_t nt = offsets[nh]; - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); - - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { - auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - h->fill(acc, v[i], i, ih); - } - endElementIdx += gridDimension; - } + ALPAKA_FN_ACC void operator()(const T_Acc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + 
assert(ih < int(nh)); + h->fill(acc, v[i], i, ih); + } + endElementIdx += gridDimension; + } } }; struct launchZero { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc, - Histo *__restrict__ h) const { - const auto &[firstElementIdxGlobal, endElementIdxGlobal] = cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); - - for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { - h->off[i] = 0; - } + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc, + Histo *__restrict__ h) const { + const auto &[firstElementIdxGlobal, endElementIdxGlobal] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); + + for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { + h->off[i] = 0; + } } }; struct storePrefixScanWorkingSpace { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, - Histo *__restrict__ h, - const unsigned int nblocks) const { - h->psws = nblocks; + ALPAKA_FN_ACC void operator()(const T_Acc &acc, Histo *__restrict__ h, const unsigned int nblocks) const { + h->psws = nblocks; } }; template - ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, - const DevAcc1& device, - Queue& queue) { - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - - const int num_items = Histo::totbins(); - - auto psum_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items)); - uint32_t* psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); - - const unsigned int nthreads = 1024; - const Vec1 threadsPerBlockOrElementsPerThread(nthreads); - const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; - const Vec1 blocksPerGrid(nblocks); - - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, - storePrefixScanWorkingSpace(), - h, - nblocks)); 
- - const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - multiBlockPrefixScanFirstStep(), - poff, - poff, - psum_d, - num_items)); - - const WorkDiv1 &workDivWith1Block = cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivWith1Block, - multiBlockPrefixScanSecondStep(), - poff, - poff, - psum_d, - num_items, - nblocks)); - alpaka::wait::wait(queue); - } + ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, + const DevAcc1 &device, + Queue &queue) { + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + + const int num_items = Histo::totbins(); + + auto psum_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items)); + uint32_t *psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); + + const unsigned int nthreads = 1024; + const Vec1 threadsPerBlockOrElementsPerThread(nthreads); + const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; + const Vec1 blocksPerGrid(nblocks); + + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel( + WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, storePrefixScanWorkingSpace(), h, nblocks)); + + const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDiv, multiBlockPrefixScanFirstStep(), poff, poff, psum_d, num_items)); + + const WorkDiv1 &workDivWith1Block = + cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel( + workDivWith1Block, multiBlockPrefixScanSecondStep(), poff, poff, psum_d, num_items, nblocks)); + alpaka::wait::wait(queue); + } template - ALPAKA_FN_HOST 
ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - uint32_t totSize, - unsigned int nthreads, - const DevAcc1& device, - Queue& queue) { + ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector( + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + unsigned int nthreads, + const DevAcc1 &device, + Queue &queue) { const unsigned int nblocks = (totSize + nthreads - 1) / nthreads; - const Vec1 blocksPerGrid(nblocks); + const Vec1 blocksPerGrid(nblocks); const Vec1 threadsPerBlockOrElementsPerThread(nthreads); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - launchZero(), - h)); + alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, launchZero(), h)); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - countFromVector(), - h, nh, v, offsets)); + alpaka::kernel::createTaskKernel(workDiv, countFromVector(), h, nh, v, offsets)); launchFinalize(h, device, queue); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - fillFromVector(), - h, nh, v, offsets)); + alpaka::kernel::createTaskKernel(workDiv, fillFromVector(), h, nh, v, offsets)); alpaka::wait::wait(queue); } struct finalizeBulk { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) const { - assoc->bulkFinalizeFill(acc, *apc); + ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) const { + assoc->bulkFinalizeFill(acc, *apc); } }; // iteratate over N bins left and right of the one containing "v" template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const 
&hist, V value, int n, Func func) { + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { int bs = Hist::bin(value); int be = std::min(int(Hist::nbins() - 1), bs + n); bs = std::max(0, bs - n); @@ -186,7 +172,7 @@ namespace cms { // iteratate over bins containing all values in window wmin, wmax template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { auto bs = Hist::bin(wmin); auto be = Hist::bin(wmax); assert(be >= bs); @@ -195,8 +181,6 @@ namespace cms { } } - - template class HistoContainer { public: - using Counter = uint32_t; using CountersOnly = HistoContainer; @@ -242,122 +225,124 @@ namespace cms { return (t >> shift) & mask; } - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void zero() { - for (auto &i : off) - i = 0; - } + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void zero() { + for (auto &i : off) + i = 0; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const T_Acc& acc, CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { - alpaka::atomic::atomicOp(acc, off + i, co.off[i]); + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const T_Acc &acc, CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { + alpaka::atomic::atomicOp(acc, off + i, co.off[i]); + } } - } - template - static ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(const T_Acc& acc, Counter &x) { - return alpaka::atomic::atomicOp(acc, &x, 1u); - } + template + static ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(const T_Acc &acc, Counter &x) { + return alpaka::atomic::atomicOp(acc, &x, 1u); + } - template - static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicDecrement(const T_Acc& acc, Counter &x) { - return alpaka::atomic::atomicOp(acc, &x, 1u); - } + template + static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t 
atomicDecrement(const T_Acc &acc, Counter &x) { + return alpaka::atomic::atomicOp(acc, &x, 1u); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void countDirect(const T_Acc& acc, T b) { - assert(b < nbins()); - atomicIncrement(acc, off[b]); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void countDirect(const T_Acc &acc, T b) { + assert(b < nbins()); + atomicIncrement(acc, off[b]); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void fillDirect(const T_Acc& acc, T b, index_type j) { - assert(b < nbins()); - auto w = atomicDecrement(acc, off[b]); - assert(w > 0); - bins[w - 1] = j; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fillDirect(const T_Acc &acc, T b, index_type j) { + assert(b < nbins()); + auto w = atomicDecrement(acc, off[b]); + assert(w > 0); + bins[w - 1] = j; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t bulkFill(const T_Acc& acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(acc, n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t + bulkFill(const T_Acc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(acc, n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalize(const T_Acc& acc, AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalize(const T_Acc &acc, AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, AtomicPairCounter const &apc) { + auto 
m = apc.get().m; + auto n = apc.get().n; - if (m >= nbins()) { // overflow! - off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } + if (m >= nbins()) { // overflow! + off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); - uint32_t endElementIdx = m + endElementIdxNoStride[0u]; - for (uint32_t threadIdx = m + firstElementIdxNoStride[0u]; threadIdx < totbins(); threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins()); ++i) { - off[i] = n; - } - endElementIdx += gridDimension; + uint32_t endElementIdx = m + endElementIdxNoStride[0u]; + for (uint32_t threadIdx = m + firstElementIdxNoStride[0u]; threadIdx < totbins(); threadIdx += gridDimension) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins()); ++i) { + off[i] = n; + } + endElementIdx += gridDimension; + } } - } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc& acc, T t) { - uint32_t b = bin(t); - assert(b < nbins()); - atomicIncrement(acc, off[b]); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(acc, off[b]); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc& acc, T t, index_type j) { - uint32_t b = bin(t); - assert(b < nbins()); - auto w = atomicDecrement(acc, off[b]); - assert(w > 0); - bins[w - 1] = j; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(acc, off[b]); + assert(w > 
0); + bins[w - 1] = j; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc& acc, T t, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - atomicIncrement(acc, off[b]); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(acc, off[b]); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc& acc, T t, index_type j, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - auto w = atomicDecrement(acc, off[b]); - assert(w > 0); - bins[w - 1] = j; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(acc, off[b]); + assert(w > 0); + bins[w - 1] = j; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void finalize(const T_Acc& acc, Counter *ws = nullptr) { - assert(off[totbins() - 1] == 0); - blockPrefixScan(acc, off, totbins(), ws); - assert(off[totbins() - 1] == off[totbins() - 2]); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void finalize(const T_Acc &acc, Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(acc, off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } constexpr auto size() const { return uint32_t(off[totbins() - 1]); } constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index b4b5b7b2f..237642829 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -23,12 +23,12 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { constexpr uint32_t nParts = 10; constexpr 
uint32_t partSize = N / nParts; - + using Hist = cms::alpakatools::HistoContainer; std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << offsetof(Hist, bins) - offsetof(Hist, off) << ' ' << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; - + auto offsets_buf = alpaka::mem::buf::alloc(host, nParts + 1); auto offsets = alpaka::mem::view::getPtrNative(offsets_buf); auto off_d = alpaka::mem::buf::alloc(device, nParts + 1); @@ -37,7 +37,6 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { auto h_d = alpaka::mem::buf::alloc(device, 1u); for (int it = 0; it < 5; ++it) { - offsets[0] = 0; for (uint32_t j = 1; j < nParts + 1; ++j) { offsets[j] = offsets[j - 1] + partSize - 3 * j; @@ -73,14 +72,14 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { alpaka::mem::view::set(queue, h_d, 0, 1u); std::cout << "Calling fillManyFromVector" << std::endl; - fillManyFromVector(alpaka::mem::view::getPtrNative(h_d), - nParts, - alpaka::mem::view::getPtrNative(v_d), - alpaka::mem::view::getPtrNative(off_d), - offsets[10], - 256, - device, - queue); + fillManyFromVector(alpaka::mem::view::getPtrNative(h_d), + nParts, + alpaka::mem::view::getPtrNative(v_d), + alpaka::mem::view::getPtrNative(off_d), + offsets[10], + 256, + device, + queue); alpaka::mem::view::copy(queue, h_buf, h_d, 1u); alpaka::wait::wait(queue); diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index d062cf64e..349c3b665 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -8,16 +8,14 @@ struct setZero { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, - Histo *__restrict__ hist) const { - + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist) const { const uint32_t 
blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Histo::totbins(); threadIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, Histo::totbins()); ++j) { - hist->off[j] = 0; + hist->off[j] = 0; } endElementIdx += blockDimension; } @@ -26,16 +24,14 @@ struct setZero { struct setZeroBins { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, - Histo *__restrict__ hist) const { - + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::capacity())); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::capacity())); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Histo::capacity(); threadIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, Histo::capacity()); ++j) { - hist->bins[j] = 0; + hist->bins[j] = 0; } endElementIdx += blockDimension; } @@ -44,18 +40,14 @@ struct setZeroBins { struct count { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, - Histo *__restrict__ hist, - T* v, - uint32_t N) const { - + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist, T *v, uint32_t N) const { const uint32_t 
blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { - hist->count(acc, v[j]); + hist->count(acc, v[j]); } endElementIdx += blockDimension; } @@ -64,25 +56,22 @@ struct count { struct finalize { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, - Histo *__restrict__ hist) const { - auto&& ws = alpaka::block::shared::st::allocVar(acc); + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist) const { + auto &&ws = alpaka::block::shared::st::allocVar(acc); hist->finalize(acc, ws); } }; struct verify { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, - Histo *__restrict__ hist) const { - + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::nbins())); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::nbins())); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Histo::nbins(); threadIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, Histo::nbins()); ++j) { - assert(hist->off[j] <= hist->off[j + 1]); + assert(hist->off[j] <= hist->off[j + 1]); } endElementIdx += blockDimension; } 
@@ -91,18 +80,14 @@ struct verify { struct fill { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, - Histo *__restrict__ hist, - T* v, - uint32_t N) const { - + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist, T *v, uint32_t N) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { - hist->fill(acc, v[j], j); + hist->fill(acc, v[j], j); } endElementIdx += blockDimension; } @@ -111,22 +96,18 @@ struct fill { struct bin { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, - Histo *__restrict__ hist, - T* v, - uint32_t N) const { - + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist, T *v, uint32_t N) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size() - 1)); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size() - 1)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist->size() - 1; threadIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, hist->size() - 1); ++j) { - auto p = hist->begin() + j; - assert((*p) < N); - auto k1 = Histo::bin(v[*p]); - auto k2 = Histo::bin(v[*(p + 1)]); - assert(k2 
>= k1); + auto p = hist->begin() + j; + assert((*p) < N); + auto k1 = Histo::bin(v[*p]); + auto k2 = Histo::bin(v[*(p + 1)]); + assert(k2 >= k1); } endElementIdx += blockDimension; } @@ -135,54 +116,47 @@ struct bin { struct forEachInWindow { template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, - Histo *__restrict__ hist, - const T* v, - uint32_t N, - const int NBINS, - const int DELTA) const { - + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()( + const T_Acc &acc, Histo *__restrict__ hist, const T *v, uint32_t N, const int NBINS, const int DELTA) const { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size())); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size())); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist->size(); threadIdx += blockDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, hist->size()); ++i) { - auto p = hist->begin() + i; - auto j = *p; - auto b0 = Histo::bin(v[j]); - int tot = 0; - auto ftest = [&](unsigned int k) { - assert(k < N); - ++tot; - }; - cms::alpakatools::forEachInWindow(*hist, v[j], v[j], ftest); - int rtot = hist->size(b0); - assert(tot == rtot); - tot = 0; - auto vm = int(v[j]) - DELTA; - auto vp = int(v[j]) + DELTA; - const int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); - vm = std::max(vm, 0); - vm = std::min(vm, vmax); - vp = std::min(vp, vmax); - vp = std::max(vp, 0); - assert(vp >= vm); - cms::alpakatools::forEachInWindow(*hist, vm, vp, ftest); - int bp = Histo::bin(vp); - int bm = Histo::bin(vm); - rtot = hist->end(bp) - hist->begin(bm); - assert(tot == rtot); + auto p = hist->begin() + i; + auto j = *p; + auto b0 = Histo::bin(v[j]); + int tot = 0; + auto ftest = [&](unsigned int k) { + assert(k < N); + ++tot; + }; + cms::alpakatools::forEachInWindow(*hist, v[j], v[j], ftest); + int rtot = hist->size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + const int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + cms::alpakatools::forEachInWindow(*hist, vm, vp, ftest); + int bp = Histo::bin(vp); + int bm = Histo::bin(vm); + rtot = hist->end(bp) - hist->begin(bm); + assert(tot == rtot); } endElementIdx += blockDimension; } } }; - - template -void go(const DevHost& host, const DevAcc1& device, Queue& queue) { +void go(const DevHost &host, const DevAcc1 &device, Queue &queue) { std::mt19937 eng; int rmin = std::numeric_limits::min(); @@ -193,7 +167,7 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { } std::uniform_int_distribution rgen(rmin, rmax); - constexpr unsigned int N = 12000; + constexpr unsigned int N = 12000; using Hist = cms::alpakatools::HistoContainer; std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' @@ -204,7 +178,6 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { auto v = alpaka::mem::view::getPtrNative(v_hbuf); for (int it = 0; it < 5; ++it) { - for (long long j = 0; j < N; j++) { v[j] = rgen(eng); } @@ -217,7 +190,6 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { 
auto v_dbuf = alpaka::mem::buf::alloc(device, N); alpaka::mem::view::copy(queue, v_dbuf, v_hbuf, N); - printf("start kernel for %d data\n", N); using HistTeam = cms::alpakatools::HistoContainer; @@ -230,77 +202,56 @@ void go(const DevHost& host, const DevAcc1& device, Queue& queue) { const Vec1 blocksPerGrid(1u); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - setZero(), - alpaka::mem::view::getPtrNative(hist_dbuf) - )); + alpaka::queue::enqueue( + queue, alpaka::kernel::createTaskKernel(workDiv, setZero(), alpaka::mem::view::getPtrNative(hist_dbuf))); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - setZeroBins(), - alpaka::mem::view::getPtrNative(hist_dbuf) - )); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDiv, setZeroBins(), alpaka::mem::view::getPtrNative(hist_dbuf))); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - count(), - alpaka::mem::view::getPtrNative(hist_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - N - )); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel( + workDiv, count(), alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N)); alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); alpaka::wait::wait(queue); assert(0 == hist->size()); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - finalize(), - alpaka::mem::view::getPtrNative(hist_dbuf) - )); + alpaka::queue::enqueue( + queue, alpaka::kernel::createTaskKernel(workDiv, finalize(), alpaka::mem::view::getPtrNative(hist_dbuf))); alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); alpaka::wait::wait(queue); assert(N == hist->size()); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - verify(), - alpaka::mem::view::getPtrNative(hist_dbuf) - )); + 
alpaka::queue::enqueue( + queue, alpaka::kernel::createTaskKernel(workDiv, verify(), alpaka::mem::view::getPtrNative(hist_dbuf))); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - fill(), - alpaka::mem::view::getPtrNative(hist_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - N - )); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel( + workDiv, fill(), alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N)); alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); alpaka::wait::wait(queue); assert(0 == hist->off[0]); assert(N == hist->size()); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - bin(), - alpaka::mem::view::getPtrNative(hist_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - N - )); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel( + workDiv, bin(), alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N)); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - forEachInWindow(), - alpaka::mem::view::getPtrNative(hist_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - N, - NBINS, - DELTA - )); - + alpaka::kernel::createTaskKernel(workDiv, + forEachInWindow(), + alpaka::mem::view::getPtrNative(hist_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + N, + NBINS, + DELTA)); + alpaka::wait::wait(queue); } } diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index fe1c10f01..b8bfdf3c2 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -19,23 +19,27 @@ using TK = std::array; struct countMultiLocal { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, + TK const* __restrict__ tk, + Multiplicity* __restrict__ assoc, + uint32_t n) const 
{ const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { - auto&& local = alpaka::block::shared::st::allocVar(acc); - if (threadIdxLocal == 0) { - local.zero(); - } - alpaka::block::sync::syncBlockThreads(acc); - local.countDirect(acc, 2 + i % 4); - alpaka::block::sync::syncBlockThreads(acc); - if (threadIdxLocal == 0) { - assoc->add(acc, local); - } + auto&& local = alpaka::block::shared::st::allocVar(acc); + if (threadIdxLocal == 0) { + local.zero(); + } + alpaka::block::sync::syncBlockThreads(acc); + local.countDirect(acc, 2 + i % 4); + alpaka::block::sync::syncBlockThreads(acc); + if (threadIdxLocal == 0) { + assoc->add(acc, local); + } } endElementIdx += gridDimension; } @@ -44,13 +48,17 @@ struct countMultiLocal { struct countMulti { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, + TK const* __restrict__ tk, + Multiplicity* __restrict__ assoc, + uint32_t n) const { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = 
firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { - assoc->countDirect(acc, 2 + i % 4); + assoc->countDirect(acc, 2 + i % 4); } endElementIdx += gridDimension; } @@ -59,14 +67,16 @@ struct countMulti { struct verifyMulti { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const { const uint32_t maxNumberOfElements = Multiplicity::totbins(); const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; + threadIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { - assert(m1->off[i] == m2->off[i]); + assert(m1->off[i] == m2->off[i]); } endElementIdx += gridDimension; } @@ -75,22 +85,27 @@ struct verifyMulti { struct count { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, + TK const* __restrict__ tk, + Assoc* __restrict__ assoc, + uint32_t n) const { const uint32_t maxNumberOfElements = 4 * n; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = 
cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; + threadIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { - auto k = i / 4; - auto j = i - 4 * k; - assert(j < 4); - if (k >= n) { - return; - } - if (tk[k][j] < MaxElem) { - assoc->countDirect(acc, tk[k][j]); - } + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) { + return; + } + if (tk[k][j] < MaxElem) { + assoc->countDirect(acc, tk[k][j]); + } } endElementIdx += gridDimension; } @@ -99,22 +114,27 @@ struct count { struct fill { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, + TK const* __restrict__ tk, + Assoc* __restrict__ assoc, + uint32_t n) const { const uint32_t maxNumberOfElements = 4 * n; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; + threadIdx += gridDimension) { for (uint32_t i = 
threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { - auto k = i / 4; - auto j = i - 4 * k; - assert(j < 4); - if (k >= n) { - return; - } - if (tk[k][j] < MaxElem) { - assoc->fillDirect(acc, tk[k][j], k); - } + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) { + return; + } + if (tk[k][j] < MaxElem) { + assoc->fillDirect(acc, tk[k][j], k); + } } endElementIdx += gridDimension; } @@ -123,21 +143,26 @@ struct fill { struct verify { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, Assoc* __restrict__ assoc) const { - assert(assoc->size() < Assoc::capacity()); + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Assoc* __restrict__ assoc) const { + assert(assoc->size() < Assoc::capacity()); } }; struct fillBulk { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, cms::alpakatools::AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, + cms::alpakatools::AtomicPairCounter* apc, + TK const* __restrict__ tk, + Assoc* __restrict__ assoc, + uint32_t n) const { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); uint32_t endElementIdx = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { for (uint32_t k = threadIdx; k < std::min(endElementIdx, n); ++k) { - auto m = tk[k][3] < MaxElem ? 4 : 3; - assoc->bulkFill(acc, *apc, &tk[k][0], m); + auto m = tk[k][3] < MaxElem ? 
4 : 3; + assoc->bulkFill(acc, *apc, &tk[k][0], m); } endElementIdx += gridDimension; } @@ -146,7 +171,9 @@ struct fillBulk { struct verifyBulk { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, Assoc const* __restrict__ assoc, cms::alpakatools::AtomicPairCounter const* apc) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, + Assoc const* __restrict__ assoc, + cms::alpakatools::AtomicPairCounter const* apc) const { if (apc->get().m >= Assoc::nbins()) { printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); } @@ -204,48 +231,38 @@ int main() { } std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; - auto v_dbuf = alpaka::mem::buf::alloc, Idx>(device, N); alpaka::mem::view::copy(queue, v_dbuf, tr_hbuf, N); auto a_dbuf = alpaka::mem::buf::alloc(device, 1u); alpaka::mem::view::set(queue, a_dbuf, 0, 1u); - + const unsigned int nThreads = 256; const Vec1 threadsPerBlockOrElementsPerThread(nThreads); const unsigned int nBlocks4N = (4 * N + nThreads - 1) / nThreads; const Vec1 blocksPerGrid4N(nBlocks4N); - const WorkDiv1 &workDiv4N = cms::alpakatools::make_workdiv(blocksPerGrid4N, threadsPerBlockOrElementsPerThread); + const WorkDiv1& workDiv4N = cms::alpakatools::make_workdiv(blocksPerGrid4N, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv4N, - cms::alpakatools::launchZero(), - alpaka::mem::view::getPtrNative(a_dbuf) - )); + alpaka::kernel::createTaskKernel( + workDiv4N, cms::alpakatools::launchZero(), alpaka::mem::view::getPtrNative(a_dbuf))); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv4N, - count(), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(a_dbuf), - N - )); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel( + workDiv4N, count(), alpaka::mem::view::getPtrNative(v_dbuf), alpaka::mem::view::getPtrNative(a_dbuf), N)); 
cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(a_dbuf), device, queue); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, - verify(), - alpaka::mem::view::getPtrNative(a_dbuf) - )); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel( + WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, verify(), alpaka::mem::view::getPtrNative(a_dbuf))); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv4N, - fill(), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(a_dbuf), - N - )); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel( + workDiv4N, fill(), alpaka::mem::view::getPtrNative(v_dbuf), alpaka::mem::view::getPtrNative(a_dbuf), N)); auto la_hbuf = alpaka::mem::buf::alloc(host, 1u); alpaka::mem::view::copy(queue, la_hbuf, a_dbuf, 1u); @@ -274,31 +291,28 @@ int main() { alpaka::mem::view::set(queue, dc_dbuf, 0, 1u); const unsigned int nBlocks = (N + nThreads - 1) / nThreads; - const Vec1 blocksPerGrid(nBlocks); - const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + const Vec1 blocksPerGrid(nBlocks); + const WorkDiv1& workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - fillBulk(), - alpaka::mem::view::getPtrNative(dc_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(a_dbuf), - N - )); + alpaka::kernel::createTaskKernel(workDiv, + fillBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(a_dbuf), + N)); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - cms::alpakatools::finalizeBulk(), - alpaka::mem::view::getPtrNative(dc_dbuf), - alpaka::mem::view::getPtrNative(a_dbuf) - )); + 
alpaka::kernel::createTaskKernel(workDiv, + cms::alpakatools::finalizeBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(a_dbuf))); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, - verifyBulk(), - alpaka::mem::view::getPtrNative(a_dbuf), - alpaka::mem::view::getPtrNative(dc_dbuf) - )); + alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, + verifyBulk(), + alpaka::mem::view::getPtrNative(a_dbuf), + alpaka::mem::view::getPtrNative(dc_dbuf))); alpaka::mem::view::copy(queue, la_hbuf, a_dbuf, 1u); @@ -307,33 +321,29 @@ int main() { alpaka::wait::wait(queue); auto dc = alpaka::mem::view::getPtrNative(dc_hbuf); - alpaka::mem::view::set(queue, dc_dbuf, 0, 1u); auto sa_dbuf = alpaka::mem::buf::alloc(device, 1u); alpaka::mem::view::set(queue, sa_dbuf, 0, 1u); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - fillBulk(), - alpaka::mem::view::getPtrNative(dc_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(sa_dbuf), - N - )); + alpaka::kernel::createTaskKernel(workDiv, + fillBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(sa_dbuf), + N)); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - cms::alpakatools::finalizeBulk(), - alpaka::mem::view::getPtrNative(dc_dbuf), - alpaka::mem::view::getPtrNative(sa_dbuf) - )); + alpaka::kernel::createTaskKernel(workDiv, + cms::alpakatools::finalizeBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(sa_dbuf))); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, - verifyBulk(), - alpaka::mem::view::getPtrNative(sa_dbuf), - alpaka::mem::view::getPtrNative(dc_dbuf) - )); + alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), 
Vec1::all(1u), Vec1::all(1u)}, + verifyBulk(), + alpaka::mem::view::getPtrNative(sa_dbuf), + alpaka::mem::view::getPtrNative(dc_dbuf))); std::cout << "final counter value " << dc->get().n << ' ' << dc->get().m << std::endl; @@ -341,14 +351,14 @@ int main() { imax = 0; ave = 0; for (auto i = 0U; i < N; ++i) { - auto x = la->size(i); - if (!(x == 4 || x == 3)) { - std::cout << i << ' ' << x << std::endl; -} - assert(x == 4 || x == 3); - ave += x; - imax = std::max(imax, int(x)); -} + auto x = la->size(i); + if (!(x == 4 || x == 3)) { + std::cout << i << ' ' << x << std::endl; + } + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } assert(0 == la->size(N)); std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; @@ -359,55 +369,48 @@ int main() { alpaka::mem::view::set(queue, m2_dbuf, 0, 1u); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv4N, - cms::alpakatools::launchZero(), - alpaka::mem::view::getPtrNative(m1_dbuf) - )); + alpaka::kernel::createTaskKernel( + workDiv4N, cms::alpakatools::launchZero(), alpaka::mem::view::getPtrNative(m1_dbuf))); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv4N, - cms::alpakatools::launchZero(), - alpaka::mem::view::getPtrNative(m2_dbuf) - )); + alpaka::kernel::createTaskKernel( + workDiv4N, cms::alpakatools::launchZero(), alpaka::mem::view::getPtrNative(m2_dbuf))); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv4N, - countMulti(), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(m1_dbuf), - N - )); + alpaka::kernel::createTaskKernel(workDiv4N, + countMulti(), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(m1_dbuf), + N)); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv4N, - countMultiLocal(), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(m2_dbuf), - N - )); + 
alpaka::kernel::createTaskKernel(workDiv4N, + countMultiLocal(), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(m2_dbuf), + N)); const Vec1 blocksPerGridTotBins(1u); const Vec1 threadsPerBlockOrElementsPerThreadTotBins(Multiplicity::totbins()); - const WorkDiv1 &workDivTotBins = cms::alpakatools::make_workdiv(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins); + const WorkDiv1& workDivTotBins = + cms::alpakatools::make_workdiv(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivTotBins, - verifyMulti(), - alpaka::mem::view::getPtrNative(m1_dbuf), - alpaka::mem::view::getPtrNative(m2_dbuf) - )); + alpaka::kernel::createTaskKernel(workDivTotBins, + verifyMulti(), + alpaka::mem::view::getPtrNative(m1_dbuf), + alpaka::mem::view::getPtrNative(m2_dbuf))); cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m1_dbuf), device, queue); cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m2_dbuf), device, queue); - + alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivTotBins, - verifyMulti(), - alpaka::mem::view::getPtrNative(m1_dbuf), - alpaka::mem::view::getPtrNative(m2_dbuf) - )); + alpaka::kernel::createTaskKernel(workDivTotBins, + verifyMulti(), + alpaka::mem::view::getPtrNative(m1_dbuf), + alpaka::mem::view::getPtrNative(m2_dbuf))); alpaka::wait::wait(queue); - + return 0; } From b6e9adcd3a7a5e3c7ea2d78a63c35dbeef8f4f5d Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Thu, 28 Jan 2021 15:36:57 +0100 Subject: [PATCH 20/32] Remove alpaka::wait::wait(queue); before host function end of scope. 
All workdiv / function object / arguments info are copied anyway, and the owning pointer to the histogram is defined outside the function scope --- src/alpaka/AlpakaCore/HistoContainer.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 670e7a82b..d85b9dc7e 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -122,7 +122,6 @@ namespace cms { queue, alpaka::kernel::createTaskKernel( workDivWith1Block, multiBlockPrefixScanSecondStep(), poff, poff, psum_d, num_items, nblocks)); - alpaka::wait::wait(queue); } template @@ -148,7 +147,6 @@ namespace cms { alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, fillFromVector(), h, nh, v, offsets)); - alpaka::wait::wait(queue); } struct finalizeBulk { From 94105db5c744ef3d981408b240f94970cdaf7594 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Thu, 28 Jan 2021 16:36:51 +0100 Subject: [PATCH 21/32] Remove using namespace ALPAKA_ACCELERATOR_NAMESPACE, and directly prepend ALPAKA_ACCELERATOR_NAMESPACE when needed. Could also place entire callers within ALPAKA_ACCELERATOR_NAMESPACE namespace. 
--- src/alpaka/AlpakaCore/HistoContainer.h | 28 ++-- src/alpaka/test/alpaka/HistoContainer_t.cc | 11 +- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 53 ++++--- src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 135 ++++++++++-------- 4 files changed, 124 insertions(+), 103 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index d85b9dc7e..b0bed60b5 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -12,8 +12,6 @@ #include "AlpakaCore/alpakastdAlgorithm.h" #include "AlpakaCore/prefixScan.h" -using namespace ALPAKA_ACCELERATOR_NAMESPACE; - namespace cms { namespace alpakatools { @@ -91,9 +89,10 @@ namespace cms { }; template - ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, - const DevAcc1 &device, - Queue &queue) { + ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize( + Histo *__restrict__ h, + const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device, + ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); const int num_items = Histo::totbins(); @@ -108,19 +107,19 @@ namespace cms { alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, storePrefixScanWorkingSpace(), h, nblocks)); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDiv, multiBlockPrefixScanFirstStep(), poff, poff, psum_d, num_items)); const WorkDiv1 &workDivWith1Block = cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDivWith1Block, 
multiBlockPrefixScanSecondStep(), poff, poff, psum_d, num_items, nblocks)); } @@ -132,21 +131,24 @@ namespace cms { uint32_t const *__restrict__ offsets, uint32_t totSize, unsigned int nthreads, - const DevAcc1 &device, - Queue &queue) { + const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device, + ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { const unsigned int nblocks = (totSize + nthreads - 1) / nthreads; const Vec1 blocksPerGrid(nblocks); const Vec1 threadsPerBlockOrElementsPerThread(nthreads); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, launchZero(), h)); + alpaka::queue::enqueue( + queue, alpaka::kernel::createTaskKernel(workDiv, launchZero(), h)); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, countFromVector(), h, nh, v, offsets)); + alpaka::kernel::createTaskKernel( + workDiv, countFromVector(), h, nh, v, offsets)); launchFinalize(h, device, queue); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, fillFromVector(), h, nh, v, offsets)); + alpaka::kernel::createTaskKernel( + workDiv, fillFromVector(), h, nh, v, offsets)); } struct finalizeBulk { diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index 237642829..64fc5e3cf 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -8,10 +8,10 @@ #include "AlpakaCore/alpakaWorkDivHelper.h" #include "AlpakaCore/HistoContainer.h" -using namespace ALPAKA_ACCELERATOR_NAMESPACE; - template -void go(const DevHost& host, const DevAcc1& device, Queue& queue) { +void go(const DevHost& host, + const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1& device, + ALPAKA_ACCELERATOR_NAMESPACE::Queue& queue) { std::mt19937 eng; std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); @@ -162,8 +162,9 @@ void go(const 
DevHost& host, const DevAcc1& device, Queue& queue) { int main() { const DevHost host(alpaka::pltf::getDevByIdx(0u)); - const DevAcc1 device(alpaka::pltf::getDevByIdx(0u)); - Queue queue(device); + const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 device( + alpaka::pltf::getDevByIdx(0u)); + ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); go(host, device, queue); go(host, device, queue); diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index 349c3b665..aa75b7a37 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -156,7 +156,9 @@ struct forEachInWindow { }; template -void go(const DevHost &host, const DevAcc1 &device, Queue &queue) { +void go(const DevHost &host, + const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device, + ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { std::mt19937 eng; int rmin = std::numeric_limits::min(); @@ -202,35 +204,38 @@ void go(const DevHost &host, const DevAcc1 &device, Queue &queue) { const Vec1 blocksPerGrid(1u); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::queue::enqueue( - queue, alpaka::kernel::createTaskKernel(workDiv, setZero(), alpaka::mem::view::getPtrNative(hist_dbuf))); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDiv, setZero(), alpaka::mem::view::getPtrNative(hist_dbuf))); - alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel(workDiv, setZeroBins(), alpaka::mem::view::getPtrNative(hist_dbuf))); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDiv, setZeroBins(), alpaka::mem::view::getPtrNative(hist_dbuf))); alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDiv, count(), alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N)); alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); 
alpaka::wait::wait(queue); assert(0 == hist->size()); - alpaka::queue::enqueue( - queue, alpaka::kernel::createTaskKernel(workDiv, finalize(), alpaka::mem::view::getPtrNative(hist_dbuf))); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDiv, finalize(), alpaka::mem::view::getPtrNative(hist_dbuf))); alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); alpaka::wait::wait(queue); assert(N == hist->size()); - alpaka::queue::enqueue( - queue, alpaka::kernel::createTaskKernel(workDiv, verify(), alpaka::mem::view::getPtrNative(hist_dbuf))); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDiv, verify(), alpaka::mem::view::getPtrNative(hist_dbuf))); alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDiv, fill(), alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N)); alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); @@ -240,17 +245,18 @@ void go(const DevHost &host, const DevAcc1 &device, Queue &queue) { alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDiv, bin(), alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N)); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - forEachInWindow(), - alpaka::mem::view::getPtrNative(hist_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - N, - NBINS, - DELTA)); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDiv, + forEachInWindow(), + alpaka::mem::view::getPtrNative(hist_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + N, + NBINS, + DELTA)); alpaka::wait::wait(queue); } @@ -258,8 +264,9 @@ void go(const DevHost &host, const DevAcc1 &device, Queue &queue) { int main() { const DevHost host(alpaka::pltf::getDevByIdx(0u)); - const DevAcc1 device(alpaka::pltf::getDevByIdx(0u)); - Queue queue(device); + const 
ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 device( + alpaka::pltf::getDevByIdx(0u)); + ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); go(host, device, queue); go(host, device, queue); diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index b8bfdf3c2..a80f09bee 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -183,8 +183,9 @@ struct verifyBulk { int main() { const DevHost host(alpaka::pltf::getDevByIdx(0u)); - const DevAcc1 device(alpaka::pltf::getDevByIdx(0u)); - Queue queue(device); + const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 device( + alpaka::pltf::getDevByIdx(0u)); + ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl; std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' ' @@ -244,24 +245,24 @@ int main() { const WorkDiv1& workDiv4N = cms::alpakatools::make_workdiv(blocksPerGrid4N, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDiv4N, cms::alpakatools::launchZero(), alpaka::mem::view::getPtrNative(a_dbuf))); alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDiv4N, count(), alpaka::mem::view::getPtrNative(v_dbuf), alpaka::mem::view::getPtrNative(a_dbuf), N)); cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(a_dbuf), device, queue); alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, verify(), alpaka::mem::view::getPtrNative(a_dbuf))); alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDiv4N, fill(), alpaka::mem::view::getPtrNative(v_dbuf), 
alpaka::mem::view::getPtrNative(a_dbuf), N)); auto la_hbuf = alpaka::mem::buf::alloc(host, 1u); @@ -294,25 +295,28 @@ int main() { const Vec1 blocksPerGrid(nBlocks); const WorkDiv1& workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - fillBulk(), - alpaka::mem::view::getPtrNative(dc_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(a_dbuf), - N)); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDiv, + fillBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(a_dbuf), + N)); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - cms::alpakatools::finalizeBulk(), - alpaka::mem::view::getPtrNative(dc_dbuf), - alpaka::mem::view::getPtrNative(a_dbuf))); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDiv, + cms::alpakatools::finalizeBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(a_dbuf))); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, - verifyBulk(), - alpaka::mem::view::getPtrNative(a_dbuf), - alpaka::mem::view::getPtrNative(dc_dbuf))); + alpaka::kernel::createTaskKernel( + WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, + verifyBulk(), + alpaka::mem::view::getPtrNative(a_dbuf), + alpaka::mem::view::getPtrNative(dc_dbuf))); alpaka::mem::view::copy(queue, la_hbuf, a_dbuf, 1u); @@ -325,25 +329,28 @@ int main() { auto sa_dbuf = alpaka::mem::buf::alloc(device, 1u); alpaka::mem::view::set(queue, sa_dbuf, 0, 1u); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - fillBulk(), - alpaka::mem::view::getPtrNative(dc_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(sa_dbuf), - N)); + 
alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDiv, + fillBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(sa_dbuf), + N)); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - cms::alpakatools::finalizeBulk(), - alpaka::mem::view::getPtrNative(dc_dbuf), - alpaka::mem::view::getPtrNative(sa_dbuf))); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDiv, + cms::alpakatools::finalizeBulk(), + alpaka::mem::view::getPtrNative(dc_dbuf), + alpaka::mem::view::getPtrNative(sa_dbuf))); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, - verifyBulk(), - alpaka::mem::view::getPtrNative(sa_dbuf), - alpaka::mem::view::getPtrNative(dc_dbuf))); + alpaka::kernel::createTaskKernel( + WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, + verifyBulk(), + alpaka::mem::view::getPtrNative(sa_dbuf), + alpaka::mem::view::getPtrNative(dc_dbuf))); std::cout << "final counter value " << dc->get().n << ' ' << dc->get().m << std::endl; @@ -369,46 +376,50 @@ int main() { alpaka::mem::view::set(queue, m2_dbuf, 0, 1u); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDiv4N, cms::alpakatools::launchZero(), alpaka::mem::view::getPtrNative(m1_dbuf))); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( + alpaka::kernel::createTaskKernel( workDiv4N, cms::alpakatools::launchZero(), alpaka::mem::view::getPtrNative(m2_dbuf))); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv4N, - countMulti(), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(m1_dbuf), - N)); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDiv4N, + countMulti(), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(m1_dbuf), + N)); - 
alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv4N, - countMultiLocal(), - alpaka::mem::view::getPtrNative(v_dbuf), - alpaka::mem::view::getPtrNative(m2_dbuf), - N)); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDiv4N, + countMultiLocal(), + alpaka::mem::view::getPtrNative(v_dbuf), + alpaka::mem::view::getPtrNative(m2_dbuf), + N)); const Vec1 blocksPerGridTotBins(1u); const Vec1 threadsPerBlockOrElementsPerThreadTotBins(Multiplicity::totbins()); const WorkDiv1& workDivTotBins = cms::alpakatools::make_workdiv(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivTotBins, - verifyMulti(), - alpaka::mem::view::getPtrNative(m1_dbuf), - alpaka::mem::view::getPtrNative(m2_dbuf))); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDivTotBins, + verifyMulti(), + alpaka::mem::view::getPtrNative(m1_dbuf), + alpaka::mem::view::getPtrNative(m2_dbuf))); cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m1_dbuf), device, queue); cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m2_dbuf), device, queue); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivTotBins, - verifyMulti(), - alpaka::mem::view::getPtrNative(m1_dbuf), - alpaka::mem::view::getPtrNative(m2_dbuf))); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDivTotBins, + verifyMulti(), + alpaka::mem::view::getPtrNative(m1_dbuf), + alpaka::mem::view::getPtrNative(m2_dbuf))); alpaka::wait::wait(queue); From c08d2052a4b4fc2a6b31e15f79ee62ba48aaada5 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Fri, 29 Jan 2021 12:33:23 +0100 Subject: [PATCH 22/32] OneHistoContainer: Add 1-kernel version --- src/alpaka/AlpakaCore/alpakaWorkDivHelper.h | 29 ++ src/alpaka/test/alpaka/OneHistoContainer_t.cc | 326 +++++++----------- 2 files changed, 159 insertions(+), 196 deletions(-) diff --git 
a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index 4dc36caa0..fa90d91bf 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -56,6 +56,35 @@ namespace cms { return {firstElementIdxGlobalVec, endElementIdxGlobalVec}; } + + + /* + * Computes the range of the element(s) global index(es) in grid. + */ + template + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_uncut(const T_Acc& acc, const Vec& maxNumberOfElements) { + Vec firstElementIdxGlobalVec = Vec::zeros(); + Vec endElementIdxGlobalVec = Vec::zeros(); + + for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { + // Global thread index in grid (along dimension dimIndex). + const uint32_t threadIdxGlobal(alpaka::idx::getIdx(acc)[dimIndex]); + const uint32_t threadDimension(alpaka::workdiv::getWorkDiv(acc)[dimIndex]); + + // Global element index in grid (along dimension dimIndex). + // Obviously relevant for CPU only. + // For GPU, threadDimension = 1, and firstElementIdxGlobal = endElementIdxGlobal = threadIndexGlobal. 
+ const uint32_t firstElementIdxGlobal = threadIdxGlobal * threadDimension; + const uint32_t endElementIdxGlobal = firstElementIdxGlobal + threadDimension; + + firstElementIdxGlobalVec[dimIndex] = firstElementIdxGlobal; + endElementIdxGlobalVec[dimIndex] = endElementIdxGlobal; + } + + return {firstElementIdxGlobalVec, endElementIdxGlobalVec}; + } + + } // namespace alpakatools } // namespace cms diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index aa75b7a37..683839d7e 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -6,159 +6,150 @@ #include "AlpakaCore/HistoContainer.h" -struct setZero { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist) const { +template +struct mykernel { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, T const* __restrict__ v, uint32_t N) const { + assert(v); + assert(N == 12000); + + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); + if (threadIdxLocal == 0) { + printf("start kernel for %d data\n", N); + } + + using Hist = cms::alpakatools::HistoContainer; + + auto&& hist = alpaka::block::shared::st::allocVar(acc); + auto&& ws = alpaka::block::shared::st::allocVar(acc); + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Histo::totbins(); threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, Histo::totbins()); ++j) { - hist->off[j] = 0; + + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range_uncut(acc, Vec1::all(Hist::totbins())); + + // set off zero + uint32_t 
endElementIdx0 = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Hist::totbins(); threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx0, Hist::totbins()); ++j) { + hist.off[j] = 0; } - endElementIdx += blockDimension; + endElementIdx0 += blockDimension; } - } -}; + alpaka::block::sync::syncBlockThreads(acc); -struct setZeroBins { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist) const { - const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::capacity())); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Histo::capacity(); threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, Histo::capacity()); ++j) { - hist->bins[j] = 0; + // set bins zero + uint32_t endElementIdx1 = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Hist::capacity(); threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx1, Hist::totbins()); ++j) { + hist.bins[j] = 0; } - endElementIdx += blockDimension; + endElementIdx1 += blockDimension; } - } -}; + alpaka::block::sync::syncBlockThreads(acc); -struct count { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist, T *v, uint32_t N) const { - const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; + // count + uint32_t endElementIdx2 = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < N; 
threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { - hist->count(acc, v[j]); + for (uint32_t j = threadIdx; j < std::min(endElementIdx2, N); ++j) { + hist.count(acc, v[j]); } - endElementIdx += blockDimension; + endElementIdx2 += blockDimension; } - } -}; + alpaka::block::sync::syncBlockThreads(acc); -struct finalize { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist) const { - auto &&ws = alpaka::block::shared::st::allocVar(acc); - hist->finalize(acc, ws); - } -}; + assert(0 == hist.size()); + alpaka::block::sync::syncBlockThreads(acc); -struct verify { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist) const { - const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::nbins())); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Histo::nbins(); threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, Histo::nbins()); ++j) { - assert(hist->off[j] <= hist->off[j + 1]); + // finalize + hist.finalize(acc, ws); + alpaka::block::sync::syncBlockThreads(acc); + + if (threadIdxLocal == 0) { + printf("hist.size() = %u.\n", hist.size()); + } + assert(N == hist.size()); + + // verify + uint32_t endElementIdx8 = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Hist::nbins(); threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx8, Hist::nbins()); ++j) { + assert(hist.off[j] <= hist.off[j + 1]); } - endElementIdx += blockDimension; + endElementIdx8 += blockDimension; } - } -}; + alpaka::block::sync::syncBlockThreads(acc); -struct fill { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void 
operator()(const T_Acc &acc, Histo *__restrict__ hist, T *v, uint32_t N) const { - const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(N)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; + if (threadIdxLocal < 32) { + ws[threadIdxLocal] = 0; // used by prefix scan... + } + alpaka::block::sync::syncBlockThreads(acc); + + // fill + uint32_t endElementIdx3 = endElementIdxNoStride[0u]; for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { - hist->fill(acc, v[j], j); + for (uint32_t j = threadIdx; j < std::min(endElementIdx3, N); ++j) { + hist.fill(acc, v[j], j); } - endElementIdx += blockDimension; + endElementIdx3 += blockDimension; } - } -}; + alpaka::block::sync::syncBlockThreads(acc); -struct bin { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const T_Acc &acc, Histo *__restrict__ hist, T *v, uint32_t N) const { - const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size() - 1)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist->size() - 1; threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, hist->size() - 1); ++j) { - auto p = hist->begin() + j; + assert(0 == hist.off[0]); + assert(N == hist.size()); + + // bin + uint32_t endElementIdx4 = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist.size() - 1; threadIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx4, hist.size() - 1); ++j) { + auto p = hist.begin() + j; assert((*p) < N); - auto k1 = 
Histo::bin(v[*p]); - auto k2 = Histo::bin(v[*(p + 1)]); + auto k1 = Hist::bin(v[*p]); + auto k2 = Hist::bin(v[*(p + 1)]); assert(k2 >= k1); } - endElementIdx += blockDimension; + endElementIdx4 += blockDimension; } - } -}; -struct forEachInWindow { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()( - const T_Acc &acc, Histo *__restrict__ hist, const T *v, uint32_t N, const int NBINS, const int DELTA) const { - const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(hist->size())); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist->size(); threadIdx += blockDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, hist->size()); ++i) { - auto p = hist->begin() + i; - auto j = *p; - auto b0 = Histo::bin(v[j]); - int tot = 0; - auto ftest = [&](unsigned int k) { - assert(k < N); - ++tot; - }; - cms::alpakatools::forEachInWindow(*hist, v[j], v[j], ftest); - int rtot = hist->size(b0); - assert(tot == rtot); - tot = 0; - auto vm = int(v[j]) - DELTA; - auto vp = int(v[j]) + DELTA; - const int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); - vm = std::max(vm, 0); - vm = std::min(vm, vmax); - vp = std::min(vp, vmax); - vp = std::max(vp, 0); - assert(vp >= vm); - cms::alpakatools::forEachInWindow(*hist, vm, vp, ftest); - int bp = Histo::bin(vp); - int bm = Histo::bin(vm); - rtot = hist->end(bp) - hist->begin(bm); + // forEachInWindow + uint32_t endElementIdx5 = endElementIdxNoStride[0u]; + for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist.size(); threadIdx += blockDimension) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx5, hist.size()); ++i) { + auto p = hist.begin() + i; + auto j = *p; + auto b0 = Hist::bin(v[j]); + int tot = 0; + auto ftest = [&](unsigned int k) { + assert(k >= 0 && k < N); + ++tot; + }; + cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); + int rtot = hist.size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + cms::alpakatools::forEachInWindow(hist, vm, vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); assert(tot == rtot); } - endElementIdx += blockDimension; + endElementIdx5 += blockDimension; } + } }; + template -void go(const DevHost &host, - const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device, - ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { +void go(const DevHost& host, const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1& device, ALPAKA_ACCELERATOR_NAMESPACE::Queue& queue) { std::mt19937 eng; int rmin = std::numeric_limits::min(); @@ -169,7 +160,7 @@ void go(const DevHost &host, } std::uniform_int_distribution rgen(rmin, rmax); - constexpr unsigned int N = 12000; + constexpr unsigned int N = 12000; using Hist = cms::alpakatools::HistoContainer; std::cout << "HistoContainer " << Hist::nbits() << 
' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' @@ -178,94 +169,37 @@ void go(const DevHost &host, auto v_hbuf = alpaka::mem::buf::alloc(host, N); auto v = alpaka::mem::view::getPtrNative(v_hbuf); + auto v_dbuf = alpaka::mem::buf::alloc(device, N); for (int it = 0; it < 5; ++it) { - for (long long j = 0; j < N; j++) { + for (long long j = 0; j < N; j++) v[j] = rgen(eng); - } - if (it == 2) { - for (long long j = N / 2; j < N / 2 + N / 4; j++) { + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) v[j] = 4; - } - } - - auto v_dbuf = alpaka::mem::buf::alloc(device, N); - alpaka::mem::view::copy(queue, v_dbuf, v_hbuf, N); - printf("start kernel for %d data\n", N); - using HistTeam = cms::alpakatools::HistoContainer; - auto hist_hbuf = alpaka::mem::buf::alloc(host, 1u); - auto hist = alpaka::mem::view::getPtrNative(hist_hbuf); - auto hist_dbuf = alpaka::mem::buf::alloc(device, 1u); - alpaka::mem::view::set(queue, hist_dbuf, 0, 1u); + alpaka::mem::view::copy(queue, v_dbuf, v_hbuf, N); - const Vec1 threadsPerBlockOrElementsPerThread(256u); - const Vec1 blocksPerGrid(1u); + const Vec1& threadsPerBlockOrElementsPerThread(Vec1::all(256)); + const Vec1& blocksPerGrid(Vec1::all(1)); const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( - workDiv, setZero(), alpaka::mem::view::getPtrNative(hist_dbuf))); - - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( - workDiv, setZeroBins(), alpaka::mem::view::getPtrNative(hist_dbuf))); - - alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel( - workDiv, count(), alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N)); - - alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); - alpaka::wait::wait(queue); - assert(0 == hist->size()); - - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( - workDiv, finalize(), 
alpaka::mem::view::getPtrNative(hist_dbuf))); - - alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); - alpaka::wait::wait(queue); - assert(N == hist->size()); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( - workDiv, verify(), alpaka::mem::view::getPtrNative(hist_dbuf))); - - alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel( - workDiv, fill(), alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N)); - - alpaka::mem::view::copy(queue, hist_hbuf, hist_dbuf, 1u); - alpaka::wait::wait(queue); - assert(0 == hist->off[0]); - assert(N == hist->size()); - - alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel( - workDiv, bin(), alpaka::mem::view::getPtrNative(hist_dbuf), alpaka::mem::view::getPtrNative(v_dbuf), N)); - - alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel(workDiv, - forEachInWindow(), - alpaka::mem::view::getPtrNative(hist_dbuf), - alpaka::mem::view::getPtrNative(v_dbuf), - N, - NBINS, - DELTA)); - - alpaka::wait::wait(queue); + alpaka::kernel::createTaskKernel(workDiv, + mykernel(), + alpaka::mem::view::getPtrNative(v_dbuf), + N + )); + } + alpaka::wait::wait(queue); } + int main() { const DevHost host(alpaka::pltf::getDevByIdx(0u)); const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 device( - alpaka::pltf::getDevByIdx(0u)); + alpaka::pltf::getDevByIdx(0u)); ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); go(host, device, queue); From 88545ea3c24d3a443a19fe418116deda241de3d3 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Fri, 29 Jan 2021 13:26:53 +0100 Subject: [PATCH 23/32] Add cms::alpakatools::element_global_index_range_uncut to avoid having to call element_global_index_range for each possible max number of elements. 
--- src/alpaka/AlpakaCore/alpakaWorkDivHelper.h | 41 ++++++++----------- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 3 +- 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index fa90d91bf..f1701e6ae 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -8,6 +8,7 @@ using namespace alpaka_common; namespace cms { namespace alpakatools { + /* * Creates the accelerator-dependent workdiv. */ @@ -28,14 +29,16 @@ namespace cms { #endif } + /* * Computes the range of the element(s) global index(es) in grid. + * Warning: the max index is not truncated by any max number of elements. */ - template - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc, - const Vec& maxNumberOfElements) { + template > + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_uncut(const T_Acc& acc) { + Vec firstElementIdxGlobalVec = Vec::zeros(); - Vec endElementIdxGlobalVec = Vec::zeros(); + Vec endElementIdxUncutGlobalVec = Vec::zeros(); for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { // Global thread index in grid (along dimension dimIndex). @@ -46,39 +49,29 @@ namespace cms { // Obviously relevant for CPU only. // For GPU, threadDimension = 1, and firstElementIdxGlobal = endElementIdxGlobal = threadIndexGlobal. 
const uint32_t firstElementIdxGlobal = threadIdxGlobal * threadDimension; - const uint32_t endElementIdxGlobalUncut = firstElementIdxGlobal + threadDimension; - const uint32_t endElementIdxGlobal = std::min(endElementIdxGlobalUncut, maxNumberOfElements[dimIndex]); + const uint32_t endElementIdxUncutGlobal = firstElementIdxGlobal + threadDimension; firstElementIdxGlobalVec[dimIndex] = firstElementIdxGlobal; - endElementIdxGlobalVec[dimIndex] = endElementIdxGlobal; + endElementIdxUncutGlobalVec[dimIndex] = endElementIdxUncutGlobal; } - return {firstElementIdxGlobalVec, endElementIdxGlobalVec}; + return {firstElementIdxGlobalVec, endElementIdxUncutGlobalVec}; } - /* * Computes the range of the element(s) global index(es) in grid. */ template - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_uncut(const T_Acc& acc, const Vec& maxNumberOfElements) { - Vec firstElementIdxGlobalVec = Vec::zeros(); - Vec endElementIdxGlobalVec = Vec::zeros(); + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc, + const Vec& maxNumberOfElements) { + + static_assert(alpaka::dim::Dim::value == T_Dim::value, + "Accelerator and maxNumberOfElements need to have same dimension."); + auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_global_index_range_uncut(acc); for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { - // Global thread index in grid (along dimension dimIndex). - const uint32_t threadIdxGlobal(alpaka::idx::getIdx(acc)[dimIndex]); - const uint32_t threadDimension(alpaka::workdiv::getWorkDiv(acc)[dimIndex]); - - // Global element index in grid (along dimension dimIndex). - // Obviously relevant for CPU only. - // For GPU, threadDimension = 1, and firstElementIdxGlobal = endElementIdxGlobal = threadIndexGlobal. 
- const uint32_t firstElementIdxGlobal = threadIdxGlobal * threadDimension; - const uint32_t endElementIdxGlobal = firstElementIdxGlobal + threadDimension; - - firstElementIdxGlobalVec[dimIndex] = firstElementIdxGlobal; - endElementIdxGlobalVec[dimIndex] = endElementIdxGlobal; + endElementIdxGlobalVec[dimIndex] = std::min(endElementIdxGlobalVec[dimIndex], maxNumberOfElements[dimIndex]); } return {firstElementIdxGlobalVec, endElementIdxGlobalVec}; diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index 683839d7e..17e254a10 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -24,9 +24,8 @@ struct mykernel { auto&& ws = alpaka::block::shared::st::allocVar(acc); const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_uncut(acc, Vec1::all(Hist::totbins())); + cms::alpakatools::element_global_index_range_uncut(acc); // set off zero uint32_t endElementIdx0 = endElementIdxNoStride[0u]; From ba641ee048e4fae153d70dba3fcd98516cb2f871 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Fri, 29 Jan 2021 13:37:22 +0100 Subject: [PATCH 24/32] Include endElementIdx within loop. NB: Will need to add a dedicated helper function in cms::alpakatools, but this is already nicer. 
--- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 44 +++++++------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index 17e254a10..77770cc93 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -28,32 +28,26 @@ struct mykernel { cms::alpakatools::element_global_index_range_uncut(acc); // set off zero - uint32_t endElementIdx0 = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Hist::totbins(); threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx0, Hist::totbins()); ++j) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < Hist::totbins(); threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, Hist::totbins()); ++j) { hist.off[j] = 0; } - endElementIdx0 += blockDimension; } alpaka::block::sync::syncBlockThreads(acc); // set bins zero - uint32_t endElementIdx1 = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Hist::capacity(); threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx1, Hist::totbins()); ++j) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < Hist::capacity(); threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, Hist::totbins()); ++j) { hist.bins[j] = 0; } - endElementIdx1 += blockDimension; } alpaka::block::sync::syncBlockThreads(acc); // count - uint32_t endElementIdx2 = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx2, 
N); ++j) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { hist.count(acc, v[j]); } - endElementIdx2 += blockDimension; } alpaka::block::sync::syncBlockThreads(acc); @@ -70,12 +64,10 @@ struct mykernel { assert(N == hist.size()); // verify - uint32_t endElementIdx8 = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < Hist::nbins(); threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx8, Hist::nbins()); ++j) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < Hist::nbins(); threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, Hist::nbins()); ++j) { assert(hist.off[j] <= hist.off[j + 1]); } - endElementIdx8 += blockDimension; } alpaka::block::sync::syncBlockThreads(acc); @@ -85,12 +77,10 @@ struct mykernel { alpaka::block::sync::syncBlockThreads(acc); // fill - uint32_t endElementIdx3 = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx3, N); ++j) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { hist.fill(acc, v[j], j); } - endElementIdx3 += blockDimension; } alpaka::block::sync::syncBlockThreads(acc); @@ -98,28 +88,25 @@ struct mykernel { assert(N == hist.size()); // bin - uint32_t endElementIdx4 = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist.size() - 1; threadIdx += 
blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx4, hist.size() - 1); ++j) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < hist.size() - 1; threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t j = threadIdx; j < std::min(endElementIdx, hist.size() - 1); ++j) { auto p = hist.begin() + j; assert((*p) < N); auto k1 = Hist::bin(v[*p]); auto k2 = Hist::bin(v[*(p + 1)]); assert(k2 >= k1); } - endElementIdx4 += blockDimension; } // forEachInWindow - uint32_t endElementIdx5 = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < hist.size(); threadIdx += blockDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx5, hist.size()); ++i) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < hist.size(); threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, hist.size()); ++i) { auto p = hist.begin() + i; auto j = *p; auto b0 = Hist::bin(v[j]); int tot = 0; auto ftest = [&](unsigned int k) { - assert(k >= 0 && k < N); + assert(k < N); ++tot; }; cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); @@ -140,7 +127,6 @@ struct mykernel { rtot = hist.end(bp) - hist.begin(bm); assert(tot == rtot); } - endElementIdx5 += blockDimension; } } From 5c9a8b9264c3c2899afe7ad0d054a90bd5812982 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Fri, 29 Jan 2021 15:02:41 +0100 Subject: [PATCH 25/32] Remove psws from HistoContainer class, never used. It corresponds to the number of blocks used in a prefix scan. 
--- src/alpaka/AlpakaCore/HistoContainer.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index b0bed60b5..097360254 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -81,13 +81,6 @@ namespace cms { } }; - struct storePrefixScanWorkingSpace { - template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, Histo *__restrict__ h, const unsigned int nblocks) const { - h->psws = nblocks; - } - }; - template ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize( Histo *__restrict__ h, @@ -105,11 +98,6 @@ namespace cms { const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; const Vec1 blocksPerGrid(nblocks); - alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel( - WorkDiv1{Vec1::all(1u), Vec1::all(1u), Vec1::all(1u)}, storePrefixScanWorkingSpace(), h, nblocks)); - const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel( @@ -354,7 +342,6 @@ namespace cms { constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } Counter off[totbins()]; - int32_t psws; // prefix-scan working space index_type bins[capacity()]; }; From 94176e972bcee3395624b3fd6d658fb5606e2b3d Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Fri, 29 Jan 2021 15:10:59 +0100 Subject: [PATCH 26/32] Indices with += gridDimension stride: add endElementIdx within loop, which already makes things clearer. NB: TO DO: add a dedicated helper function. 
--- src/alpaka/AlpakaCore/HistoContainer.h | 54 ++++++++++------------ src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 42 ++++++----------- 2 files changed, 39 insertions(+), 57 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 097360254..bfabda366 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -17,27 +17,25 @@ namespace cms { struct countFromVector { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, - Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) const { - const uint32_t nt = offsets[nh]; - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { - auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - h->count(acc, v[i], ih); - } - endElementIdx += gridDimension; - } + ALPAKA_FN_ACC void operator()(const T_Acc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension, endElementIdx += gridDimension) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { + 
auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + h->count(acc, v[i], ih); + } + } } }; @@ -51,10 +49,9 @@ namespace cms { const uint32_t nt = offsets[nh]; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); @@ -63,7 +60,6 @@ namespace cms { assert(ih < int(nh)); h->fill(acc, v[i], i, ih); } - endElementIdx += gridDimension; } } }; @@ -278,14 +274,12 @@ namespace cms { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); + cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); - uint32_t endElementIdx = m + endElementIdxNoStride[0u]; - for (uint32_t threadIdx = m + firstElementIdxNoStride[0u]; threadIdx < totbins(); threadIdx += gridDimension) { + for (uint32_t threadIdx = m + firstElementIdxNoStride[0u], endElementIdx = m + endElementIdxNoStride[0u]; threadIdx < totbins(); threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins()); ++i) { off[i] = n; - } - endElementIdx += gridDimension; + } } } diff --git 
a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index a80f09bee..9359d4a8e 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -26,9 +26,8 @@ struct countMultiLocal { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { + cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { auto&& local = alpaka::block::shared::st::allocVar(acc); if (threadIdxLocal == 0) { @@ -41,7 +40,6 @@ struct countMultiLocal { assoc->add(acc, local); } } - endElementIdx += gridDimension; } } }; @@ -54,13 +52,11 @@ struct countMulti { uint32_t n) const { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { + cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { assoc->countDirect(acc, 2 + i % 4); } - endElementIdx += gridDimension; } } }; @@ 
-71,14 +67,12 @@ struct verifyMulti { const uint32_t maxNumberOfElements = Multiplicity::totbins(); const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; - threadIdx += gridDimension) { + cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; + threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { assert(m1->off[i] == m2->off[i]); } - endElementIdx += gridDimension; } } }; @@ -92,10 +86,9 @@ struct count { const uint32_t maxNumberOfElements = 4 * n; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; - threadIdx += gridDimension) { + cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; + threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { auto k = i / 4; auto j = i - 4 * k; @@ -107,7 +100,6 @@ struct count { assoc->countDirect(acc, tk[k][j]); } } - endElementIdx += gridDimension; } } }; @@ -121,10 +113,9 @@ struct fill { const uint32_t maxNumberOfElements = 4 
* n; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; - threadIdx += gridDimension) { + cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; + threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { auto k = i / 4; auto j = i - 4 * k; @@ -136,7 +127,6 @@ struct fill { assoc->fillDirect(acc, tk[k][j], k); } } - endElementIdx += gridDimension; } } }; @@ -157,14 +147,12 @@ struct fillBulk { uint32_t n) const { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); - uint32_t endElementIdx = endElementIdxNoStride[0u]; - for (uint32_t threadIdx = firstElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension) { + cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t k = threadIdx; k < std::min(endElementIdx, n); ++k) { auto m = tk[k][3] < MaxElem ? 
4 : 3; assoc->bulkFill(acc, *apc, &tk[k][0], m); } - endElementIdx += gridDimension; } } }; From cbb609fade1e5f88f19553963c62d1977fbfc30d Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Fri, 29 Jan 2021 15:12:14 +0100 Subject: [PATCH 27/32] clang-format --- src/alpaka/AlpakaCore/HistoContainer.h | 54 +++++---- src/alpaka/AlpakaCore/alpakaWorkDivHelper.h | 14 +-- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 108 +++++++++--------- src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 30 +++-- 4 files changed, 109 insertions(+), 97 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index bfabda366..40aa3b364 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -17,25 +17,27 @@ namespace cms { struct countFromVector { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, - Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) const { - const uint32_t nt = offsets[nh]; - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { - auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - h->count(acc, v[i], ih); - } - } + ALPAKA_FN_ACC void operator()(const T_Acc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + const auto &[firstElementIdxNoStride, 
endElementIdxNoStride] = + cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < nt; + threadIdx += gridDimension, endElementIdx += gridDimension) { + for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + h->count(acc, v[i], ih); + } + } } }; @@ -49,9 +51,11 @@ namespace cms { const uint32_t nt = offsets[nh]; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension, endElementIdx += gridDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < nt; + threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); @@ -274,12 +278,14 @@ namespace cms { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); + cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); - for (uint32_t threadIdx = m + firstElementIdxNoStride[0u], endElementIdx = m + endElementIdxNoStride[0u]; threadIdx < totbins(); threadIdx += gridDimension, endElementIdx += gridDimension) { + for (uint32_t threadIdx = m + firstElementIdxNoStride[0u], endElementIdx = m + 
endElementIdxNoStride[0u]; + threadIdx < totbins(); + threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins()); ++i) { off[i] = n; - } + } } } diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index f1701e6ae..c9f232dee 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -8,7 +8,6 @@ using namespace alpaka_common; namespace cms { namespace alpakatools { - /* * Creates the accelerator-dependent workdiv. */ @@ -29,14 +28,12 @@ namespace cms { #endif } - /* * Computes the range of the element(s) global index(es) in grid. * Warning: the max index is not truncated by any max number of elements. */ template > - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_uncut(const T_Acc& acc) { - + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_uncut(const T_Acc& acc) { Vec firstElementIdxGlobalVec = Vec::zeros(); Vec endElementIdxUncutGlobalVec = Vec::zeros(); @@ -58,16 +55,14 @@ namespace cms { return {firstElementIdxGlobalVec, endElementIdxUncutGlobalVec}; } - /* * Computes the range of the element(s) global index(es) in grid. 
*/ template - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc, - const Vec& maxNumberOfElements) { - + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc, + const Vec& maxNumberOfElements) { static_assert(alpaka::dim::Dim::value == T_Dim::value, - "Accelerator and maxNumberOfElements need to have same dimension."); + "Accelerator and maxNumberOfElements need to have same dimension."); auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_global_index_range_uncut(acc); for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { @@ -77,7 +72,6 @@ namespace cms { return {firstElementIdxGlobalVec, endElementIdxGlobalVec}; } - } // namespace alpakatools } // namespace cms diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index 77770cc93..65c23d732 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -9,7 +9,7 @@ template struct mykernel { template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, T const* __restrict__ v, uint32_t N) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* __restrict__ v, uint32_t N) const { assert(v); assert(N == 12000); @@ -17,34 +17,39 @@ struct mykernel { if (threadIdxLocal == 0) { printf("start kernel for %d data\n", N); } - + using Hist = cms::alpakatools::HistoContainer; auto&& hist = alpaka::block::shared::st::allocVar(acc); auto&& ws = alpaka::block::shared::st::allocVar(acc); - + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_uncut(acc); + cms::alpakatools::element_global_index_range_uncut(acc); // set off zero - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < Hist::totbins(); threadIdx += blockDimension, endElementIdx += 
blockDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < Hist::totbins(); + threadIdx += blockDimension, endElementIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, Hist::totbins()); ++j) { - hist.off[j] = 0; + hist.off[j] = 0; } } alpaka::block::sync::syncBlockThreads(acc); // set bins zero - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < Hist::capacity(); threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < Hist::capacity(); + threadIdx += blockDimension, endElementIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, Hist::totbins()); ++j) { - hist.bins[j] = 0; + hist.bins[j] = 0; } } alpaka::block::sync::syncBlockThreads(acc); // count - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < N; + threadIdx += blockDimension, endElementIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { hist.count(acc, v[j]); } @@ -64,7 +69,9 @@ struct mykernel { assert(N == hist.size()); // verify - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < Hist::nbins(); threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < Hist::nbins(); + threadIdx += blockDimension, endElementIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, Hist::nbins()); ++j) { assert(hist.off[j] <= hist.off[j + 
1]); } @@ -77,7 +84,8 @@ struct mykernel { alpaka::block::sync::syncBlockThreads(acc); // fill - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < N; threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < N; + threadIdx += blockDimension, endElementIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { hist.fill(acc, v[j], j); } @@ -88,7 +96,9 @@ struct mykernel { assert(N == hist.size()); // bin - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < hist.size() - 1; threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < hist.size() - 1; + threadIdx += blockDimension, endElementIdx += blockDimension) { for (uint32_t j = threadIdx; j < std::min(endElementIdx, hist.size() - 1); ++j) { auto p = hist.begin() + j; assert((*p) < N); @@ -99,42 +109,44 @@ struct mykernel { } // forEachInWindow - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < hist.size(); threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < hist.size(); + threadIdx += blockDimension, endElementIdx += blockDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, hist.size()); ++i) { - auto p = hist.begin() + i; - auto j = *p; - auto b0 = Hist::bin(v[j]); - int tot = 0; - auto ftest = [&](unsigned int k) { - assert(k < N); - ++tot; - }; - cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); - int rtot = hist.size(b0); - assert(tot == rtot); - tot = 0; - auto vm = int(v[j]) - DELTA; - auto vp = int(v[j]) + DELTA; - 
constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); - vm = std::max(vm, 0); - vm = std::min(vm, vmax); - vp = std::min(vp, vmax); - vp = std::max(vp, 0); - assert(vp >= vm); - cms::alpakatools::forEachInWindow(hist, vm, vp, ftest); - int bp = Hist::bin(vp); - int bm = Hist::bin(vm); - rtot = hist.end(bp) - hist.begin(bm); + auto p = hist.begin() + i; + auto j = *p; + auto b0 = Hist::bin(v[j]); + int tot = 0; + auto ftest = [&](unsigned int k) { + assert(k < N); + ++tot; + }; + cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); + int rtot = hist.size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + cms::alpakatools::forEachInWindow(hist, vm, vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); assert(tot == rtot); } } - } }; - template -void go(const DevHost& host, const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1& device, ALPAKA_ACCELERATOR_NAMESPACE::Queue& queue) { +void go(const DevHost& host, + const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1& device, + ALPAKA_ACCELERATOR_NAMESPACE::Queue& queue) { std::mt19937 eng; int rmin = std::numeric_limits::min(); @@ -145,7 +157,7 @@ void go(const DevHost& host, const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1& device } std::uniform_int_distribution rgen(rmin, rmax); - constexpr unsigned int N = 12000; + constexpr unsigned int N = 12000; using Hist = cms::alpakatools::HistoContainer; std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' @@ -163,28 +175,22 @@ void go(const DevHost& host, const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1& device for (long long j = N / 2; j < N / 2 + N / 4; j++) v[j] = 4; - alpaka::mem::view::copy(queue, v_dbuf, v_hbuf, 
N); const Vec1& threadsPerBlockOrElementsPerThread(Vec1::all(256)); const Vec1& blocksPerGrid(Vec1::all(1)); - const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + const WorkDiv1& workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDiv, - mykernel(), - alpaka::mem::view::getPtrNative(v_dbuf), - N - )); - + alpaka::kernel::createTaskKernel( + workDiv, mykernel(), alpaka::mem::view::getPtrNative(v_dbuf), N)); } alpaka::wait::wait(queue); } - int main() { const DevHost host(alpaka::pltf::getDevByIdx(0u)); const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 device( - alpaka::pltf::getDevByIdx(0u)); + alpaka::pltf::getDevByIdx(0u)); ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); go(host, device, queue); diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index 9359d4a8e..4036e7d87 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -26,8 +26,9 @@ struct countMultiLocal { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension, endElementIdx += gridDimension) { + cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; + threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { auto&& local = alpaka::block::shared::st::allocVar(acc); if (threadIdxLocal == 0) { @@ -52,8 +53,9 @@ 
struct countMulti { uint32_t n) const { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension, endElementIdx += gridDimension) { + cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; + threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { assoc->countDirect(acc, 2 + i % 4); } @@ -67,8 +69,9 @@ struct verifyMulti { const uint32_t maxNumberOfElements = Multiplicity::totbins(); const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; + cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < maxNumberOfElements; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { assert(m1->off[i] == m2->off[i]); @@ -86,8 +89,9 @@ struct count { const uint32_t maxNumberOfElements = 4 * n; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = 
endElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; + cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < maxNumberOfElements; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { auto k = i / 4; @@ -113,8 +117,9 @@ struct fill { const uint32_t maxNumberOfElements = 4 * n; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; + cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < maxNumberOfElements; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { auto k = i / 4; @@ -147,8 +152,9 @@ struct fillBulk { uint32_t n) const { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension, endElementIdx += gridDimension) { + cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; + threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t k = threadIdx; k < std::min(endElementIdx, n); ++k) 
{ auto m = tk[k][3] < MaxElem ? 4 : 3; assoc->bulkFill(acc, *apc, &tk[k][0], m); From d759b1f56d5dc0769e93e4a7cc1ca380c288c825 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Mon, 1 Feb 2021 11:05:10 +0100 Subject: [PATCH 28/32] [alpaka] PrefixScan: use dynamic shared memory (as in CUDA version) instead of global memory. This changes the prefixScan interface (closer to CUDA version), hence the call sites. Important: note that, in any case, the amount of memory needed was never num_items * sizeof(T); num_blocks * sizeof(T) is sufficient. --- src/alpaka/AlpakaCore/HistoContainer.h | 7 ++--- src/alpaka/AlpakaCore/prefixScan.h | 42 ++++++++++++++++++++++++-- src/alpaka/test/alpaka/prefixScan_t.cc | 7 ----- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 40aa3b364..ff50672b2 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -90,9 +90,6 @@ namespace cms { const int num_items = Histo::totbins(); - auto psum_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items)); - uint32_t *psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); - const unsigned int nthreads = 1024; const Vec1 threadsPerBlockOrElementsPerThread(nthreads); const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; @@ -101,14 +98,14 @@ namespace cms { const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel( - workDiv, multiBlockPrefixScanFirstStep(), poff, poff, psum_d, num_items)); + workDiv, multiBlockPrefixScanFirstStep(), poff, poff, num_items)); const WorkDiv1 &workDivWith1Block = cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread); alpaka::queue::enqueue( queue, alpaka::kernel::createTaskKernel( - workDivWith1Block, multiBlockPrefixScanSecondStep(), poff, poff, psum_d, num_items, 
nblocks)); + workDivWith1Block, multiBlockPrefixScanSecondStep(), poff, poff, num_items, nblocks)); } template diff --git a/src/alpaka/AlpakaCore/prefixScan.h b/src/alpaka/AlpakaCore/prefixScan.h index 071267581..ed2ef56fc 100644 --- a/src/alpaka/AlpakaCore/prefixScan.h +++ b/src/alpaka/AlpakaCore/prefixScan.h @@ -136,7 +136,7 @@ namespace cms { template struct multiBlockPrefixScanFirstStep { template - ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, T* psum, int32_t size) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size) const { uint32_t const gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); uint32_t const blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); uint32_t const threadDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); @@ -157,15 +157,17 @@ namespace cms { template struct multiBlockPrefixScanSecondStep { template - ALPAKA_FN_ACC void operator()( - const T_Acc& acc, T const* ci, T* co, T* psum, int32_t size, int32_t numBlocks) const { + ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size, int32_t numBlocks) const { uint32_t const blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); uint32_t const threadDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); uint32_t const blockIdx(alpaka::idx::getIdx(acc)[0u]); uint32_t const threadIdx(alpaka::idx::getIdx(acc)[0u]); + auto* psum(alpaka::block::shared::dyn::getMem(acc)); + auto&& ws = alpaka::block::shared::st::allocVar(acc); + // first each block does a scan of size 1024; (better be enough blocks....) assert(static_cast(blockDimension * threadDimension) >= numBlocks); for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { @@ -192,7 +194,41 @@ namespace cms { } } }; + } // namespace alpakatools } // namespace cms +namespace alpaka { + namespace kernel { + namespace traits { + //############################################################################# + //! 
The trait for getting the size of the block shared dynamic memory for a kernel. + template + struct BlockSharedMemDynSizeBytes, TAcc> { + //----------------------------------------------------------------------------- + //! \return The size of the shared memory allocated for a block. + template + ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes( + cms::alpakatools::multiBlockPrefixScanSecondStep const& myKernel, + TVec const& blockThreadExtent, + TVec const& threadElemExtent, + T const* ci, + T* co, + int32_t size, + int32_t numBlocks) -> T { + alpaka::ignore_unused(myKernel); + alpaka::ignore_unused(blockThreadExtent); + alpaka::ignore_unused(threadElemExtent); + alpaka::ignore_unused(ci); + alpaka::ignore_unused(co); + alpaka::ignore_unused(size); + + // Reserve the dynamic shared buffer read by the kernel as psum: one T per block. + return static_cast(numBlocks) * sizeof(T); + } + }; + } // namespace traits + } // namespace kernel +} // namespace alpaka + #endif // HeterogeneousCore_AlpakaUtilities_interface_prefixScan_h diff --git a/src/alpaka/test/alpaka/prefixScan_t.cc b/src/alpaka/test/alpaka/prefixScan_t.cc index 93e1c6cc6..136312628 100644 --- a/src/alpaka/test/alpaka/prefixScan_t.cc +++ b/src/alpaka/test/alpaka/prefixScan_t.cc @@ -195,11 +195,6 @@ int main() { WorkDiv1{Vec1::all(nblocks), Vec1::all(nthreads), Vec1::all(1)}, init(), input_d, 1, num_items)); alpaka::wait::wait(queue); - auto psum_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items * sizeof(uint32_t))); - uint32_t* psum_d = alpaka::mem::view::getPtrNative(psum_dBuf); - - alpaka::mem::view::set(queue, psum_dBuf, 0u, Vec1::all(num_items * sizeof(uint32_t))); - #if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED nthreads = 1; auto nelements = 768; @@ -217,7 +212,6 @@ int main() { multiBlockPrefixScanFirstStep(), input_d, output1_d, - psum_d, num_items)); alpaka::wait::wait(queue); alpaka::queue::enqueue( @@ -226,7 +220,6 @@ int main() { multiBlockPrefixScanSecondStep(), input_d, output1_d, - psum_d, 
num_items, nblocks)); alpaka::wait::wait(queue); From c1824b8954156ef0617c7bc0ae3b79830cb120fc Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Mon, 1 Feb 2021 11:20:06 +0100 Subject: [PATCH 29/32] Renaming: element_global_index_range to compute non-truncated range, and element_global_index_range_truncated to compute range truncated by max number of elements of interest. --- src/alpaka/AlpakaCore/HistoContainer.h | 8 ++++---- src/alpaka/AlpakaCore/alpakaWorkDivHelper.h | 11 ++++++----- src/alpaka/test/alpaka/AtomicPairCounter_t.cc | 4 ++-- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 2 +- src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 12 ++++++------ src/alpaka/test/alpaka/prefixScan_t.cc | 4 ++-- src/alpakatest/AlpakaCore/alpakaWorkDivHelper.h | 2 +- src/alpakatest/plugin-Test1/alpaka/alpakaAlgo1.cc | 10 +++++----- src/alpakatest/plugin-Test2/alpaka/alpakaAlgo2.cc | 10 +++++----- 9 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index ff50672b2..35c865d57 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -25,7 +25,7 @@ namespace cms { const uint32_t nt = offsets[nh]; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt)); for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < nt; threadIdx += gridDimension, endElementIdx += gridDimension) { @@ -51,7 +51,7 @@ namespace cms { const uint32_t nt = offsets[nh]; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(nt)); + 
cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt)); for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < nt; @@ -73,7 +73,7 @@ namespace cms { ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc, Histo *__restrict__ h) const { const auto &[firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(Histo::totbins())); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(Histo::totbins())); for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { h->off[i] = 0; @@ -275,7 +275,7 @@ namespace cms { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(totbins())); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(totbins())); for (uint32_t threadIdx = m + firstElementIdxNoStride[0u], endElementIdx = m + endElementIdxNoStride[0u]; threadIdx < totbins(); diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index c9f232dee..a04828757 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -30,10 +30,10 @@ namespace cms { /* * Computes the range of the element(s) global index(es) in grid. - * Warning: the max index is not truncated by any max number of elements. + * Warning: the max index is not truncated by the max number of elements of interest. 
*/ template > - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_uncut(const T_Acc& acc) { + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc) { Vec firstElementIdxGlobalVec = Vec::zeros(); Vec endElementIdxUncutGlobalVec = Vec::zeros(); @@ -57,13 +57,14 @@ namespace cms { /* * Computes the range of the element(s) global index(es) in grid. + * Truncated by the max number of elements of interest. */ template - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc, - const Vec& maxNumberOfElements) { + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_truncated(const T_Acc& acc, + const Vec& maxNumberOfElements) { static_assert(alpaka::dim::Dim::value == T_Dim::value, "Accelerator and maxNumberOfElements need to have same dimension."); - auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_global_index_range_uncut(acc); + auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_global_index_range(acc); for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { endElementIdxGlobalVec[dimIndex] = std::min(endElementIdxGlobalVec[dimIndex], maxNumberOfElements[dimIndex]); diff --git a/src/alpaka/test/alpaka/AtomicPairCounter_t.cc b/src/alpaka/test/alpaka/AtomicPairCounter_t.cc index 49fca54b1..3eefc81a3 100644 --- a/src/alpaka/test/alpaka/AtomicPairCounter_t.cc +++ b/src/alpaka/test/alpaka/AtomicPairCounter_t.cc @@ -12,7 +12,7 @@ struct update { ALPAKA_FN_ACC void operator()( const T_Acc &acc, cms::alpakatools::AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { const auto &[firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { auto m = i % 11; @@ -46,7 +46,7 @@ struct verify { uint32_t const *cont, uint32_t n) const { const 
auto &[firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { assert(0 == ind[0]); diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index 65c23d732..ac7ff1c43 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -25,7 +25,7 @@ struct mykernel { const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_uncut(acc); + cms::alpakatools::element_global_index_range(acc); // set off zero for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index 4036e7d87..00e774a08 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -26,7 +26,7 @@ struct countMultiLocal { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { @@ -53,7 +53,7 @@ struct countMulti { uint32_t n) const { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - 
cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { @@ -69,7 +69,7 @@ struct verifyMulti { const uint32_t maxNumberOfElements = Multiplicity::totbins(); const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(maxNumberOfElements)); for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension, endElementIdx += gridDimension) { @@ -89,7 +89,7 @@ struct count { const uint32_t maxNumberOfElements = 4 * n; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(maxNumberOfElements)); for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension, endElementIdx += gridDimension) { @@ -117,7 +117,7 @@ struct fill { const uint32_t maxNumberOfElements = 4 * n; const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(maxNumberOfElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(maxNumberOfElements)); for (uint32_t threadIdx = 
firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < maxNumberOfElements; threadIdx += gridDimension, endElementIdx += gridDimension) { @@ -152,7 +152,7 @@ struct fillBulk { uint32_t n) const { const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; threadIdx += gridDimension, endElementIdx += gridDimension) { for (uint32_t k = threadIdx; k < std::min(endElementIdx, n); ++k) { diff --git a/src/alpaka/test/alpaka/prefixScan_t.cc b/src/alpaka/test/alpaka/prefixScan_t.cc index 136312628..59e63db68 100644 --- a/src/alpaka/test/alpaka/prefixScan_t.cc +++ b/src/alpaka/test/alpaka/prefixScan_t.cc @@ -84,7 +84,7 @@ struct init { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t* v, uint32_t val, uint32_t n) const { const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); for (uint32_t index = firstElementIdxGlobal[0u]; index < endElementIdxGlobal[0u]; ++index) { v[index] = val; @@ -99,7 +99,7 @@ struct verify { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t const* v, uint32_t n) const { const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(n)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); for (uint32_t index = firstElementIdxGlobal[0u]; index < endElementIdxGlobal[0u]; ++index) { assert(v[index] == index + 1); diff --git a/src/alpakatest/AlpakaCore/alpakaWorkDivHelper.h b/src/alpakatest/AlpakaCore/alpakaWorkDivHelper.h index 
4dc36caa0..67ebf9781 100644 --- a/src/alpakatest/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpakatest/AlpakaCore/alpakaWorkDivHelper.h @@ -32,7 +32,7 @@ namespace cms { * Computes the range of the element(s) global index(es) in grid. */ template - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc, + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_truncated(const T_Acc& acc, const Vec& maxNumberOfElements) { Vec firstElementIdxGlobalVec = Vec::zeros(); Vec endElementIdxGlobalVec = Vec::zeros(); diff --git a/src/alpakatest/plugin-Test1/alpaka/alpakaAlgo1.cc b/src/alpakatest/plugin-Test1/alpaka/alpakaAlgo1.cc index 6e20b8df5..1e62b169e 100644 --- a/src/alpakatest/plugin-Test1/alpaka/alpakaAlgo1.cc +++ b/src/alpakatest/plugin-Test1/alpaka/alpakaAlgo1.cc @@ -14,7 +14,7 @@ namespace { // Global element index in 1D grid. // NB: On GPU, i = threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { c[i] = a[i] + b[i]; @@ -32,7 +32,7 @@ namespace { // Global element index in 2D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec2::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec2::all(numElements)); for (uint32_t col = firstElementIdxGlobal[0u]; col < endElementIdxGlobal[0u]; ++col) { for (uint32_t row = firstElementIdxGlobal[1u]; row < endElementIdxGlobal[1u]; ++row) { @@ -52,7 +52,7 @@ namespace { // Global element index in 2D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. 
const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec2::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec2::all(numElements)); for (uint32_t col = firstElementIdxGlobal[0u]; col < endElementIdxGlobal[0u]; ++col) { for (uint32_t row = firstElementIdxGlobal[1u]; row < endElementIdxGlobal[1u]; ++row) { @@ -76,7 +76,7 @@ namespace { // Global element index in 1D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); for (uint32_t row = firstElementIdxGlobal[0u]; row < endElementIdxGlobal[0u]; ++row) { T_Data tmp = 0; @@ -101,7 +101,7 @@ namespace { // Global element index in 1D grid. // NB: On GPU, i = threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { // theoreticalResult = i+i^2 = i*(i+1) diff --git a/src/alpakatest/plugin-Test2/alpaka/alpakaAlgo2.cc b/src/alpakatest/plugin-Test2/alpaka/alpakaAlgo2.cc index 56b105058..9e8cef29c 100644 --- a/src/alpakatest/plugin-Test2/alpaka/alpakaAlgo2.cc +++ b/src/alpakatest/plugin-Test2/alpaka/alpakaAlgo2.cc @@ -14,7 +14,7 @@ namespace { // Global element index in 1D grid. // NB: On GPU, i = threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. 
const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { c[i] = a[i] + b[i]; @@ -32,7 +32,7 @@ namespace { // Global element index in 2D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec2::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec2::all(numElements)); for (uint32_t col = firstElementIdxGlobal[0u]; col < endElementIdxGlobal[0u]; ++col) { for (uint32_t row = firstElementIdxGlobal[1u]; row < endElementIdxGlobal[1u]; ++row) { @@ -52,7 +52,7 @@ namespace { // Global element index in 2D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec2::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec2::all(numElements)); for (uint32_t col = firstElementIdxGlobal[0u]; col < endElementIdxGlobal[0u]; ++col) { for (uint32_t row = firstElementIdxGlobal[1u]; row < endElementIdxGlobal[1u]; ++row) { @@ -76,7 +76,7 @@ namespace { // Global element index in 1D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); for (uint32_t row = firstElementIdxGlobal[0u]; row < endElementIdxGlobal[0u]; ++row) { T_Data tmp = 0; @@ -101,7 +101,7 @@ namespace { // Global element index in 1D grid. 
// NB: On GPU, i = threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range(acc, Vec1::all(numElements)); + cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { // theoreticalResult = i+i^2 = i*(i+1) From 52ccf4984aeccb0560e38092aae924e5c9f650b8 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Mon, 1 Feb 2021 11:20:56 +0100 Subject: [PATCH 30/32] clang-format --- src/alpaka/AlpakaCore/alpakaWorkDivHelper.h | 6 +++--- src/alpaka/test/alpaka/OneHistoContainer_t.cc | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index a04828757..392b01be0 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -33,7 +33,7 @@ namespace cms { * Warning: the max index is not truncated by the max number of elements of interest. */ template > - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc) { + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc) { Vec firstElementIdxGlobalVec = Vec::zeros(); Vec endElementIdxUncutGlobalVec = Vec::zeros(); @@ -60,8 +60,8 @@ namespace cms { * Truncated by the max number of elements of interest. 
*/ template - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_truncated(const T_Acc& acc, - const Vec& maxNumberOfElements) { + ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_truncated( + const T_Acc& acc, const Vec& maxNumberOfElements) { static_assert(alpaka::dim::Dim::value == T_Dim::value, "Accelerator and maxNumberOfElements need to have same dimension."); auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_global_index_range(acc); diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index ac7ff1c43..a86d59e76 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -24,8 +24,7 @@ struct mykernel { auto&& ws = alpaka::block::shared::st::allocVar(acc); const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc); // set off zero for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; From 788556daae5b1b52bc2e5353769d78217354e834 Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Mon, 1 Feb 2021 11:55:58 +0100 Subject: [PATCH 31/32] minor cleaning --- src/alpaka/AlpakaCore/prefixScan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/alpaka/AlpakaCore/prefixScan.h b/src/alpaka/AlpakaCore/prefixScan.h index ed2ef56fc..829d949fa 100644 --- a/src/alpaka/AlpakaCore/prefixScan.h +++ b/src/alpaka/AlpakaCore/prefixScan.h @@ -164,9 +164,7 @@ namespace cms { uint32_t const blockIdx(alpaka::idx::getIdx(acc)[0u]); uint32_t const threadIdx(alpaka::idx::getIdx(acc)[0u]); - auto* psum(alpaka::block::shared::dyn::getMem(acc)); - - auto&& ws = alpaka::block::shared::st::allocVar(acc); + auto* const 
psum(alpaka::block::shared::dyn::getMem(acc)); // first each block does a scan of size 1024; (better be enough blocks....) assert(static_cast(blockDimension * threadDimension) >= numBlocks); @@ -183,6 +181,7 @@ namespace cms { alpaka::block::sync::syncBlockThreads(acc); + auto&& ws = alpaka::block::shared::st::allocVar(acc); blockPrefixScan(acc, psum, psum, numBlocks, ws); for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { @@ -201,6 +200,7 @@ namespace cms { namespace alpaka { namespace kernel { namespace traits { + //############################################################################# //! The trait for getting the size of the block shared dynamic memory for a kernel. template @@ -223,10 +223,10 @@ namespace alpaka { alpaka::ignore_unused(co); alpaka::ignore_unused(size); - // Reserve the buffer for the two blocks of A and B. return static_cast(numBlocks) * sizeof(T); } }; + } // namespace traits } // namespace kernel } // namespace alpaka From 548b4b91aab570582a90683398232b74ff13ceac Mon Sep 17 00:00:00 2001 From: Gabrielle Hugo Date: Tue, 2 Feb 2021 10:04:27 +0100 Subject: [PATCH 32/32] Forgot to remove device, now that memory allocation is removed --- src/alpaka/AlpakaCore/HistoContainer.h | 7 ++----- src/alpaka/test/alpaka/HistoContainer_t.cc | 1 - src/alpaka/test/alpaka/OneToManyAssoc_t.cc | 6 +++--- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 35c865d57..fae237362 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -83,9 +83,7 @@ namespace cms { template ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize( - Histo *__restrict__ h, - const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device, - ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { + Histo *__restrict__ h, ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { uint32_t *poff = (uint32_t *)((char *)(h) + 
offsetof(Histo, off)); const int num_items = Histo::totbins(); @@ -116,7 +114,6 @@ namespace cms { uint32_t const *__restrict__ offsets, uint32_t totSize, unsigned int nthreads, - const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 &device, ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { const unsigned int nblocks = (totSize + nthreads - 1) / nthreads; const Vec1 blocksPerGrid(nblocks); @@ -129,7 +126,7 @@ namespace cms { alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel( workDiv, countFromVector(), h, nh, v, offsets)); - launchFinalize(h, device, queue); + launchFinalize(h, queue); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel( diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index 64fc5e3cf..bf790b59c 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -78,7 +78,6 @@ void go(const DevHost& host, alpaka::mem::view::getPtrNative(off_d), offsets[10], 256, - device, queue); alpaka::mem::view::copy(queue, h_buf, h_d, 1u); diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index 00e774a08..24393a81e 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -247,7 +247,7 @@ int main() { alpaka::kernel::createTaskKernel( workDiv4N, count(), alpaka::mem::view::getPtrNative(v_dbuf), alpaka::mem::view::getPtrNative(a_dbuf), N)); - cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(a_dbuf), device, queue); + cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(a_dbuf), queue); alpaka::queue::enqueue( queue, @@ -405,8 +405,8 @@ int main() { alpaka::mem::view::getPtrNative(m1_dbuf), alpaka::mem::view::getPtrNative(m2_dbuf))); - cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m1_dbuf), device, queue); - cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m2_dbuf), device, queue); + 
cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m1_dbuf), queue); + cms::alpakatools::launchFinalize(alpaka::mem::view::getPtrNative(m2_dbuf), queue); alpaka::queue::enqueue( queue,