Commit

Full workflow from raw data to pixel tracks and vertices on GPUs (#216)
Port the full workflow from pixel raw data to pixel tracks and vertices to GPUs, and optimise it.
Clean the pixel n-tuplets with the "fishbone" algorithm (only on GPUs).

Other changes:
  - recover the Riemann fit updates lost during the merge with CMSSW 10.4.x;
  - speed up clustering and track fitting;
  - fix a minor bug to avoid a trivial regression with the optimised fit.
VinInn authored and fwyzard committed Jan 9, 2019
1 parent c258a04 commit 87ad758
Showing 76 changed files with 4,622 additions and 2,461 deletions.
61 changes: 61 additions & 0 deletions Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
@@ -2,6 +2,7 @@
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h

#include <cstdint>
#include <array>

namespace phase1PixelTopology {

@@ -29,6 +30,66 @@ namespace phase1PixelTopology {
};


template<class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
-> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
{
return {{ f(Indices)... }};
}

template<int N, class Function>
constexpr auto map_to_array(Function f)
-> std::array<typename std::result_of<Function(std::size_t)>::type, N>
{
return map_to_array_helper(f, std::make_index_sequence<N>{});
}


constexpr uint32_t findMaxModuleStride() {
bool go = true;
int n=2;
while (go) {
for (uint8_t i=1; i<11; ++i) {
if (layerStart[i]%n !=0) {go=false; break;}
}
if(!go) break;
n*=2;
}
return n/2;
}

constexpr uint32_t maxModuleStride = findMaxModuleStride();


constexpr uint8_t findLayer(uint32_t detId) {
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}

constexpr uint8_t findLayerFromCompact(uint32_t detId) {
detId*=maxModuleStride;
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}


constexpr uint32_t layerIndexSize = numberOfModules/maxModuleStride;
constexpr std::array<uint8_t,layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);

constexpr bool validateLayerIndex() {
bool res=true;
for (auto i=0U; i<numberOfModules; ++i) {
auto j = i/maxModuleStride;
res &=(layer[j]<10);
res &=(i>=layerStart[layer[j]]);
res &=(i<layerStart[layer[j]+1]);
}
return res;
}

static_assert(validateLayerIndex(),"layer from detIndex algo is buggy");


// this is for the ROC n<512 (upgrade 1024)
constexpr inline
uint16_t divu52(uint16_t n) {
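The map_to_array helper above fills a std::array at compile time by evaluating a constexpr function at every index, which is how the layer lookup table is generated from findLayerFromCompact. A minimal, self-contained sketch of the same pattern (not part of this commit; the square function is a hypothetical stand-in):

#include <array>
#include <cstddef>
#include <type_traits>
#include <utility>

template<class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
  -> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
{
  return {{ f(Indices)... }};
}

template<int N, class Function>
constexpr auto map_to_array(Function f)
  -> std::array<typename std::result_of<Function(std::size_t)>::type, N>
{
  return map_to_array_helper(f, std::make_index_sequence<N>{});
}

// hypothetical stand-in for findLayerFromCompact
constexpr int square(std::size_t i) { return int(i * i); }

// the whole table is computed by the compiler, like the 'layer' array above
constexpr auto squares = map_to_array<8>(square);
static_assert(squares[5] == 25, "table filled at compile time");

int main() { return 0; }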
@@ -141,5 +141,13 @@ int main() {
assert(std::get<1>(ori)==bp);
}

using namespace phase1PixelTopology;
for (auto i=0U; i<numberOfModules; ++i) {
assert(layer[i]<10);
assert(i>=layerStart[layer[i]]);
assert(i<layerStart[layer[i]+1]);
}


return 0;
}
54 changes: 54 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h
@@ -0,0 +1,54 @@
#ifndef HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H
#define HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H

#include <cuda_runtime.h>
#include <cstdint>

class AtomicPairCounter {
public:

using c_type = unsigned long long int;

AtomicPairCounter(){}
AtomicPairCounter(c_type i) { counter.ac=i;}

__device__ __host__
AtomicPairCounter & operator=(c_type i) { counter.ac=i; return *this;}

struct Counters {
uint32_t n; // in a "One to Many" association is the number of "One"
uint32_t m; // in a "One to Many" association is the total number of associations
};

union Atomic2 {
Counters counters;
c_type ac;
};

#ifdef __CUDACC__

static constexpr c_type incr = 1UL<<32;

__device__ __host__
Counters get() const { return counter.counters;}

// increment n by 1 and m by i. return previous value
__device__
Counters add(uint32_t i) {
c_type c = i;
c+=incr;
Atomic2 ret;
ret.ac = atomicAdd(&counter.ac,c);
return ret.counters;
}

#endif

private:

Atomic2 counter;

};


#endif
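AtomicPairCounter packs two 32-bit counters into one 64-bit word, so a single atomicAdd reserves both one slot in an offset array and a contiguous range in a content array; this is how HistoContainer::bulkFill uses it further down. A hedged kernel sketch of the pattern (buffer layout and names are illustrative, not from this commit):

// Illustration only: append variable-length lists into flat (offsets, content) buffers.
__global__ void appendLists(AtomicPairCounter *apc,
                            uint32_t const *input, uint32_t const *inputSize,
                            uint32_t *offsets, uint32_t *content, uint32_t nLists) {
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= nLists) return;
  auto n = inputSize[i];
  auto c = apc->add(n);        // one atomic: reserve 1 offset slot and n content slots
  offsets[c.m] = c.n;          // this list starts at position c.n of 'content'
  for (uint32_t j = 0; j < n; ++j)
    content[c.n + j] = input[16 * i + j];   // assumes at most 16 inputs per list
}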
3 changes: 2 additions & 1 deletion HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h
@@ -80,7 +80,8 @@ template <class T> struct SimpleVector {
}

#endif // __CUDACC__

inline constexpr bool empty() const { return m_size==0;}
inline constexpr bool full() const { return m_size==m_capacity;}
inline constexpr T& operator[](int i) { return m_data[i]; }
inline constexpr const T& operator[](int i) const { return m_data[i]; }
inline constexpr void reset() { m_size = 0; }
4 changes: 4 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
@@ -80,6 +80,10 @@ template <class T, int maxSize> struct VecArray {
return T();
}

inline constexpr T const * begin() const { return m_data;}
inline constexpr T const * end() const { return m_data+m_size;}
inline constexpr T * begin() { return m_data;}
inline constexpr T * end() { return m_data+m_size;}
inline constexpr int size() const { return m_size; }
inline constexpr T& operator[](int i) { return m_data[i]; }
inline constexpr const T& operator[](int i) const { return m_data[i]; }
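The begin()/end() accessors added here make VecArray usable in range-based for loops on both host and device; a small hedged example (the device function is illustrative, not from this commit):

__device__ float sumVec(VecArray<float, 32> const &va) {
  float s = 0.f;
  for (auto x : va)   // iterates m_data[0 .. m_size) through the new begin()/end()
    s += x;
  return s;
}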
135 changes: 111 additions & 24 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -19,6 +19,8 @@
#ifdef __CUDACC__
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#endif
#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"


#ifdef __CUDACC__
namespace cudautils {
@@ -38,36 +40,51 @@

template<typename Histo, typename T>
__global__
void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets,
uint32_t * __restrict__ ws ) {
void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= offsets[nh]) return;
auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < nh);
(*h).fill(v[i], i, ws, ih);
(*h).fill(v[i], i, ih);
}

template<typename Histo>
void launchZero(Histo * __restrict__ h, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
cudaMemsetAsync(off,0, 4*Histo::totbins(),stream);
}

template<typename Histo>
void launchFinalize(Histo * __restrict__ h, uint8_t * __restrict__ ws, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
size_t wss = Histo::wsSize();
CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
}


template<typename Histo, typename T>
void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws,
void fillManyFromVector(Histo * __restrict__ h, uint8_t * __restrict__ ws,
uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize,
int nthreads, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
cudaMemsetAsync(off,0, 4*Histo::totbins(),stream);
launchZero(h,stream);
auto nblocks = (totSize + nthreads - 1) / nthreads;
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
size_t wss = Histo::totbins();
CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets,ws);
launchFinalize(h,ws,stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
}


template<typename Assoc>
__global__
void finalizeBulk(AtomicPairCounter const * apc, Assoc * __restrict__ assoc) {
assoc->bulkFinalizeFill(*apc);
}

} // namespace cudautils
#endif

@@ -149,8 +166,8 @@ class HistoContainer {
uint32_t * v =nullptr;
void * d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1);
return std::max(temp_storage_bytes,size_t(totbins()));
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
return temp_storage_bytes;
}
#endif

@@ -176,22 +193,79 @@
#endif
}

static __host__ __device__
__forceinline__
uint32_t atomicDecrement(Counter & x) {
#ifdef __CUDA_ARCH__
return atomicSub(&x, 1);
#else
return x--;
#endif
}

__host__ __device__
__forceinline__
void countDirect(T b) {
assert(b<nbins());
atomicIncrement(off[b]);
}

__host__ __device__
__forceinline__
void fillDirect(T b, index_type j) {
assert(b<nbins());
auto w = atomicDecrement(off[b]);
assert(w>0);
bins[w-1] = j;
}


#ifdef __CUDACC__
__device__
__forceinline__
uint32_t bulkFill(AtomicPairCounter & apc, index_type const * v, uint32_t n) {
auto c = apc.add(n);
off[c.m] = c.n;
for(int j=0; j<n; ++j) bins[c.n+j]=v[j];
return c.m;
}

__device__
__forceinline__
void bulkFinalize(AtomicPairCounter const & apc) {
off[apc.get().m]=apc.get().n;
}

__device__
__forceinline__
void bulkFinalizeFill(AtomicPairCounter const & apc) {
auto m = apc.get().m;
auto n = apc.get().n;
auto i = m + blockIdx.x * blockDim.x + threadIdx.x;
if (i>=totbins()) return;
off[i]=n;
}


#endif


__host__ __device__
__forceinline__
void count(T t) {
uint32_t b = bin(t);
assert(b<nbins());
atomicIncrement(off[b+1]);
atomicIncrement(off[b]);
}

__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws) {
void fill(T t, index_type j) {
uint32_t b = bin(t);
assert(b<nbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
auto w = atomicDecrement(off[b]);
assert(w>0);
bins[w-1] = j;
}


Expand All @@ -202,31 +276,35 @@ class HistoContainer {
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
atomicIncrement(off[b+1]);
atomicIncrement(off[b]);
}

__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws, uint32_t nh) {
void fill(T t, index_type j, uint32_t nh) {
uint32_t b = bin(t);
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
auto w = atomicDecrement(off[b]);
assert(w>0);
bins[w-1] = j;
}

#ifdef __CUDACC__
__device__
__forceinline__
void finalize(Counter * ws) {
blockPrefixScan(off+1,totbins()-1,ws);
assert(off[totbins()-1]==0);
blockPrefixScan(off,totbins(),ws);
assert(off[totbins()-1]==off[totbins()-2]);
}
__host__
#endif
void finalize() {
for(uint32_t i=2; i<totbins(); ++i) off[i]+=off[i-1];
assert(off[totbins()-1]==0);
for(uint32_t i=1; i<totbins(); ++i) off[i]+=off[i-1];
assert(off[totbins()-1]==off[totbins()-2]);
}

constexpr auto size() const { return uint32_t(off[totbins()-1]);}
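The net effect is a counting sort over bins: count() accumulates per-bin multiplicities directly in off, finalize() turns them into an inclusive prefix sum, and fill() decrements off[b] before writing, so that once every element is placed off[b] again points at the first entry of bin b. A tiny host-side analogue of that logic in plain C++ (not CMSSW code; array sizes are simplified):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  constexpr int nbins = 4;
  std::array<uint32_t, 5> binOf{{1, 3, 1, 0, 3}};        // bin of each element
  std::array<uint32_t, nbins> off{};                      // launchZero: all offsets start at 0
  for (auto b : binOf) ++off[b];                          // count():    off = {1,2,0,2}
  for (int i = 1; i < nbins; ++i) off[i] += off[i - 1];   // finalize(): off = {1,3,3,5}
  std::array<uint32_t, 5> bins{};
  for (uint32_t j = 0; j < binOf.size(); ++j) {           // fill(): decrement, then place
    auto w = --off[binOf[j]];
    bins[w] = j;                                          // element indices, grouped by bin
  }
  // off[b] is now the begin of bin b: {0, 1, 3, 3}; bins = {3, 2, 0, 1, 4}
  assert(off[0] == 0 && off[1] == 1 && off[2] == 3 && off[3] == 3);
  return 0;
}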
Expand All @@ -245,4 +323,13 @@ class HistoContainer {
index_type bins[capacity()];
};



template<
typename I, // type stored in the container (usually an index in a vector of the input values)
uint32_t MAXONES, // max number of "ones"
uint32_t MAXMANYS // max number of "manys"
>
using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;

#endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h
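The OneToManyAssoc alias ties the pieces together: a kernel can append variable-length groups with bulkFill(), which uses an AtomicPairCounter to reserve one offset slot and a contiguous index range per group, and a later finalizeBulk() launch (from the cudautils section above) pads the remaining offsets from the counter totals. A hedged sketch (association size, buffer layout and kernel are illustrative, not part of the commit):

// Illustration only: associate up to 1024 "ones" with at most 8192 stored indices.
using Assoc = OneToManyAssoc<uint16_t, 1024, 8192>;

__global__ void fillGroups(Assoc *assoc, AtomicPairCounter *apc,
                           uint16_t const *items,      // hypothetical: 64 slots per group
                           uint32_t const *groupSize, uint32_t nGroups) {
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= nGroups) return;
  uint16_t local[64];
  uint32_t n = groupSize[i];
  for (uint32_t j = 0; j < n; ++j) local[j] = items[64 * i + j];
  assoc->bulkFill(*apc, local, n);   // one atomic reserves the offset slot and the index range
}

// afterwards, on the host (blocks/threads chosen to cover the offset array):
//   cudautils::finalizeBulk<<<blocks, threads, 0, stream>>>(d_apc, d_assoc);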