Commit

Full workflow from raw data to pixel tracks and vertices on GPUs (#216)
Port the full workflow from pixel raw data to pixel tracks and vertices to GPUs, and optimise it.
Clean the pixel n-tuplets with the "fishbone" algorithm (only on GPUs).

Other changes:
  - recover the Riemann fit updates lost during the merge with CMSSW 10.4.x;
  - speed up clustering and track fitting;
  - fix a minor bug to avoid a trivial regression with the optimised fit.
VinInn authored and fwyzard committed Jan 9, 2019
1 parent c258a04 commit 87ad758
Showing 76 changed files with 4,622 additions and 2,461 deletions.
61 changes: 61 additions & 0 deletions Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
@@ -2,6 +2,7 @@
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h

#include <cstdint>
#include <array>

namespace phase1PixelTopology {

@@ -29,6 +30,66 @@ namespace phase1PixelTopology {
};


template<class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
-> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
{
return {{ f(Indices)... }};
}

template<int N, class Function>
constexpr auto map_to_array(Function f)
-> std::array<typename std::result_of<Function(std::size_t)>::type, N>
{
return map_to_array_helper(f, std::make_index_sequence<N>{});
}


constexpr uint32_t findMaxModuleStride() {
bool go = true;
int n=2;
while (go) {
for (uint8_t i=1; i<11; ++i) {
if (layerStart[i]%n !=0) {go=false; break;}
}
if(!go) break;
n*=2;
}
return n/2;
}

constexpr uint32_t maxModuleStride = findMaxModuleStride();


constexpr uint8_t findLayer(uint32_t detId) {
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}

constexpr uint8_t findLayerFromCompact(uint32_t detId) {
detId*=maxModuleStride;
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}


constexpr uint32_t layerIndexSize = numberOfModules/maxModuleStride;
constexpr std::array<uint8_t,layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);

constexpr bool validateLayerIndex() {
bool res=true;
for (auto i=0U; i<numberOfModules; ++i) {
auto j = i/maxModuleStride;
res &=(layer[j]<10);
res &=(i>=layerStart[layer[j]]);
res &=(i<layerStart[layer[j]+1]);
}
return res;
}

static_assert(validateLayerIndex(),"layer from detIndex algo is buggy");


// this is for the ROC n<512 (upgrade 1024)
constexpr inline
uint16_t divu52(uint16_t n) {
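The map_to_array helper above fills a std::array at compile time by evaluating a constexpr function at every index, which is how the layer lookup table is generated from findLayerFromCompact. A minimal, self-contained sketch of the same pattern (not part of this commit; the square function is a hypothetical stand-in):

#include <array>
#include <cstddef>
#include <type_traits>
#include <utility>

template<class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
  -> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
{
  return {{ f(Indices)... }};
}

template<int N, class Function>
constexpr auto map_to_array(Function f)
  -> std::array<typename std::result_of<Function(std::size_t)>::type, N>
{
  return map_to_array_helper(f, std::make_index_sequence<N>{});
}

// hypothetical stand-in for findLayerFromCompact
constexpr int square(std::size_t i) { return int(i * i); }

// the whole table is computed by the compiler, like the 'layer' array above
constexpr auto squares = map_to_array<8>(square);
static_assert(squares[5] == 25, "table filled at compile time");

int main() { return 0; }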
@@ -141,5 +141,13 @@ int main() {
assert(std::get<1>(ori)==bp);
}

using namespace phase1PixelTopology;
for (auto i=0U; i<numberOfModules; ++i) {
assert(layer[i]<10);
assert(i>=layerStart[layer[i]]);
assert(i<layerStart[layer[i]+1]);
}


return 0;
}
54 changes: 54 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h
@@ -0,0 +1,54 @@
#ifndef HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H
#define HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H

#include <cuda_runtime.h>
#include <cstdint>

class AtomicPairCounter {
public:

using c_type = unsigned long long int;

AtomicPairCounter(){}
AtomicPairCounter(c_type i) { counter.ac=i;}

__device__ __host__
AtomicPairCounter & operator=(c_type i) { counter.ac=i; return *this;}

struct Counters {
uint32_t n; // in a "One to Many" association is the number of "One"
uint32_t m; // in a "One to Many" association is the total number of associations
};

union Atomic2 {
Counters counters;
c_type ac;
};

#ifdef __CUDACC__

static constexpr c_type incr = 1UL<<32;

__device__ __host__
Counters get() const { return counter.counters;}

// increment n by 1 and m by i. return previous value
__device__
Counters add(uint32_t i) {
c_type c = i;
c+=incr;
Atomic2 ret;
ret.ac = atomicAdd(&counter.ac,c);
return ret.counters;
}

#endif

private:

Atomic2 counter;

};


#endif
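AtomicPairCounter packs two 32-bit counters into one 64-bit word, so a single atomicAdd reserves both one slot in an offset array and a contiguous range in a content array; this is how HistoContainer::bulkFill uses it further down. A hedged kernel sketch of the pattern (buffer layout and names are illustrative, not from this commit):

// Illustration only: append variable-length lists into flat (offsets, content) buffers.
__global__ void appendLists(AtomicPairCounter *apc,
                            uint32_t const *input, uint32_t const *inputSize,
                            uint32_t *offsets, uint32_t *content, uint32_t nLists) {
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= nLists) return;
  auto n = inputSize[i];
  auto c = apc->add(n);        // one atomic: reserve 1 offset slot and n content slots
  offsets[c.m] = c.n;          // this list starts at position c.n of 'content'
  for (uint32_t j = 0; j < n; ++j)
    content[c.n + j] = input[16 * i + j];   // assumes at most 16 inputs per list
}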
3 changes: 2 additions & 1 deletion HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h
@@ -80,7 +80,8 @@ template <class T> struct SimpleVector {
}

#endif // __CUDACC__

inline constexpr bool empty() const { return m_size==0;}
inline constexpr bool full() const { return m_size==m_capacity;}
inline constexpr T& operator[](int i) { return m_data[i]; }
inline constexpr const T& operator[](int i) const { return m_data[i]; }
inline constexpr void reset() { m_size = 0; }
4 changes: 4 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
@@ -80,6 +80,10 @@ template <class T, int maxSize> struct VecArray {
return T();
}

inline constexpr T const * begin() const { return m_data;}
inline constexpr T const * end() const { return m_data+m_size;}
inline constexpr T * begin() { return m_data;}
inline constexpr T * end() { return m_data+m_size;}
inline constexpr int size() const { return m_size; }
inline constexpr T& operator[](int i) { return m_data[i]; }
inline constexpr const T& operator[](int i) const { return m_data[i]; }
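The begin()/end() accessors added here make VecArray usable in range-based for loops on both host and device; a small hedged example (the device function is illustrative, not from this commit):

__device__ float sumVec(VecArray<float, 32> const &va) {
  float s = 0.f;
  for (auto x : va)   // iterates m_data[0 .. m_size) through the new begin()/end()
    s += x;
  return s;
}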
135 changes: 111 additions & 24 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -19,6 +19,8 @@
#ifdef __CUDACC__
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#endif
#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"


#ifdef __CUDACC__
namespace cudautils {
@@ -38,36 +40,51 @@

template<typename Histo, typename T>
__global__
void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets,
uint32_t * __restrict__ ws ) {
void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= offsets[nh]) return;
auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < nh);
(*h).fill(v[i], i, ws, ih);
(*h).fill(v[i], i, ih);
}

template<typename Histo>
void launchZero(Histo * __restrict__ h, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
cudaMemsetAsync(off,0, 4*Histo::totbins(),stream);
}

template<typename Histo>
void launchFinalize(Histo * __restrict__ h, uint8_t * __restrict__ ws, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
size_t wss = Histo::wsSize();
CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
}


template<typename Histo, typename T>
void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws,
void fillManyFromVector(Histo * __restrict__ h, uint8_t * __restrict__ ws,
uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize,
int nthreads, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
cudaMemsetAsync(off,0, 4*Histo::totbins(),stream);
launchZero(h,stream);
auto nblocks = (totSize + nthreads - 1) / nthreads;
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
size_t wss = Histo::totbins();
CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets,ws);
launchFinalize(h,ws,stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
}


template<typename Assoc>
__global__
void finalizeBulk(AtomicPairCounter const * apc, Assoc * __restrict__ assoc) {
assoc->bulkFinalizeFill(*apc);
}

} // namespace cudautils
#endif

@@ -149,8 +166,8 @@ class HistoContainer {
uint32_t * v =nullptr;
void * d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1);
return std::max(temp_storage_bytes,size_t(totbins()));
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
return temp_storage_bytes;
}
#endif

@@ -176,22 +193,79 @@
#endif
}

static __host__ __device__
__forceinline__
uint32_t atomicDecrement(Counter & x) {
#ifdef __CUDA_ARCH__
return atomicSub(&x, 1);
#else
return x--;
#endif
}

__host__ __device__
__forceinline__
void countDirect(T b) {
assert(b<nbins());
atomicIncrement(off[b]);
}

__host__ __device__
__forceinline__
void fillDirect(T b, index_type j) {
assert(b<nbins());
auto w = atomicDecrement(off[b]);
assert(w>0);
bins[w-1] = j;
}


#ifdef __CUDACC__
__device__
__forceinline__
uint32_t bulkFill(AtomicPairCounter & apc, index_type const * v, uint32_t n) {
auto c = apc.add(n);
off[c.m] = c.n;
for(int j=0; j<n; ++j) bins[c.n+j]=v[j];
return c.m;
}

__device__
__forceinline__
void bulkFinalize(AtomicPairCounter const & apc) {
off[apc.get().m]=apc.get().n;
}

__device__
__forceinline__
void bulkFinalizeFill(AtomicPairCounter const & apc) {
auto m = apc.get().m;
auto n = apc.get().n;
auto i = m + blockIdx.x * blockDim.x + threadIdx.x;
if (i>=totbins()) return;
off[i]=n;
}


#endif


__host__ __device__
__forceinline__
void count(T t) {
uint32_t b = bin(t);
assert(b<nbins());
atomicIncrement(off[b+1]);
atomicIncrement(off[b]);
}

__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws) {
void fill(T t, index_type j) {
uint32_t b = bin(t);
assert(b<nbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
auto w = atomicDecrement(off[b]);
assert(w>0);
bins[w-1] = j;
}


Expand All @@ -202,31 +276,35 @@ class HistoContainer {
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
atomicIncrement(off[b+1]);
atomicIncrement(off[b]);
}

__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws, uint32_t nh) {
void fill(T t, index_type j, uint32_t nh) {
uint32_t b = bin(t);
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
auto w = atomicDecrement(off[b]);
assert(w>0);
bins[w-1] = j;
}

#ifdef __CUDACC__
__device__
__forceinline__
void finalize(Counter * ws) {
blockPrefixScan(off+1,totbins()-1,ws);
assert(off[totbins()-1]==0);
blockPrefixScan(off,totbins(),ws);
assert(off[totbins()-1]==off[totbins()-2]);
}
__host__
#endif
void finalize() {
for(uint32_t i=2; i<totbins(); ++i) off[i]+=off[i-1];
assert(off[totbins()-1]==0);
for(uint32_t i=1; i<totbins(); ++i) off[i]+=off[i-1];
assert(off[totbins()-1]==off[totbins()-2]);
}

constexpr auto size() const { return uint32_t(off[totbins()-1]);}
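The net effect is a counting sort over bins: count() accumulates per-bin multiplicities directly in off, finalize() turns them into an inclusive prefix sum, and fill() decrements off[b] before writing, so that once every element is placed off[b] again points at the first entry of bin b. A tiny host-side analogue of that logic in plain C++ (not CMSSW code; array sizes are simplified):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  constexpr int nbins = 4;
  std::array<uint32_t, 5> binOf{{1, 3, 1, 0, 3}};        // bin of each element
  std::array<uint32_t, nbins> off{};                      // launchZero: all offsets start at 0
  for (auto b : binOf) ++off[b];                          // count():    off = {1,2,0,2}
  for (int i = 1; i < nbins; ++i) off[i] += off[i - 1];   // finalize(): off = {1,3,3,5}
  std::array<uint32_t, 5> bins{};
  for (uint32_t j = 0; j < binOf.size(); ++j) {           // fill(): decrement, then place
    auto w = --off[binOf[j]];
    bins[w] = j;                                          // element indices, grouped by bin
  }
  // off[b] is now the begin of bin b: {0, 1, 3, 3}; bins = {3, 2, 0, 1, 4}
  assert(off[0] == 0 && off[1] == 1 && off[2] == 3 && off[3] == 3);
  return 0;
}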
Expand All @@ -245,4 +323,13 @@ class HistoContainer {
index_type bins[capacity()];
};



template<
typename I, // type stored in the container (usually an index in a vector of the input values)
uint32_t MAXONES, // max number of "ones"
uint32_t MAXMANYS // max number of "manys"
>
using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;

#endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h
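The OneToManyAssoc alias ties the pieces together: a kernel can append variable-length groups with bulkFill(), which uses an AtomicPairCounter to reserve one offset slot and a contiguous index range per group, and a later finalizeBulk() launch (from the cudautils section above) pads the remaining offsets from the counter totals. A hedged sketch (association size, buffer layout and kernel are illustrative, not part of the commit):

// Illustration only: associate up to 1024 "ones" with at most 8192 stored indices.
using Assoc = OneToManyAssoc<uint16_t, 1024, 8192>;

__global__ void fillGroups(Assoc *assoc, AtomicPairCounter *apc,
                           uint16_t const *items,      // hypothetical: 64 slots per group
                           uint32_t const *groupSize, uint32_t nGroups) {
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= nGroups) return;
  uint16_t local[64];
  uint32_t n = groupSize[i];
  for (uint32_t j = 0; j < n; ++j) local[j] = items[64 * i + j];
  assoc->bulkFill(*apc, local, n);   // one atomic reserves the offset slot and the index range
}

// afterwards, on the host (blocks/threads chosen to cover the offset array):
//   cudautils::finalizeBulk<<<blocks, threads, 0, stream>>>(d_apc, d_assoc);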