Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

faster track fit and faster clustering #216

Merged
merged 104 commits into from
Jan 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
104 commits
Select commit Hold shift + click to select a range
d15d0dd
use gpu vertices
VinInn Oct 5, 2018
fc8ffad
add vertex splitting
VinInn Oct 5, 2018
9e3a3aa
fix iterations
VinInn Oct 5, 2018
c9418f6
apply outlier rejection, tune error
VinInn Oct 7, 2018
5e9d7cf
fix duplicate cleaning
VinInn Oct 19, 2018
2e7910f
sort and clean
VinInn Oct 20, 2018
2d155f1
fishbone works
VinInn Oct 26, 2018
aad5235
fishbone works
VinInn Oct 26, 2018
3244703
fishbone works
VinInn Oct 26, 2018
d6508a8
add layerid
VinInn Oct 28, 2018
e2fd6d2
copy layer on gpu
VinInn Oct 28, 2018
c68f413
efficient
VinInn Oct 29, 2018
9dc2184
optimize parallelization
VinInn Oct 29, 2018
b3ed9d0
update notebook to include fishbone
VinInn Oct 31, 2018
cc973f6
silence it
VinInn Oct 31, 2018
06365df
mark magic 2
VinInn Oct 31, 2018
f2439af
remove magic 256, reduce it to 128
VinInn Oct 31, 2018
ae7fc3f
reduce size
VinInn Oct 31, 2018
9030763
remove duplicate code lines
VinInn Oct 31, 2018
d79faf5
narrow cut to avoid inefficiency for realistic
VinInn Nov 1, 2018
376a0d4
Merged gpuVertexRedux from repository VinInn with cms-merge-topic
VinInn Nov 1, 2018
ad06e33
build pentuplets
VinInn Nov 2, 2018
7bea72e
simplify
VinInn Nov 3, 2018
932fec9
align to offline
VinInn Nov 4, 2018
d0f3adf
simplify histogrammer: no need of ws in fill
VinInn Nov 4, 2018
6a192fd
test cuda_assert
VinInn Nov 5, 2018
fce72dc
use more stable and gpu friendly version of circle
VinInn Nov 5, 2018
f7dbc25
assoc tested
VinInn Nov 6, 2018
483d591
check cosdir
VinInn Nov 6, 2018
384465e
clean code
VinInn Nov 6, 2018
ada49bd
try to use template errors
VinInn Nov 7, 2018
6acfd9f
retune but still use old params
VinInn Nov 7, 2018
45f65b9
add AtomicPairCounter and implement manyToOne
VinInn Nov 8, 2018
7ac1cf3
tuning cuts
VinInn Nov 9, 2018
2d8e41b
few steps toward persistent gputracks, crashes
VinInn Nov 9, 2018
0adee78
Q productions works
VinInn Nov 9, 2018
0198e52
forward hits
VinInn Nov 10, 2018
fd8f49c
compiles
VinInn Nov 11, 2018
fd49d01
runs
VinInn Nov 11, 2018
aef70eb
use less memory
VinInn Nov 11, 2018
545a326
use even less memory
VinInn Nov 11, 2018
511aa99
add quality flag
VinInn Nov 11, 2018
de2333d
factorize
VinInn Nov 12, 2018
2a2ab2b
factorize
VinInn Nov 12, 2018
e028fd1
reading tuples correctly
VinInn Nov 12, 2018
3064dd9
read hits
VinInn Nov 12, 2018
e64286e
Add B-hadron MTV variation to pixel track validation sequence
makortel Nov 12, 2018
5695203
fix errors on gpu
VinInn Nov 12, 2018
bca3120
tip/zip ok
VinInn Nov 13, 2018
9367675
use new version of Riemann fit
VinInn Nov 13, 2018
76cfce2
fix error^2
VinInn Nov 13, 2018
433d1cd
fix pixel errors
VinInn Nov 13, 2018
3bd5c2e
use error from templates
VinInn Nov 13, 2018
6899cd0
dup remover written
VinInn Nov 13, 2018
ae195a2
filter duplicates
VinInn Nov 13, 2018
e135f21
make sure algo is stable
VinInn Nov 14, 2018
a605664
Merged mtvBhadronPixel from repository makortel with cms-merge-topic
VinInn Nov 14, 2018
5763c56
fix for absent lape
VinInn Nov 14, 2018
7ea1bb1
drop quads if sharing cell with pents
VinInn Nov 14, 2018
dd6ad51
add region cuts
VinInn Nov 14, 2018
34989f3
merged, refactorize
VinInn Nov 16, 2018
8cc7562
merged, refactorize
VinInn Nov 16, 2018
5cafde2
Merged gpuTracks104 from repository VinInn with cms-merge-topic
VinInn Nov 16, 2018
5ed6161
back to previous status
VinInn Nov 16, 2018
572613b
prepare vertex finder to read from gpu
VinInn Nov 17, 2018
2797693
produce vertices
VinInn Nov 17, 2018
aa0e4ad
make vertices on gpu only: not scheduled..
VinInn Nov 17, 2018
989019e
make profiling working
VinInn Nov 17, 2018
db66a14
minor cleanup
VinInn Nov 17, 2018
79fd0ae
silenced
VinInn Nov 18, 2018
68f9162
solve conflict
VinInn Nov 18, 2018
ddad076
resize to avoid overflows
VinInn Nov 18, 2018
a6e3e7d
protect and report cell overflow as well
VinInn Nov 19, 2018
0935c5e
more cleanup
VinInn Nov 19, 2018
e695b29
remove all cpu stuff from CA on gpu
VinInn Nov 19, 2018
b2445f2
fix gpu only wf
VinInn Nov 19, 2018
fb73c7a
Address code style and quality issues (#203)
fwyzard Nov 28, 2018
6110cf4
Fix MTV validation of initialStepPreSplitting tracks and add B-hadron…
makortel Nov 29, 2018
f11b911
Merge branch 'CMSSW_10_4_X_Patatrack' into gpuTracks104
VinInn Nov 29, 2018
77bd114
Fix Free issues
VinInn Nov 29, 2018
ef869f5
Merge branch 'CMSSW_10_4_X_Patatrack' into gpuTracks104
VinInn Nov 29, 2018
94b521e
Remove stray empty lines for consistency with upstream
fwyzard Dec 3, 2018
54f759e
Merged gpuTracks104 from repository VinInn with cms-merge-topic
VinInn Dec 3, 2018
76a4ae9
add test, address first set of comments
VinInn Dec 3, 2018
ce143ca
more comments addressed
VinInn Dec 3, 2018
13b7277
silenced
VinInn Dec 3, 2018
c455a96
now works
VinInn Dec 4, 2018
6d9379f
test of fit on gpu works
VinInn Dec 4, 2018
d3cc0b4
Merge branch 'CMSSW_10_4_X_Patatrack' of https://github.com/cms-patat…
VinInn Dec 7, 2018
522cfdf
late fishbone
VinInn Dec 7, 2018
c8623a5
make fishbone configurable
VinInn Dec 8, 2018
807d794
Merge branch 'CMSSW_10_4_X_Patatrack' into gpuTracks104
VinInn Dec 8, 2018
1d517e8
Merged gpuTracks104 from repository VinInn with cms-merge-topic
VinInn Dec 9, 2018
909cfb0
fix long standing bug (minor effect)
VinInn Dec 10, 2018
b8b4299
compiles and run
VinInn Dec 11, 2018
0c55b56
lit fit is wrong
VinInn Dec 11, 2018
811cb63
fix stupid bug
VinInn Dec 11, 2018
47d28a7
remove default arg
VinInn Dec 11, 2018
ff31eaf
debug occupancy
VinInn Dec 12, 2018
5ee726d
faster clustering
VinInn Dec 12, 2018
31584e9
apply to vertex as well
VinInn Dec 12, 2018
002a576
clean assert
VinInn Dec 13, 2018
49df121
fix missing Free
VinInn Dec 14, 2018
4095225
Silence initcheck reports
fwyzard Jan 8, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h

#include <cstdint>
#include <array>

namespace phase1PixelTopology {

Expand Down Expand Up @@ -29,6 +30,66 @@ namespace phase1PixelTopology {
};


// Build a std::array at compile time by evaluating f(0), f(1), ..., f(N-1).
// Implementation detail: expands the index pack produced by make_index_sequence.
template<class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
  -> std::array<std::invoke_result_t<Function, std::size_t>, sizeof...(Indices)>
{
  return {{ f(Indices)... }};
}

// Return std::array<R, N> whose element i equals f(i), where R is the result
// of invoking f with a std::size_t argument.  Usable in constexpr context
// provided f itself is constexpr-invocable.
// NOTE: std::invoke_result_t replaces std::result_of, which is deprecated
// since C++17 (and removed in C++20).
template<int N, class Function>
constexpr auto map_to_array(Function f)
  -> std::array<std::invoke_result_t<Function, std::size_t>, N>
{
  return map_to_array_helper(f, std::make_index_sequence<N>{});
}


// Largest power of two that evenly divides every layer boundary in
// layerStart[1..10]; modules can then be addressed in "compact" units
// of this stride.
constexpr uint32_t findMaxModuleStride() {
  uint32_t stride = 1;
  for (bool allDivide = true; allDivide;) {
    auto candidate = 2 * stride;
    for (uint8_t k = 1; k < 11; ++k) {
      if (layerStart[k] % candidate != 0) {
        allDivide = false;
        break;
      }
    }
    if (allDivide) stride = candidate;
  }
  return stride;
}

constexpr uint32_t maxModuleStride = findMaxModuleStride();


// Map a module index (detId) to the first layer whose upper bound exceeds it;
// returns 11 when the index is beyond the last boundary.
constexpr uint8_t findLayer(uint32_t detId) {
  uint8_t l = 0;
  while (l < 11 && detId >= layerStart[l + 1]) ++l;
  return l;
}

// Same lookup as findLayer, but the argument is a compact index
// (module index divided by maxModuleStride): expand it back first.
constexpr uint8_t findLayerFromCompact(uint32_t detId) {
  auto expanded = detId * maxModuleStride;
  uint8_t l = 0;
  while (l < 11 && expanded >= layerStart[l + 1]) ++l;
  return l;
}


constexpr uint32_t layerIndexSize = numberOfModules/maxModuleStride;
constexpr std::array<uint8_t,layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);

// Compile-time self-check of the compact "layer" lookup table: for every
// module index, the table entry must be < 10 and the index must lie inside
// the [layerStart[l], layerStart[l+1]) range of the layer it maps to.
constexpr bool validateLayerIndex() {
bool res=true;
for (auto i=0U; i<numberOfModules; ++i) {
auto j = i/maxModuleStride;          // compact index into the layer table
res &=(layer[j]<10);                 // lookup result must be a valid layer id
res &=(i>=layerStart[layer[j]]);     // module not before its layer's start
res &=(i<layerStart[layer[j]+1]);    // module before the next layer's start
}
return res;
}

static_assert(validateLayerIndex(),"layer from detIndex algo is buggy");


// this is for the ROC n<512 (upgrade 1024)
constexpr inline
uint16_t divu52(uint16_t n) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,5 +141,13 @@ int main() {
assert(std::get<1>(ori)==bp);
}

using namespace phase1PixelTopology;
for (auto i=0U; i<numberOfModules; ++i) {
assert(layer[i]<10);
assert(i>=layerStart[layer[i]]);
assert(i<layerStart[layer[i]+1]);
}


return 0;
}
54 changes: 54 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H
#define HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H

#include <cuda_runtime.h>
#include <cstdint>

// A pair of 32-bit counters packed into a single 64-bit word so that both can
// be updated with one atomicAdd.  Used to fill "One to Many" associations in
// bulk (see HistoContainer::bulkFill).
class AtomicPairCounter {
public:

  using c_type = unsigned long long int;

  AtomicPairCounter(){}
  AtomicPairCounter(c_type i) { counter.ac=i;}

  __device__ __host__
  AtomicPairCounter & operator=(c_type i) { counter.ac=i; return *this;}

  // NOTE(review): with Counters as the first union member, n overlays the LOW
  // 32 bits of ac on little-endian targets, so add(i) bumps n by i and m by 1;
  // callers (e.g. bulkFill: off[c.m] = c.n) rely on exactly that.  The
  // original member comments had the two roles swapped.
  struct Counters {
    uint32_t n; // in a "One to Many" association: total number of associations
    uint32_t m; // in a "One to Many" association: the number of "One"
  };

  // Union giving both the packed 64-bit view (for the atomic op) and the
  // two-counter view of the same storage.
  union Atomic2 {
    Counters counters;
    c_type ac;
  };

#ifdef __CUDACC__

  // One unit in the high 32 bits, i.e. one increment of m.
  // FIX: 1ULL, not 1UL — unsigned long is only 32 bits on LLP64 platforms
  // (e.g. 64-bit Windows), where 1UL<<32 is undefined behaviour.
  static constexpr c_type incr = 1ULL<<32;

  __device__ __host__
  Counters get() const { return counter.counters;}

  // increment m by 1 and n by i, with a single 64-bit atomic add;
  // return the previous value of both counters.
  __device__
  Counters add(uint32_t i) {
    c_type c = i;
    c+=incr;
    Atomic2 ret;
    ret.ac = atomicAdd(&counter.ac,c);
    return ret.counters;
  }

#endif

private:

  Atomic2 counter;

};


#endif
3 changes: 2 additions & 1 deletion HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ template <class T> struct SimpleVector {
}

#endif // __CUDACC__

inline constexpr bool empty() const { return m_size==0;}
inline constexpr bool full() const { return m_size==m_capacity;}
inline constexpr T& operator[](int i) { return m_data[i]; }
inline constexpr const T& operator[](int i) const { return m_data[i]; }
inline constexpr void reset() { m_size = 0; }
Expand Down
4 changes: 4 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ template <class T, int maxSize> struct VecArray {
return T();
}

inline constexpr T const * begin() const { return m_data;}
inline constexpr T const * end() const { return m_data+m_size;}
inline constexpr T * begin() { return m_data;}
inline constexpr T * end() { return m_data+m_size;}
inline constexpr int size() const { return m_size; }
inline constexpr T& operator[](int i) { return m_data[i]; }
inline constexpr const T& operator[](int i) const { return m_data[i]; }
Expand Down
135 changes: 111 additions & 24 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#ifdef __CUDACC__
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#endif
#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"


#ifdef __CUDACC__
namespace cudautils {
Expand All @@ -38,36 +40,51 @@ namespace cudautils {

template<typename Histo, typename T>
__global__
void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets,
uint32_t * __restrict__ ws ) {
void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= offsets[nh]) return;
auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < nh);
(*h).fill(v[i], i, ws, ih);
(*h).fill(v[i], i, ih);
}

// Asynchronously zero the histogram's offset array in device memory.
// The array is located via offsetof so only its address, not the full Histo
// layout, is needed here.
template<typename Histo>
void launchZero(Histo * __restrict__ h, cudaStream_t stream) {
  uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
  // FIX: spell out the element size instead of the magic constant 4.
  cudaMemsetAsync(off, 0, sizeof(uint32_t)*Histo::totbins(), stream);
}

// Turn the per-bin counts accumulated on the device into cumulative offsets
// with an in-place cub inclusive prefix scan.  ws is caller-provided device
// scratch space, presumably sized with Histo::wsSize() — confirm at call site.
template<typename Histo>
void launchFinalize(Histo * __restrict__ h, uint8_t * __restrict__ ws, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));  // device address of the offset array
size_t wss = Histo::wsSize();  // cub temp-storage size requirement
CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));  // scan off -> off
}


template<typename Histo, typename T>
void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws,
void fillManyFromVector(Histo * __restrict__ h, uint8_t * __restrict__ ws,
uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize,
int nthreads, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
cudaMemsetAsync(off,0, 4*Histo::totbins(),stream);
launchZero(h,stream);
auto nblocks = (totSize + nthreads - 1) / nthreads;
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
size_t wss = Histo::totbins();
CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets,ws);
launchFinalize(h,ws,stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
}


// Kernel wrapper: finalize a bulk-filled association by delegating to
// Assoc::bulkFinalizeFill, which distributes the work over the launch grid.
template<typename Assoc>
__global__
void finalizeBulk(AtomicPairCounter const * apc, Assoc * __restrict__ assoc) {
assoc->bulkFinalizeFill(*apc);
}

} // namespace cudautils
#endif

Expand Down Expand Up @@ -149,8 +166,8 @@ class HistoContainer {
uint32_t * v =nullptr;
void * d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1);
return std::max(temp_storage_bytes,size_t(totbins()));
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
return temp_storage_bytes;
}
#endif

Expand All @@ -176,22 +193,79 @@ class HistoContainer {
#endif
}

// Atomically decrement x and return its PREVIOUS value.
// On the device this maps to atomicSub; the host fallback is a plain
// post-decrement — NOTE(review): not thread-safe on the host, so host-side
// filling is presumably single-threaded.
static __host__ __device__
__forceinline__
uint32_t atomicDecrement(Counter & x) {
#ifdef __CUDA_ARCH__
return atomicSub(&x, 1);
#else
return x--;
#endif
}

// Increment the occupancy of bin b, where b is an already-computed bin index
// (no value-to-bin mapping is applied, unlike count(T t)).
__host__ __device__
__forceinline__
void countDirect(T b) {
assert(b<nbins());
atomicIncrement(off[b]);
}

// Store index j into bin b (bin index supplied by the caller).
// off[b] holds the bin's current end offset after the prefix scan; each fill
// atomically decrements it and writes into the freed slot, so entries land
// back-to-front within the bin.
__host__ __device__
__forceinline__
void fillDirect(T b, index_type j) {
assert(b<nbins());
auto w = atomicDecrement(off[b]);
assert(w>0);   // would underflow if fills exceed the earlier counts
bins[w-1] = j;
}


#ifdef __CUDACC__
// Append n indices from v as one new association entry: atomically reserve
// one "One" slot and n association slots via apc, record this entry's start
// offset, and copy the indices.  Returns the index of the new entry.
__device__
__forceinline__
uint32_t bulkFill(AtomicPairCounter & apc, index_type const * v, uint32_t n) {
auto c = apc.add(n);
off[c.m] = c.n;  // start offset of entry c.m
// FIX: loop index was a signed int compared against the unsigned n.
for (uint32_t j = 0; j < n; ++j) bins[c.n + j] = v[j];
return c.m;
}

// Close a bulk-filled association: write the grand total of stored indices
// (apc.n) as the terminating offset of the last used entry (apc.m).
__device__
__forceinline__
void bulkFinalize(AtomicPairCounter const & apc) {
off[apc.get().m]=apc.get().n;
}

// Grid-parallel variant of bulkFinalize: pad every trailing offset in
// [m, totbins) with the total count n, one element per thread.
__device__
__forceinline__
void bulkFinalizeFill(AtomicPairCounter const & apc) {
auto m = apc.get().m;   // first unused entry
auto n = apc.get().n;   // total number of stored indices
auto i = m + blockIdx.x * blockDim.x + threadIdx.x;
if (i>=totbins()) return;   // threads past the end do nothing
off[i]=n;
}


#endif


__host__ __device__
__forceinline__
void count(T t) {
uint32_t b = bin(t);
assert(b<nbins());
atomicIncrement(off[b+1]);
atomicIncrement(off[b]);
}

__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws) {
void fill(T t, index_type j) {
uint32_t b = bin(t);
assert(b<nbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
auto w = atomicDecrement(off[b]);
assert(w>0);
bins[w-1] = j;
}


Expand All @@ -202,31 +276,35 @@ class HistoContainer {
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
atomicIncrement(off[b+1]);
atomicIncrement(off[b]);
}

__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws, uint32_t nh) {
void fill(T t, index_type j, uint32_t nh) {
uint32_t b = bin(t);
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
auto w = atomicDecrement(off[b]);
assert(w>0);
bins[w-1] = j;
}

#ifdef __CUDACC__
__device__
__forceinline__
void finalize(Counter * ws) {
blockPrefixScan(off+1,totbins()-1,ws);
assert(off[totbins()-1]==0);
blockPrefixScan(off,totbins(),ws);
assert(off[totbins()-1]==off[totbins()-2]);
}
__host__
#endif
void finalize() {
for(uint32_t i=2; i<totbins(); ++i) off[i]+=off[i-1];
assert(off[totbins()-1]==0);
for(uint32_t i=1; i<totbins(); ++i) off[i]+=off[i-1];
assert(off[totbins()-1]==off[totbins()-2]);
}

constexpr auto size() const { return uint32_t(off[totbins()-1]);}
Expand All @@ -245,4 +323,13 @@ class HistoContainer {
index_type bins[capacity()];
};



// A "One to Many" association expressed as a HistoContainer: up to MAXONES
// "ones", up to MAXMANYS stored items of type I in total.
template<
typename I, // type stored in the container (usually an index in a vector of the input values)
uint32_t MAXONES, // max number of "ones"
uint32_t MAXMANYS // max number of "manys"
>
using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;

#endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h
Loading