Skip to content

Commit

Permalink
Full workflow from raw data to pixel tracks and vertices on GPUs (#216)
Browse files Browse the repository at this point in the history
Port and optimise the full workflow from pixel raw data to pixel tracks and vertices to GPUs.
Clean the pixel n-tuplets with the "fishbone" algorithm (only on GPUs).

Other changes:
  - recover the Riemann fit updates lost during the merge with CMSSW 10.4.x;
  - speed up clustering and track fitting;
  - minor bug fix to avoid trivial regression with the optimized fit.
  • Loading branch information
VinInn authored and fwyzard committed Oct 20, 2020
1 parent aed142f commit c29d3a5
Show file tree
Hide file tree
Showing 10 changed files with 265 additions and 51 deletions.
61 changes: 61 additions & 0 deletions Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h

#include <cstdint>
#include <array>

namespace phase1PixelTopology {

Expand Down Expand Up @@ -29,6 +30,66 @@ namespace phase1PixelTopology {
};


template<class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
-> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
{
return {{ f(Indices)... }};
}

template<int N, class Function>
constexpr auto map_to_array(Function f)
-> std::array<typename std::result_of<Function(std::size_t)>::type, N>
{
return map_to_array_helper(f, std::make_index_sequence<N>{});
}


constexpr uint32_t findMaxModuleStride() {
bool go = true;
int n=2;
while (go) {
for (uint8_t i=1; i<11; ++i) {
if (layerStart[i]%n !=0) {go=false; break;}
}
if(!go) break;
n*=2;
}
return n/2;
}

constexpr uint32_t maxModuleStride = findMaxModuleStride();


constexpr uint8_t findLayer(uint32_t detId) {
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}

constexpr uint8_t findLayerFromCompact(uint32_t detId) {
detId*=maxModuleStride;
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}


constexpr uint32_t layerIndexSize = numberOfModules/maxModuleStride;
constexpr std::array<uint8_t,layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);

constexpr bool validateLayerIndex() {
bool res=true;
for (auto i=0U; i<numberOfModules; ++i) {
auto j = i/maxModuleStride;
res &=(layer[j]<10);
res &=(i>=layerStart[layer[j]]);
res &=(i<layerStart[layer[j]+1]);
}
return res;
}

static_assert(validateLayerIndex(),"layer from detIndex algo is buggy");


// this is for the ROC n<512 (upgrade 1024)
constexpr inline
uint16_t divu52(uint16_t n) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,5 +141,13 @@ int main() {
assert(std::get<1>(ori)==bp);
}

using namespace phase1PixelTopology;
for (auto i=0U; i<numberOfModules; ++i) {
assert(layer[i]<10);
assert(i>=layerStart[layer[i]]);
assert(i<layerStart[layer[i]+1]);
}


return 0;
}
46 changes: 33 additions & 13 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,9 @@ namespace gpuClustering {
constexpr uint32_t maxPixInModule = 4000;
constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2;
using Hist = HistoContainer<uint16_t,nbins,maxPixInModule,9,uint16_t>;
constexpr auto wss = Hist::totbins();
__shared__ Hist hist;
__shared__ typename Hist::Counter ws[wss];
for (auto j=threadIdx.x; j<Hist::totbins(); j+=blockDim.x) { hist.off[j]=0; ws[j]=0;}
__shared__ typename Hist::Counter ws[32];
for (auto j=threadIdx.x; j<Hist::totbins(); j+=blockDim.x) { hist.off[j]=0;}
__syncthreads();

assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));
Expand All @@ -107,10 +106,10 @@ namespace gpuClustering {
#endif
}
__syncthreads();
hist.finalize(ws);
__syncthreads();
if (threadIdx.x<32) ws[threadIdx.x]=0; // used by prefix scan...
__syncthreads();
hist.finalize(ws);
__syncthreads();
#ifdef GPU_DEBUG
assert(hist.size()==totGood);
if (thisModuleId % 100 == 1)
Expand All @@ -120,7 +119,7 @@ namespace gpuClustering {
for (int i = first; i < msize; i += blockDim.x) {
if (id[i] == InvId) // skip invalid pixels
continue;
hist.fill(y[i],i-firstPixel,ws);
hist.fill(y[i],i-firstPixel);
}

#ifdef CLUS_LIMIT_LOOP
Expand All @@ -134,21 +133,45 @@ namespace gpuClustering {
jmax[k] = hist.end();
#endif

#ifdef GPU_DEBUG
__shared__ int nloops;
nloops=0;
#endif


__syncthreads(); // for hit filling!

#ifdef GPU_DEBUG
// look for anomalous high occupancy
__shared__ uint32_t n40,n60;
n40=n60=0;
__syncthreads();
for (auto j=threadIdx.x; j<Hist::nbins(); j+=blockDim.x) {
if(hist.size(j)>60) atomicAdd(&n60,1);
if(hist.size(j)>40) atomicAdd(&n40,1);
}
__syncthreads();
if (0==threadIdx.x) {
if (n60>0) printf("columns with more than 60 px %d in %d\n",n60,thisModuleId);
else if (n40>0) printf("columns with more than 40 px %d in %d\n",n40,thisModuleId);
}
__syncthreads();
#endif

// for each pixel, look at all the pixels until the end of the module;
// when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
// after the loop, all the pixel in each cluster should have the id equeal to the lowest
// pixel in the cluster ( clus[i] == i ).
bool more = true;
while (__syncthreads_or(more)) {
more = false;
if (1==nloops%2) {
for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
auto p = hist.begin()+j;
auto i = *p + firstPixel;
auto m = clusterId[i];
while (m!=clusterId[m]) m=clusterId[m];
clusterId[i]=m;
}
} else {
more = false;
for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
auto p = hist.begin()+j;
auto i = *p + firstPixel;
Expand All @@ -166,9 +189,7 @@ namespace gpuClustering {
// loop to columns
auto loop = [&](uint16_t const * kk) {
auto m = (*kk)+firstPixel;
#ifdef GPU_DEBUG
assert(m!=i);
#endif
if (std::abs(int(x[m]) - int(x[i])) > 1) return;
// if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1
auto old = atomicMin(&clusterId[m], clusterId[i]);
Expand All @@ -185,9 +206,8 @@ namespace gpuClustering {
++p;
for (;p<e;++p) loop(p);
} // pixel loop
#ifdef GPU_DEBUG
}
if (threadIdx.x==0) ++nloops;
#endif
} // end while

#ifdef GPU_DEBUG
Expand Down
3 changes: 2 additions & 1 deletion RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,12 @@ class PixelCPEBase : public PixelClusterParameterEstimator

struct ClusterParam
{
ClusterParam(){}
ClusterParam(const SiPixelCluster & cl) : theCluster(&cl) {}

virtual ~ClusterParam() = default;

const SiPixelCluster * theCluster;
const SiPixelCluster * theCluster = nullptr;;

//--- Cluster-level quantities (filled in computeAnglesFrom....)
float cotalpha;
Expand Down
4 changes: 4 additions & 0 deletions RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ class PixelCPEFast final : public PixelCPEBase
public:
struct ClusterParamGeneric : ClusterParam
{
ClusterParamGeneric() {}
ClusterParamGeneric(const SiPixelCluster & cl) : ClusterParam(cl){}

// The truncation value pix_maximum is an angle-dependent cutoff on the
// individual pixel signals. It should be applied to all pixels in the
// cluster [signal_i = fminf(signal_i, pixmax)] before the column and row
Expand Down Expand Up @@ -52,6 +54,8 @@ class PixelCPEFast final : public PixelCPEBase

LocalPoint localPosition (DetParam const & theDetParam, ClusterParam & theClusterParam) const override;
LocalError localError (DetParam const & theDetParam, ClusterParam & theClusterParam) const override;

void errorFromTemplates(DetParam const & theDetParam, ClusterParamGeneric & theClusterParam, float qclus) const;

static void
collect_edge_charges(ClusterParam & theClusterParam, //!< input, the cluster
Expand Down
44 changes: 37 additions & 7 deletions RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ namespace pixelCPEforGPU {

float x0,y0,z0; // the vertex in the local coord of the detector

float sx[3], sy[3]; // the errors...

Frame frame;
};

Expand Down Expand Up @@ -208,31 +210,37 @@ namespace pixelCPEforGPU {
}

constexpr inline
void error(CommonParams const & __restrict__ comParams, DetParams const & __restrict__ detParams, ClusParams & cp, uint32_t ic) {
void errorFromSize(CommonParams const & __restrict__ comParams, DetParams const & __restrict__ detParams, ClusParams & cp, uint32_t ic) {
// Edge cluster errors
cp.xerr[ic]= 0.0050;
cp.yerr[ic]= 0.0085;

// FIXME these are errors form Run1
constexpr float xerr_barrel_l1[] = { 0.00115, 0.00120, 0.00088 };
constexpr float xerr_barrel_l1_def = 0.01030;
constexpr float xerr_barrel_l1_def = 0.00200; // 0.01030;
constexpr float yerr_barrel_l1[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
constexpr float yerr_barrel_l1_def = 0.00210;
constexpr float xerr_barrel_ln[] = { 0.00115, 0.00120, 0.00088 };
constexpr float xerr_barrel_ln_def = 0.01030;
constexpr float xerr_barrel_ln_def = 0.00200; // 0.01030;
constexpr float yerr_barrel_ln[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
constexpr float yerr_barrel_ln_def = 0.00210;
constexpr float xerr_endcap[] = { 0.0020, 0.0020 };
constexpr float xerr_endcap_def = 0.0020;
constexpr float yerr_endcap[] = { 0.00210 };
constexpr float yerr_endcap_def = 0.00210;

auto sx = cp.maxRow[ic] - cp.minRow[ic];
auto sy = cp.maxCol[ic] - cp.minCol[ic];

// is edgy ?
bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule;
bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule;
// is one and big?
bool isBig1X = (0==sx) && phase1PixelTopology::isBigPixX(cp.minRow[ic]);
bool isBig1Y = (0==sy) && phase1PixelTopology::isBigPixY(cp.minCol[ic]);

if (not isEdgeX) {
auto sx = cp.maxRow[ic] - cp.minRow[ic];

if (!isEdgeX && !isBig1X ) {
if (not detParams.isBarrel) {
cp.xerr[ic] = sx <std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def;
} else if (detParams.layer == 1) {
Expand All @@ -242,8 +250,7 @@ namespace pixelCPEforGPU {
}
}

if (not isEdgeY) {
auto sy = cp.maxCol[ic] - cp.minCol[ic];;
if (!isEdgeY && !isBig1Y) {
if (not detParams.isBarrel) {
cp.yerr[ic] = sy <std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def;
} else if (detParams.layer == 1) {
Expand All @@ -254,6 +261,29 @@ namespace pixelCPEforGPU {
}
}


constexpr inline
void errorFromDB(CommonParams const & __restrict__ comParams, DetParams const & __restrict__ detParams, ClusParams & cp, uint32_t ic) {
// Edge cluster errors
cp.xerr[ic]= 0.0050f;
cp.yerr[ic]= 0.0085f;

auto sx = cp.maxRow[ic] - cp.minRow[ic];
auto sy = cp.maxCol[ic] - cp.minCol[ic];

// is edgy ?
bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule;
bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule;
// is one and big?
uint32_t ix = (0==sx);
uint32_t iy = (0==sy);
ix+= (0==sx) && phase1PixelTopology::isBigPixX(cp.minRow[ic]);
iy+= (0==sy) && phase1PixelTopology::isBigPixY(cp.minCol[ic]);

if (not isEdgeX) cp.xerr[ic] = detParams.sx[ix];
if (not isEdgeY) cp.yerr[ic] = detParams.sy[iy];
}

}

#endif // RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h
13 changes: 10 additions & 3 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,22 @@ namespace pixelgpudetails {
gpu_.sortIndex_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4);

cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist)));
cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins()));
cudaCheck(cudaMalloc((void **) & gpu_.hws_d, HitsOnGPU::Hist::wsSize()));
cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU)));
gpu_.me_d = gpu_d;
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id()));

// Feels a bit dumb but constexpr arrays are not supported for device code
// TODO: should be moved to EventSetup (or better ideas?)
// Would it be better to use "constant memory"?
cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayerStart_, 11 * sizeof(uint32_t)));
cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id()));
cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayer_, phase1PixelTopology::layer.size() * sizeof(uint8_t)));
cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayer_, phase1PixelTopology::layer.data(), phase1PixelTopology::layer.size() * sizeof(uint8_t), cudaMemcpyDefault, cudaStream.id()));

gpu_.phase1TopologyLayerStart_d = d_phase1TopologyLayerStart_;
gpu_.phase1TopologyLayer_d = d_phase1TopologyLayer_;

gpu_.me_d = gpu_d;
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id()));

cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)));

Expand Down Expand Up @@ -113,6 +119,7 @@ namespace pixelgpudetails {
cudaCheck(cudaFree(gpu_.hws_d));
cudaCheck(cudaFree(gpu_d));
cudaCheck(cudaFree(d_phase1TopologyLayerStart_));
cudaCheck(cudaFree(d_phase1TopologyLayer_));

cudaCheck(cudaFreeHost(h_hitsModuleStart_));
cudaCheck(cudaFreeHost(h_owner_32bit_));
Expand Down
1 change: 1 addition & 0 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ namespace pixelgpudetails {
HitsOnGPU gpu_;
uint32_t nhits_ = 0;
uint32_t *d_phase1TopologyLayerStart_ = nullptr;
uint8_t *d_phase1TopologyLayer_ = nullptr;
uint32_t *h_hitsModuleStart_ = nullptr;
uint16_t *h_detInd_ = nullptr;
int32_t *h_charge_ = nullptr;
Expand Down
6 changes: 3 additions & 3 deletions RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ namespace gpuPixelRecHits {
assert(h < 2000*256);

pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
pixelCPEforGPU::error(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

chargeh[h] = clusParams.charge[ic];

Expand All @@ -135,8 +135,8 @@ namespace gpuPixelRecHits {
xl[h]= clusParams.xpos[ic];
yl[h]= clusParams.ypos[ic];

xe[h]= clusParams.xerr[ic];
ye[h]= clusParams.yerr[ic];
xe[h]= clusParams.xerr[ic]*clusParams.xerr[ic];
ye[h]= clusParams.yerr[ic]*clusParams.yerr[ic];
mr[h]= clusParams.minRow[ic];
mc[h]= clusParams.minCol[ic];

Expand Down
Loading

0 comments on commit c29d3a5

Please sign in to comment.