Skip to content

Commit

Permalink
Use Cooperative Groups for a more fine-grained synchronisation
Browse files Browse the repository at this point in the history
  • Loading branch information
fwyzard committed Aug 29, 2021
1 parent 7bed080 commit 91163ef
Showing 1 changed file with 21 additions and 14 deletions.
35 changes: 21 additions & 14 deletions src/cudadev/plugin-PixelTriplets/gpuFishbone.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <cstdio>
#include <limits>

#include <cooperative_groups.h>

#include "DataFormats/approx_atan2.h"
#include "Geometry/phase1PixelTopology.h"
#include "CUDACore/VecArray.h"
Expand All @@ -15,6 +17,7 @@
#include "GPUCACell.h"

namespace gpuPixelDoublets {
namespace cg = cooperative_groups;

template <int threadsPerHit>
__global__ void fishbone(GPUCACell::Hits const* __restrict__ hits_p,
Expand All @@ -29,11 +32,14 @@ namespace gpuPixelDoublets {

// blockDim must be a multiple of threadsPerHit
assert((blockDim.x / threadsPerHit > 0) && (blockDim.x % threadsPerHit == 0));
auto hitsPerBlock = blockDim.x / threadsPerHit;

// partition the thread block into threadsPerHit-sized tiles
auto tile = cg::tiled_partition<threadsPerHit>(cg::this_thread_block());
auto hitsPerBlock = tile.meta_group_size();
auto numberOfBlocks = gridDim.x;
auto hitsPerGrid = numberOfBlocks * hitsPerBlock;
auto firstHit = (threadIdx.x + blockIdx.x * blockDim.x) / threadsPerHit;
auto innerThread = threadIdx.x % threadsPerHit;
auto firstHit = tile.meta_group_rank() + blockIdx.x * hitsPerBlock;
uint32_t innerThread = tile.thread_rank();

// threadsPerHit buffers in shared memory
__shared__ float s_x[threadsPerHit][maxCellsPerHit];
Expand All @@ -45,13 +51,13 @@ namespace gpuPixelDoublets {
__shared__ uint32_t s_doublets[threadsPerHit];

// buffer used by the current thread
float (&x)[maxCellsPerHit] = s_x[innerThread];
float (&y)[maxCellsPerHit] = s_y[innerThread];
float (&z)[maxCellsPerHit] = s_z[innerThread];
float (&length2)[maxCellsPerHit] = s_length2[innerThread];
uint32_t (&cc)[maxCellsPerHit] = s_cc[innerThread];
uint16_t (&detId)[maxCellsPerHit] = s_detId[innerThread];
uint32_t & doublets = s_doublets[innerThread];
float(&x)[maxCellsPerHit] = s_x[innerThread];
float(&y)[maxCellsPerHit] = s_y[innerThread];
float(&z)[maxCellsPerHit] = s_z[innerThread];
float(&length2)[maxCellsPerHit] = s_length2[innerThread];
uint32_t(&cc)[maxCellsPerHit] = s_cc[innerThread];
uint16_t(&detId)[maxCellsPerHit] = s_detId[innerThread];
uint32_t& doublets = s_doublets[innerThread];

// outer loop: parallelize over the hits
for (int hit = firstHit; hit < (int)nHits; hit += hitsPerGrid) {
Expand All @@ -69,7 +75,7 @@ namespace gpuPixelDoublets {
if (innerThread == 0) {
doublets = 0;
}
__syncthreads();
tile.sync();
for (int32_t i = innerThread; i < size; i += threadsPerHit) {
auto& cell = cells[vc[i]];
if (cell.unused())
Expand All @@ -84,14 +90,14 @@ namespace gpuPixelDoublets {
length2[doublets] = x[doublets] * x[doublets] + y[doublets] * y[doublets] + z[doublets] * z[doublets];
atomicInc(&doublets, 0xFFFFFFFF);
}
__syncthreads();
tile.sync();
if (doublets < 2)
continue;

// inner loop: parallelize over the doublets
for (int32_t i = innerThread; i < doublets - 1; i += threadsPerHit) {
for (uint32_t i = innerThread; i < doublets - 1; i += threadsPerHit) {
auto& cell_i = cells[cc[i]];
for (auto j = i + 1; j < doublets; ++j) {
for (uint32_t j = i + 1; j < doublets; ++j) {
// must be different detectors (potentially in the same layer)
if (detId[i] == detId[j])
continue;
Expand All @@ -110,6 +116,7 @@ namespace gpuPixelDoublets {
} // i
} // hit
}

} // namespace gpuPixelDoublets

#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h

0 comments on commit 91163ef

Please sign in to comment.