From 91163ef31c744332c38c82ab3b78f901d60348ae Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Sun, 29 Aug 2021 23:55:45 +0200
Subject: [PATCH] Use Cooperative Groups for a more fine-grained synchronisation

---
 .../plugin-PixelTriplets/gpuFishbone.h | 35 +++++++++++--------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/cudadev/plugin-PixelTriplets/gpuFishbone.h b/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
index c5a492dab..37fe19465 100644
--- a/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
+++ b/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
@@ -7,6 +7,8 @@
 #include <cmath>
 #include <cstdint>
 
+#include <cooperative_groups.h>
+
 #include "DataFormats/approx_atan2.h"
 #include "Geometry/phase1PixelTopology.h"
 #include "CUDACore/VecArray.h"
@@ -15,6 +17,7 @@
 #include "GPUCACell.h"
 
 namespace gpuPixelDoublets {
+  namespace cg = cooperative_groups;
 
   template <int threadsPerHit, int maxCellsPerHit>
   __global__ void fishbone(GPUCACell::Hits const* __restrict__ hits_p,
@@ -29,11 +32,14 @@
 
     // blockDim must be a multiple of threadsPerHit
     assert((blockDim.x / threadsPerHit > 0) && (blockDim.x % threadsPerHit == 0));
-    auto hitsPerBlock = blockDim.x / threadsPerHit;
+
+    // partition the thread block into threadsPerHit-sized tiles
+    auto tile = cg::tiled_partition<threadsPerHit>(cg::this_thread_block());
+    auto hitsPerBlock = tile.meta_group_size();
     auto numberOfBlocks = gridDim.x;
     auto hitsPerGrid = numberOfBlocks * hitsPerBlock;
-    auto firstHit = (threadIdx.x + blockIdx.x * blockDim.x) / threadsPerHit;
-    auto innerThread = threadIdx.x % threadsPerHit;
+    auto firstHit = tile.meta_group_rank() + blockIdx.x * hitsPerBlock;
+    uint32_t innerThread = tile.thread_rank();
 
     // threadsPerHit buffers in shared memory
     __shared__ float s_x[threadsPerHit][maxCellsPerHit];
@@ -45,13 +51,13 @@
     __shared__ uint32_t s_doublets[threadsPerHit];
 
     // buffer used by the current thread
-    float (&x)[maxCellsPerHit] = s_x[innerThread];
-    float (&y)[maxCellsPerHit] = s_y[innerThread];
-    float (&z)[maxCellsPerHit] = s_z[innerThread];
-    float (&length2)[maxCellsPerHit] = s_length2[innerThread];
-    uint32_t (&cc)[maxCellsPerHit] = s_cc[innerThread];
-    uint16_t (&detId)[maxCellsPerHit] = s_detId[innerThread];
-    uint32_t & doublets = s_doublets[innerThread];
+    float(&x)[maxCellsPerHit] = s_x[innerThread];
+    float(&y)[maxCellsPerHit] = s_y[innerThread];
+    float(&z)[maxCellsPerHit] = s_z[innerThread];
+    float(&length2)[maxCellsPerHit] = s_length2[innerThread];
+    uint32_t(&cc)[maxCellsPerHit] = s_cc[innerThread];
+    uint16_t(&detId)[maxCellsPerHit] = s_detId[innerThread];
+    uint32_t& doublets = s_doublets[innerThread];
 
     // outer loop: parallelize over the hits
     for (int hit = firstHit; hit < (int)nHits; hit += hitsPerGrid) {
@@ -69,7 +75,7 @@
       if (innerThread == 0) {
         doublets = 0;
       }
-      __syncthreads();
+      tile.sync();
      for (int32_t i = innerThread; i < size; i += threadsPerHit) {
         auto& cell = cells[vc[i]];
         if (cell.unused())
@@ -84,14 +90,14 @@
         length2[doublets] = x[doublets] * x[doublets] + y[doublets] * y[doublets] + z[doublets] * z[doublets];
         atomicInc(&doublets, 0xFFFFFFFF);
       }
-      __syncthreads();
+      tile.sync();
       if (doublets < 2)
         continue;
 
       // inner loop: parallelize over the doublets
-      for (int32_t i = innerThread; i < doublets - 1; i += threadsPerHit) {
+      for (uint32_t i = innerThread; i < doublets - 1; i += threadsPerHit) {
         auto& cell_i = cells[cc[i]];
-        for (auto j = i + 1; j < doublets; ++j) {
+        for (uint32_t j = i + 1; j < doublets; ++j) {
           // must be different detectors (potentially in the same layer)
           if (detId[i] == detId[j])
             continue;
@@ -110,6 +116,7 @@
       }  // i
     }    // hit
   }
+
 }  // namespace gpuPixelDoublets
 
 #endif  // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h
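
Note (not part of the patch above): the diff replaces the block-wide __syncthreads() with tile-scoped synchronisation from Cooperative Groups. Below is a minimal, self-contained CUDA sketch of the same pattern, showing cg::tiled_partition<N>(), meta_group_rank()/meta_group_size() for distributing work items over tiles, and tile.sync() to synchronise only the threads of one tile. The kernel and its names (rowSum, threadsPerRow, columns, blockSize) are hypothetical illustrations, not code from gpuFishbone.h.

#include <cassert>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Block and tile geometry, fixed at compile time so the shared buffer can be sized.
constexpr int blockSize = 64;      // threads per block assumed at launch time
constexpr int threadsPerRow = 8;   // tile size: threads cooperating on one row
constexpr int rowsPerBlock = blockSize / threadsPerRow;
constexpr int columns = 256;       // hypothetical row length

__global__ void rowSum(const float* __restrict__ data, float* __restrict__ sums, int rows) {
  // the shared buffer below is sized for exactly blockSize threads
  assert(blockDim.x == blockSize);

  // partition the thread block into threadsPerRow-sized tiles; each tile handles one row at a time
  auto tile = cg::tiled_partition<threadsPerRow>(cg::this_thread_block());
  int rowsPerGrid = tile.meta_group_size() * gridDim.x;
  int firstRow = tile.meta_group_rank() + blockIdx.x * tile.meta_group_size();

  // one shared slot per thread, grouped by tile
  __shared__ float partial[rowsPerBlock][threadsPerRow];

  for (int row = firstRow; row < rows; row += rowsPerGrid) {
    // each thread of the tile accumulates a strided part of the row
    float acc = 0.f;
    for (int col = tile.thread_rank(); col < columns; col += threadsPerRow)
      acc += data[row * columns + col];
    partial[tile.meta_group_rank()][tile.thread_rank()] = acc;

    // synchronise only the threads of this tile, not the whole block
    tile.sync();

    if (tile.thread_rank() == 0) {
      float total = 0.f;
      for (int t = 0; t < threadsPerRow; ++t)
        total += partial[tile.meta_group_rank()][t];
      sums[row] = total;
    }

    // keep the tile together so the slots are not overwritten before they are read
    tile.sync();
  }
}

// launch e.g. as: rowSum<<<numBlocks, blockSize>>>(d_data, d_sums, rows);

Compared with __syncthreads(), tile.sync() only stalls the handful of threads cooperating on the same work item, which is the finer-grained synchronisation the patch applies to the fishbone kernel.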