Use Cooperative Groups for a more fine-grained synchronisation

cms-patatrack · Aug 29, 2021 · 91163ef · 91163ef
1 parent 7bed080
commit 91163ef
Showing 1 changed file with 21 additions and 14 deletions.
diff --git a/src/cudadev/plugin-PixelTriplets/gpuFishbone.h b/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
@@ -7,6 +7,8 @@
 #include <cstdio>
 #include <limits>
 
+#include <cooperative_groups.h>
+
 #include "DataFormats/approx_atan2.h"
 #include "Geometry/phase1PixelTopology.h"
 #include "CUDACore/VecArray.h"
@@ -15,6 +17,7 @@
 #include "GPUCACell.h"
 
 namespace gpuPixelDoublets {
+  namespace cg = cooperative_groups;
 
   template <int threadsPerHit>
   __global__ void fishbone(GPUCACell::Hits const* __restrict__ hits_p,
@@ -29,11 +32,14 @@ namespace gpuPixelDoublets {
 
     // blockDim must be a multiple of threadsPerHit
     assert((blockDim.x / threadsPerHit > 0) && (blockDim.x % threadsPerHit == 0));
-    auto hitsPerBlock = blockDim.x / threadsPerHit;
+
+    // partition the thread block into threadsPerHit-sized tiles
+    auto tile = cg::tiled_partition<threadsPerHit>(cg::this_thread_block());
+    auto hitsPerBlock = tile.meta_group_size();
     auto numberOfBlocks = gridDim.x;
     auto hitsPerGrid = numberOfBlocks * hitsPerBlock;
-    auto firstHit = (threadIdx.x + blockIdx.x * blockDim.x) / threadsPerHit;
-    auto innerThread = threadIdx.x % threadsPerHit;
+    auto firstHit = tile.meta_group_rank() + blockIdx.x * hitsPerBlock;
+    uint32_t innerThread = tile.thread_rank();
 
     // threadsPerHit buffers in shared memory
     __shared__ float s_x[threadsPerHit][maxCellsPerHit];
@@ -45,13 +51,13 @@ namespace gpuPixelDoublets {
     __shared__ uint32_t s_doublets[threadsPerHit];
 
     // buffer used by the current thread
-    float (&x)[maxCellsPerHit] = s_x[innerThread];
-    float (&y)[maxCellsPerHit] = s_y[innerThread];
-    float (&z)[maxCellsPerHit] = s_z[innerThread];
-    float (&length2)[maxCellsPerHit] = s_length2[innerThread];
-    uint32_t (&cc)[maxCellsPerHit] = s_cc[innerThread];
-    uint16_t (&detId)[maxCellsPerHit] = s_detId[innerThread];
-    uint32_t & doublets = s_doublets[innerThread];
+    float(&x)[maxCellsPerHit] = s_x[innerThread];
+    float(&y)[maxCellsPerHit] = s_y[innerThread];
+    float(&z)[maxCellsPerHit] = s_z[innerThread];
+    float(&length2)[maxCellsPerHit] = s_length2[innerThread];
+    uint32_t(&cc)[maxCellsPerHit] = s_cc[innerThread];
+    uint16_t(&detId)[maxCellsPerHit] = s_detId[innerThread];
+    uint32_t& doublets = s_doublets[innerThread];
 
     // outer loop: parallelize over the hits
     for (int hit = firstHit; hit < (int)nHits; hit += hitsPerGrid) {
@@ -69,7 +75,7 @@ namespace gpuPixelDoublets {
       if (innerThread == 0) {
         doublets = 0;
       }
-      __syncthreads();
+      tile.sync();
       for (int32_t i = innerThread; i < size; i += threadsPerHit) {
         auto& cell = cells[vc[i]];
         if (cell.unused())
@@ -84,14 +90,14 @@ namespace gpuPixelDoublets {
         length2[doublets] = x[doublets] * x[doublets] + y[doublets] * y[doublets] + z[doublets] * z[doublets];
         atomicInc(&doublets, 0xFFFFFFFF);
       }
-      __syncthreads();
+      tile.sync();
       if (doublets < 2)
         continue;
 
       // inner loop: parallelize over the doublets
-      for (int32_t i = innerThread; i < doublets - 1; i += threadsPerHit) {
+      for (uint32_t i = innerThread; i < doublets - 1; i += threadsPerHit) {
         auto& cell_i = cells[cc[i]];
-        for (auto j = i + 1; j < doublets; ++j) {
+        for (uint32_t j = i + 1; j < doublets; ++j) {
           // must be different detectors (potentially in the same layer)
           if (detId[i] == detId[j])
             continue;
@@ -110,6 +116,7 @@ namespace gpuPixelDoublets {
       }    // i
     }      // hit
   }
+
 }  // namespace gpuPixelDoublets
 
 #endif  // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h