Skip to content

Commit

Permalink
Use Cooperative Groups for a more fine-grained synchronisation
Browse files Browse the repository at this point in the history
  • Loading branch information
fwyzard committed Aug 30, 2021
1 parent 416a7e2 commit e607935
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
params_.dcaCutOuterTriplet_);

if (nhits > 1 && params_.earlyFishbone_) {
gpuPixelDoublets::fishbone<1>(
gpuPixelDoublets::fishbone<1, 1>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
}

Expand All @@ -132,7 +132,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());

if (nhits > 1 && params_.lateFishbone_) {
gpuPixelDoublets::fishbone<1>(
gpuPixelDoublets::fishbone<1, 1>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
constexpr int threadsPerHit = 16;
constexpr int hitsPerBlock = 8;
auto numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
gpuPixelDoublets::fishbone<hitsPerBlock><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
gpuPixelDoublets::fishbone<hitsPerBlock, threadsPerHit><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
cudaCheck(cudaGetLastError());
}
Expand Down Expand Up @@ -116,7 +116,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
constexpr int threadsPerHit = 16;
constexpr int hitsPerBlock = 8;
auto numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
gpuPixelDoublets::fishbone<hitsPerBlock><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
gpuPixelDoublets::fishbone<hitsPerBlock, threadsPerHit><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
cudaCheck(cudaGetLastError());
}
Expand Down
23 changes: 15 additions & 8 deletions src/cudadev/plugin-PixelTriplets/gpuFishbone.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
#include <cstdio>
#include <limits>

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

#include "DataFormats/approx_atan2.h"
#include "Geometry/phase1PixelTopology.h"
#include "CUDACore/VecArray.h"
Expand All @@ -16,7 +19,7 @@

namespace gpuPixelDoublets {

template <int hitsPerBlock>
template <int hitsPerBlock = 1, int threadsPerHit = 1>
__global__ void fishbone(GPUCACell::Hits const* __restrict__ hits_p,
GPUCACell* cells,
uint32_t const* __restrict__ nCells,
Expand All @@ -27,13 +30,17 @@ namespace gpuPixelDoublets {

auto const& hits = *hits_p;

// blockDim must be a multiple of threadsPerHit
assert((blockDim.x / hitsPerBlock > 0) && (blockDim.x % hitsPerBlock == 0));
auto threadsPerHit = blockDim.x / hitsPerBlock;
// blockDim must be hitsPerBlock times threadsPerHit
assert(blockDim.x == hitsPerBlock * threadsPerHit);

// partition the thread block into threadsPerHit-sized tiles
auto tile = cg::tiled_partition<threadsPerHit>(cg::this_thread_block());
assert(hitsPerBlock == tile.meta_group_size());

auto numberOfBlocks = gridDim.x;
auto hitsPerGrid = numberOfBlocks * hitsPerBlock;
auto hitInBlock = threadIdx.x / threadsPerHit; // 0 .. hitsPerBlock-1
auto innerThread = threadIdx.x % threadsPerHit;
auto hitInBlock = tile.meta_group_rank(); // 0 .. hitsPerBlock-1
auto innerThread = tile.thread_rank();
auto firstHit = hitInBlock + blockIdx.x * hitsPerBlock;

// hitsPerBlock buffers in shared memory
Expand Down Expand Up @@ -70,7 +77,7 @@ namespace gpuPixelDoublets {
if (innerThread == 0) {
doublets = 0;
}
__syncthreads();
tile.sync();
for (int32_t i = innerThread; i < size; i += threadsPerHit) {
auto& cell = cells[vc[i]];
if (cell.unused())
Expand All @@ -86,7 +93,7 @@ namespace gpuPixelDoublets {
length2[index] = x[index] * x[index] + y[index] * y[index] + z[index] * z[index];
}

__syncthreads();
tile.sync();
if (doublets < 2)
continue;

Expand Down

0 comments on commit e607935

Please sign in to comment.