Skip to content

Commit

Permalink
Tracklet finder on GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
mconcas committed Dec 3, 2024
1 parent ff486d4 commit 822e629
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,6 @@ class TimeFrameGPU : public TimeFrame
void setDevicePropagator(const o2::base::PropagatorImpl<float>*) override;

// Host-specific getters
gsl::span<int> getHostNTracklets(const int chunkId);
gsl::span<int> getHostNCells(const int chunkId);
gsl::span<int, nLayers - 1> getNTracklets() { return mNTracklets; }
gsl::span<int, nLayers - 2> getNCells() { return mNCells; }

Expand Down
14 changes: 7 additions & 7 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -341,9 +341,9 @@ void TimeFrameGPU<nLayers>::createCellsLUTDevice()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs");
for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mTracklets[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator());
checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mTracklets[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
}
allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator());
checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
Expand All @@ -355,7 +355,7 @@ void TimeFrameGPU<nLayers>::createCellsBuffers(const int layer)
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
mNCells[layer] = 0;
checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mTracklets[layer].size(), sizeof(int), cudaMemcpyDeviceToHost));
checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost));
LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB);
allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator());

Expand Down Expand Up @@ -446,9 +446,9 @@ void TimeFrameGPU<nLayers>::downloadCellsLUTDevice()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts");
for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mTracklets[iLayer + 1].size() + 1));
mCellsLookupTable[iLayer].resize(mTracklets[iLayer + 1].size() + 1);
checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1));
mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1);
checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
}
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
Expand Down
6 changes: 2 additions & 4 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ int TrackerTraitsGPU<nLayers>::getTFNumberOfClusters() const
template <int nLayers>
int TrackerTraitsGPU<nLayers>::getTFNumberOfTracklets() const
{
return mTimeFrameGPU->getNumberOfTracklets();
return std::accumulate(mTimeFrameGPU->getNTracklets().begin(), mTimeFrameGPU->getNTracklets().end(), 0);
}

template <int nLayers>
Expand All @@ -91,7 +91,7 @@ template <int nLayers>
void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex)
{
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex);
// TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex);
mTimeFrameGPU->createTrackletsLUTDevice(iteration);

const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f);
Expand Down Expand Up @@ -169,10 +169,8 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)

for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) {
LOGP(info, "continuing here");
continue;
}
LOGP(info, "+> {}", mTimeFrameGPU->getNTracklets()[iLayer]);
const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getNTracklets()[iLayer])};
countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(),
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
Expand Down
1 change: 0 additions & 1 deletion Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -863,7 +863,6 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets());
auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets());
nTracklets[iLayer] = unique_end - tracklets_ptr;
LOGP(info, "=> {} {}", nTracklets[iLayer], unique_end - tracklets_ptr);
if (iLayer > 0) {
gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int)));
gpu::compileTrackletsLookupTableKernel<<<nBlocks, nThreads>>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]);
Expand Down
2 changes: 0 additions & 2 deletions Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,6 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in
for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
/// Sort tracklets
auto& trkl{tf->getTracklets()[iLayer + 1]};
auto oldsize{trkl.size()};
std::sort(trkl.begin(), trkl.end(), [](const Tracklet& a, const Tracklet& b) {
return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex);
});
Expand All @@ -226,7 +225,6 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in
/// Compute LUT
std::exclusive_scan(lut.begin(), lut.end(), lut.begin(), 0);
lut.push_back(trkl.size());
LOGP(info, "CPU layer {} -> old size: {} - new size: {}", iLayer, oldsize, trkl.size());
}
/// Layer 0 is done outside the loop
std::sort(tf->getTracklets()[0].begin(), tf->getTracklets()[0].end(), [](const Tracklet& a, const Tracklet& b) {
Expand Down

0 comments on commit 822e629

Please sign in to comment.