ITS::gpu: Add processNeighbours GPU kernel and handler (#13822)
mconcas authored Dec 20, 2024
1 parent e9e0633 commit 07e4515
Showing 6 changed files with 421 additions and 88 deletions.
12 changes: 9 additions & 3 deletions Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -76,9 +76,10 @@ class TimeFrameGPU : public TimeFrame
void createCellsBuffers(const int);
void createCellsDevice();
void createCellsLUTDevice();
void createNeighboursDevice();
void createNeighboursIndexTablesDevice();
void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
void createNeighboursLUTDevice(const int, const unsigned int);
void createNeighboursDeviceArray();
void createTrackITSExtDevice(std::vector<CellSeed>&);
void downloadTrackITSExtDevice(std::vector<CellSeed>&);
void downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>&, const int);
@@ -113,7 +114,10 @@ class TimeFrameGPU : public TimeFrame
Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
gpuPair<int, int>* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
gsl::span<int*> getDeviceNeighboursLUTs() { return mNeighboursLUTDevice; }
gpuPair<int, int>* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; }
int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; }
TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
@@ -195,7 +199,9 @@ class TimeFrameGPU : public TimeFrame

Road<nLayers - 2>* mRoadsDevice;
TrackITSExt* mTrackITSExtDevice;
std::array<gpuPair<int, int>*, nLayers - 2> mNeighboursDevice;
std::array<gpuPair<int, int>*, nLayers - 2> mNeighbourPairsDevice;
std::array<int*, nLayers - 2> mNeighboursDevice;
int** mNeighboursDeviceArray;
std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;

28 changes: 25 additions & 3 deletions Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -176,9 +176,31 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
const int nBlocks,
const int nThreads);

void filterCellNeighboursHandler(std::vector<int>&,
gpuPair<int, int>*,
unsigned int);
int filterCellNeighboursHandler(std::vector<int>&,
gpuPair<int, int>*,
int*,
unsigned int);

template <int nLayers = 7>
void processNeighboursHandler(const int startLayer,
const int startLevel,
CellSeed** allCellSeeds,
CellSeed* currentCellSeeds,
const unsigned int nCurrentCells,
const unsigned char** usedClusters,
int* neighbours,
gsl::span<int*> neighboursDeviceLUTs,
const TrackingFrameInfo** foundTrackingFrameInfo,
const float bz,
const float MaxChi2ClusterAttachment,
const o2::base::Propagator* propagator,
const o2::base::PropagatorF::MatCorrType matCorrType,
const std::vector<int>& lastCellIdHost, // temporary host vector
const std::vector<CellSeed>& lastCellSeedHost, // temporary host vector
std::vector<int>& updatedCellIdHost, // temporary host vector
std::vector<CellSeed>& updatedCellSeedHost, // temporary host vector
const int nBlocks,
const int nThreads);

void trackSeedHandler(CellSeed* trackSeeds,
const TrackingFrameInfo** foundTrackingFrameInfo,
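
Note: the updated filterCellNeighboursHandler signature (a gpuPair buffer in, an int buffer out, an int count returned) suggests the neighbour pairs are compacted into a lookup-table-plus-flat-index layout, which is also what the new int*-based neighbour buffers in TimeFrameGPU.h would hold. The following host-side sketch is purely illustrative; the pair ordering and every name in it are assumptions, not the O2 code.

#include <cstdio>
#include <utility>
#include <vector>

// Hypothetical illustration: turn (neighbourCellId, currentCellId) pairs into
// a per-cell offset table (LUT) plus a flattened list of neighbour indices,
// i.e. the int*-based layout the new buffers appear to use.
int flattenNeighbours(const std::vector<std::pair<int, int>>& pairs, int nCells,
                      std::vector<int>& lut, std::vector<int>& flat)
{
  lut.assign(nCells + 1, 0);            // one extra slot: counts go at i + 1
  for (const auto& p : pairs) {
    ++lut[p.second + 1];                // count neighbours of each cell
  }
  for (int i = 0; i < nCells; ++i) {
    lut[i + 1] += lut[i];               // prefix sum -> start offsets
  }
  flat.resize(pairs.size());
  std::vector<int> cursor(lut.begin(), lut.end() - 1);
  for (const auto& p : pairs) {
    flat[cursor[p.second]++] = p.first; // scatter neighbour ids
  }
  return static_cast<int>(flat.size()); // total surviving neighbours, as the handler returns
}

int main()
{
  std::vector<std::pair<int, int>> pairs{{3, 0}, {5, 0}, {4, 1}, {7, 2}, {6, 2}};
  std::vector<int> lut, flat;
  const int n = flattenNeighbours(pairs, 3, lut, flat);
  std::printf("neighbours: %d, cell 2 range: [%d, %d)\n", n, lut[2], lut[3]);
  return 0;
}
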
38 changes: 25 additions & 13 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -306,17 +306,28 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursDevice()
void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds");
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
// Here we also create the CellsDeviceArray, since the cells buffers are populated separately in the previous steps.
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
}
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
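
The extra LUT slot ("exc -> inc") presumably lets per-cell neighbour counts, written at offset i + 1, be turned into start/end offsets by a single scan, with the last entry ending up as the total. A minimal Thrust sketch of that bookkeeping follows; it is illustrative only and not the kernel code used in this commit.

#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/scan.h>

int main()
{
  // Hypothetical per-cell neighbour counts for 4 cells.
  thrust::device_vector<int> lut(5, 0); // nCells + 1 entries, zero-initialised
  const int counts[4] = {2, 0, 3, 1};
  for (int i = 0; i < 4; ++i) {
    lut[i + 1] = counts[i];             // leave lut[0] == 0
  }
  // Scanning all nCells + 1 entries turns counts into offsets:
  // neighbours of cell i live in [lut[i], lut[i + 1]), and lut[nCells] is the total.
  thrust::inclusive_scan(lut.begin(), lut.end(), lut.begin());
  std::printf("total neighbours: %d\n", static_cast<int>(lut[4]));
  return 0;
}
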

@@ -400,19 +411,20 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, st
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
mCellsNeighbours[layer].clear();
mCellsNeighbours[layer].resize(neighbours.size());
LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
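
createNeighboursDeviceArray mirrors the existing pattern used for mCellsDeviceArray: the host-side array of per-layer device pointers is copied to the GPU so kernels can select a layer's buffer through a single int**. Below is a stripped-down standalone sketch of that pattern, using plain CUDA allocations and hypothetical names rather than the O2 allocator and streams.

#include <cstdio>
#include <cuda_runtime.h>

// Read one entry from the per-layer buffer selected via the device array of pointers.
__global__ void readNeighbour(int** neighboursArray, int layer, int idx, int* out)
{
  *out = neighboursArray[layer][idx];
}

int main()
{
  constexpr int kLayers = 2, kSize = 4;
  int host0[kSize] = {10, 11, 12, 13};
  int host1[kSize] = {20, 21, 22, 23};
  int *d0, *d1, **dArray, *dOut;
  cudaMalloc(&d0, kSize * sizeof(int));
  cudaMalloc(&d1, kSize * sizeof(int));
  cudaMemcpy(d0, host0, kSize * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d1, host1, kSize * sizeof(int), cudaMemcpyHostToDevice);
  int* hostPtrs[kLayers] = {d0, d1}; // host array of device pointers
  cudaMalloc(&dArray, kLayers * sizeof(int*));
  cudaMemcpy(dArray, hostPtrs, kLayers * sizeof(int*), cudaMemcpyHostToDevice);
  cudaMalloc(&dOut, sizeof(int));
  readNeighbour<<<1, 1>>>(dArray, 1, 2, dOut);
  int out;
  cudaMemcpy(&out, dOut, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("neighboursArray[1][2] = %d\n", out); // expected: 22
  cudaFree(d0);
  cudaFree(d1);
  cudaFree(dArray);
  cudaFree(dOut);
  return 0;
}
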

@@ -459,7 +471,7 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vecto
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
// TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
}
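
The download above relies on std::pair<int, int> and gpuPair<int, int> sharing a memory layout, as the TODO admits. One way to make that assumption explicit is a set of compile-time guards; the sketch below uses a stand-in aggregate for gpuPair, not the actual O2 definition, and the checks catch size/layout surprises on the build platform rather than formally guaranteeing identical member offsets.

#include <type_traits>
#include <utility>

// Stand-in for the real gpuPair used by the ITS GPU tracking code.
template <typename T1, typename T2>
struct gpuPair {
  T1 first;
  T2 second;
};

// Compile-time guards for the layout assumption behind the device-to-host memcpy.
static_assert(sizeof(gpuPair<int, int>) == sizeof(std::pair<int, int>),
              "gpuPair and std::pair must have the same size");
static_assert(std::is_standard_layout_v<std::pair<int, int>>,
              "std::pair<int, int> is expected to be standard-layout");
static_assert(std::is_trivially_copyable_v<gpuPair<int, int>>,
              "gpuPair must be trivially copyable to be memcpy'd");
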

template <int nLayers>