diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 37f392ebbd3a7..4ac22607a580b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -76,9 +76,10 @@ class TimeFrameGPU : public TimeFrame void createCellsBuffers(const int); void createCellsDevice(); void createCellsLUTDevice(); - void createNeighboursDevice(); + void createNeighboursIndexTablesDevice(); void createNeighboursDevice(const unsigned int& layer, std::vector>& neighbours); void createNeighboursLUTDevice(const int, const unsigned int); + void createNeighboursDeviceArray(); void createTrackITSExtDevice(std::vector&); void downloadTrackITSExtDevice(std::vector&); void downloadCellsNeighboursDevice(std::vector>>&, const int); @@ -113,7 +114,10 @@ class TimeFrameGPU : public TimeFrame Road* getDeviceRoads() { return mRoadsDevice; } TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; } int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; } - gpuPair* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; } + gsl::span getDeviceNeighboursLUTs() { return mNeighboursLUTDevice; } + gpuPair* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; } + int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; } + int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; } TrackingFrameInfo* getDeviceTrackingFrameInfo(const int); const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } @@ -195,7 +199,9 @@ class TimeFrameGPU : public TimeFrame Road* mRoadsDevice; TrackITSExt* mTrackITSExtDevice; - std::array*, nLayers - 2> mNeighboursDevice; + std::array*, nLayers - 2> mNeighbourPairsDevice; + std::array mNeighboursDevice; + int** mNeighboursDeviceArray; std::array mTrackingFrameInfoDevice; const TrackingFrameInfo** mTrackingFrameInfoDeviceArray; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index b0db34199df09..f50a11a83805f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -176,9 +176,31 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const int nBlocks, const int nThreads); -void filterCellNeighboursHandler(std::vector&, - gpuPair*, - unsigned int); +int filterCellNeighboursHandler(std::vector&, + gpuPair*, + int*, + unsigned int); + +template +void processNeighboursHandler(const int startLayer, + const int startLevel, + CellSeed** allCellSeeds, + CellSeed* currentCellSeeds, + const unsigned int nCurrentCells, + const unsigned char** usedClusters, + int* neighbours, + gsl::span neighboursDeviceLUTs, + const TrackingFrameInfo** foundTrackingFrameInfo, + const float bz, + const float MaxChi2ClusterAttachment, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType, + const std::vector& lastCellIdHost, // temporary host vector + const std::vector& lastCellSeedHost, // temporary host vector + std::vector& updatedCellIdHost, // temporary host vector + std::vector& updatedCellSeedHost, // temporary host vector + const int nBlocks, + const int nThreads); void trackSeedHandler(CellSeed* trackSeeds, const TrackingFrameInfo** foundTrackingFrameInfo, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 4bd15c0203d81..fd067b9930fd0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -306,17 +306,28 @@ void TimeFrameGPU::loadTrackletsLUTDevice() } template -void TimeFrameGPU::createNeighboursDevice() +void TimeFrameGPU::createNeighboursIndexTablesDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours"); + // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps. + allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator()); checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT"); + LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc + checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -400,19 +411,20 @@ void TimeFrameGPU::createNeighboursDevice(const unsigned int& layer, st START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); mCellsNeighbours[layer].clear(); mCellsNeighbours[layer].resize(neighbours.size()); + LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); + allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); + checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); - allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); - checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator()); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template -void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) +void TimeFrameGPU::createNeighboursDeviceArray() { - START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT"); - LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc - checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); + allocMemAsync(reinterpret_cast(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -459,7 +471,7 @@ void TimeFrameGPU::downloadCellsNeighboursDevice(std::vector) / MB); // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :) - checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 4e34600562ae7..395aab3a470ac 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -91,13 +91,12 @@ template void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex) { auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - // TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); mTimeFrameGPU->createTrackletsLUTDevice(iteration); const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); gsl::span diamondSpan(&diamondVert, 1); int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; - int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; + int endROF{o2::gpu::CAMath::Min(mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())}; countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), mTimeFrameGPU->getDeviceMultCutMask(), @@ -214,7 +213,7 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) template void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) { - mTimeFrameGPU->createNeighboursDevice(); + mTimeFrameGPU->createNeighboursIndexTablesDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { @@ -228,17 +227,16 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) continue; } - int layerCellsNum{static_cast(mTimeFrameGPU->getCells()[iLayer].size())}; mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum); countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. mTimeFrameGPU->getDeviceArrayCellsLUT(), - mTimeFrameGPU->getDeviceNeighbours(iLayer), + mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), mTrkParams[0].MaxChi2ClusterAttachment, mBz, iLayer, - layerCellsNum, + mTimeFrameGPU->getNCells()[iLayer], nextLayerCellsNum, 1e2, conf.nBlocks, @@ -250,12 +248,12 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), mTimeFrameGPU->getDeviceArrayCellsLUT(), - mTimeFrameGPU->getDeviceNeighbours(iLayer), + mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), mTrkParams[0].MaxChi2ClusterAttachment, mBz, iLayer, - layerCellsNum, + mTimeFrameGPU->getNCells()[iLayer], nextLayerCellsNum, 1e2, conf.nBlocks, @@ -264,9 +262,11 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size()); filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer], + mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighbours(iLayer), cellsNeighboursLayer[iLayer].size()); } + mTimeFrameGPU->createNeighboursDeviceArray(); mTimeFrameGPU->downloadCellsDevice(); mTimeFrameGPU->unregisterRest(); }; @@ -274,6 +274,7 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) template void TrackerTraitsGPU::findRoads(const int iteration) { + auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); for (int startLevel{mTrkParams[iteration].CellsPerRoad()}; startLevel >= mTrkParams[iteration].CellMinimumLevel(); --startLevel) { const int minimumLayer{startLevel - 1}; std::vector trackSeeds; @@ -284,7 +285,25 @@ void TrackerTraitsGPU::findRoads(const int iteration) std::vector lastCellId, updatedCellId; std::vector lastCellSeed, updatedCellSeed; - processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); + processNeighboursHandler(startLayer, + startLevel, + mTimeFrameGPU->getDeviceArrayCells(), + mTimeFrameGPU->getDeviceCells()[startLayer], + mTimeFrameGPU->getNCells()[startLayer], + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceNeighbours(startLayer - 1), + mTimeFrameGPU->getDeviceNeighboursLUTs(), + mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), + mBz, + mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment + mTimeFrameGPU->getDevicePropagator(), + mCorrType, + lastCellId, // temporary host vector + lastCellSeed, // temporary host vector + updatedCellId, // temporary host vectors + updatedCellSeed, // temporary host vectors + conf.nBlocks, + conf.nThreads); int level = startLevel; for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { @@ -292,7 +311,25 @@ void TrackerTraitsGPU::findRoads(const int iteration) lastCellId.swap(updatedCellId); std::vector().swap(updatedCellSeed); /// tame the memory peaks updatedCellId.clear(); - processNeighbours(iLayer, --level, lastCellSeed, lastCellId, updatedCellSeed, updatedCellId); + processNeighboursHandler(iLayer, + --level, + mTimeFrameGPU->getDeviceArrayCells(), + mTimeFrameGPU->getDeviceCells()[iLayer], + mTimeFrameGPU->getNCells()[iLayer], + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceNeighbours(iLayer - 1), + mTimeFrameGPU->getDeviceNeighboursLUTs(), + mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), + mBz, + mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment + mTimeFrameGPU->getDevicePropagator(), + mCorrType, + lastCellId, // temporary host vector + lastCellSeed, // temporary host vector + updatedCellId, // temporary host vectors + updatedCellSeed, // temporary host vectors + conf.nBlocks, + conf.nThreads); } for (auto& seed : updatedCellSeed) { if (seed.getQ2Pt() > 1.e3 || seed.getChi2() > mTrkParams[0].MaxChi2NDF * ((startLevel + 2) * 2 - 5)) { @@ -307,16 +344,16 @@ void TrackerTraitsGPU::findRoads(const int iteration) } mTimeFrameGPU->createTrackITSExtDevice(trackSeeds); mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds); - auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds, - mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo, - mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks, + + trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds + mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo + mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks mTrkParams[iteration].MinPt, // std::vector& minPtsHost, - trackSeeds.size(), // const size_t nSeeds, - mBz, // const float Bz, + trackSeeds.size(), // const size_t nSeeds + mBz, // const float Bz startLevel, // const int startLevel, - mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment, - mTrkParams[0].MaxChi2NDF, // float maxChi2NDF, + mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment + mTrkParams[0].MaxChi2NDF, // float maxChi2NDF mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator mCorrType, // o2::base::PropagatorImpl::MatCorrType conf.nBlocks, @@ -371,8 +408,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) } mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); } + mTimeFrameGPU->loadUsedClustersDevice(); } - mTimeFrameGPU->loadUsedClustersDevice(); if (iteration == mTrkParams.size() - 1) { mTimeFrameGPU->unregisterHostMemory(0); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index a723f36c17dd0..67a515df1c730 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -112,7 +112,7 @@ GPUd() bool fitTrack(TrackITSExt& track, float chi2ndfcut, float maxQoverPt, int nCl, - float Bz, + float bz, const TrackingFrameInfo** tfInfos, const o2::base::Propagator* prop, o2::base::PropagatorF::MatCorrType matCorrType) @@ -128,7 +128,7 @@ GPUd() bool fitTrack(TrackITSExt& track, if (!prop->propagateToX(track, trackingHit.xTrackingFrame, - Bz, + bz, o2::base::PropagatorImpl::MAX_SIN_PHI, o2::base::PropagatorImpl::MAX_STEP, matCorrType)) { @@ -192,9 +192,6 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, 0.f, 0.f, 0.f, 0.f, sg2q2pt}); } -// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }; -// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }; - struct sort_tracklets { GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); } }; @@ -269,7 +266,7 @@ GPUg() void fitTrackSeedsKernel( o2::its::TrackITSExt* tracks, const float* minPts, const unsigned int nSeeds, - const float Bz, + const float bz, const int startLevel, float maxChi2ClusterAttachment, float maxChi2NDF, @@ -295,7 +292,7 @@ GPUg() void fitTrackSeedsKernel( maxChi2NDF, // float maxChi2NDF, o2::constants::math::VeryBig, // float maxQoverPt, 0, // nCl, - Bz, // float Bz, + bz, // float bz, foundTrackingFrameInfo, // TrackingFrameInfo** trackingFrameInfo, propagator, // const o2::base::Propagator* propagator, matCorrType); // o2::base::PropagatorF::MatCorrType matCorrType @@ -314,7 +311,7 @@ GPUg() void fitTrackSeedsKernel( maxChi2NDF, // float maxChi2NDF, 50.f, // float maxQoverPt, 0, // nCl, - Bz, // float Bz, + bz, // float bz, foundTrackingFrameInfo, // TrackingFrameInfo** trackingFrameInfo, propagator, // const o2::base::Propagator* propagator, matCorrType); // o2::base::PropagatorF::MatCorrType matCorrType @@ -368,7 +365,6 @@ GPUg() void computeLayerCellNeighboursKernel( // FIXME: this is prone to race conditions: check on level is not atomic const int currentCellLevel{currentCellSeed.getLevel()}; if (currentCellLevel >= nextCellSeed.getLevel()) { - // atomicExch(cellSeedArray[layerIndex + 1][iNextCell].getLevelPtr(), currentCellLevel + 1); // Update level on corresponding cell cellSeedArray[layerIndex + 1][iNextCell].setLevel(currentCellLevel + 1); } } @@ -575,6 +571,107 @@ GPUg() void computeLayerTrackletsMultiROFKernel( } } +template +GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, + int* trackletsLookUpTable, + const int nTracklets) +{ + for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) { + atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1); + } +} + +template +GPUg() void processNeighboursKernel(const int layer, + const int level, + CellSeed** allCellSeeds, + CellSeed* currentCellSeeds, + const int* currentCellIds, + const unsigned int nCurrentCells, + CellSeed* updatedCellSeeds, + int* updatedCellsIds, + int* foundSeedsTable, // auxiliary only in GPU code to compute the number of cells per iteration + const unsigned char** usedClusters, // Used clusters + int* neighbours, + int* neighboursLUT, + const TrackingFrameInfo** foundTrackingFrameInfo, + const float bz, + const float MaxChi2ClusterAttachment, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType) +{ + constexpr float radl = 9.36f; // Radiation length of Si [cm]. + constexpr float rho = 2.33f; // Density of Si [g/cm^3]. + constexpr float layerxX0[7] = {5.e-3f, 5.e-3f, 5.e-3f, 1.e-2f, 1.e-2f, 1.e-2f, 1.e-2f}; // Hardcoded here for the moment. + for (unsigned int iCurrentCell = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCell < nCurrentCells; iCurrentCell += blockDim.x * gridDim.x) { + int foundSeeds{0}; + const auto& currentCell{currentCellSeeds[iCurrentCell]}; + if (currentCell.getLevel() != level) { + continue; + } + if (currentCellIds == nullptr && (usedClusters[layer][currentCell.getFirstClusterIndex()] || + usedClusters[layer + 1][currentCell.getSecondClusterIndex()] || + usedClusters[layer + 2][currentCell.getThirdClusterIndex()])) { + continue; + } + const int cellId = currentCellIds == nullptr ? iCurrentCell : currentCellIds[iCurrentCell]; + + const int startNeighbourId{cellId ? neighboursLUT[cellId - 1] : 0}; + const int endNeighbourId{neighboursLUT[cellId]}; + + for (int iNeighbourCell{startNeighbourId}; iNeighbourCell < endNeighbourId; ++iNeighbourCell) { + const int neighbourCellId = neighbours[iNeighbourCell]; + const CellSeed& neighbourCell = allCellSeeds[layer - 1][neighbourCellId]; + + if (neighbourCell.getSecondTrackletIndex() != currentCell.getFirstTrackletIndex()) { + continue; + } + if (usedClusters[layer - 1][neighbourCell.getFirstClusterIndex()]) { + continue; + } + if (currentCell.getLevel() - 1 != neighbourCell.getLevel()) { + continue; + } + CellSeed seed{currentCell}; + auto& trHit = foundTrackingFrameInfo[layer - 1][neighbourCell.getFirstClusterIndex()]; + + if (!seed.rotate(trHit.alphaTrackingFrame)) { + continue; + } + + if (!propagator->propagateToX(seed, trHit.xTrackingFrame, bz, o2::base::PropagatorImpl::MAX_SIN_PHI, o2::base::PropagatorImpl::MAX_STEP, matCorrType)) { + continue; + } + + if (matCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) { + if (!seed.correctForMaterial(layerxX0[layer - 1], layerxX0[layer - 1] * radl * rho, true)) { + continue; + } + } + + auto predChi2{seed.getPredictedChi2Quiet(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)}; + if ((predChi2 > MaxChi2ClusterAttachment) || predChi2 < 0.f) { + continue; + } + seed.setChi2(seed.getChi2() + predChi2); + if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) { + continue; + } + seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex(); + seed.setLevel(neighbourCell.getLevel()); + seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex()); + seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex()); + if constexpr (dryRun) { + foundSeedsTable[iCurrentCell]++; + } else { + updatedCellsIds[foundSeedsTable[iCurrentCell] + foundSeeds] = neighbourCellId; + updatedCellSeeds[foundSeedsTable[iCurrentCell] + foundSeeds] = seed; + } + foundSeeds++; + } + } +} + ///////////////////////////////////////// // Debug Kernels ///////////////////////////////////////// @@ -584,6 +681,7 @@ GPUd() void pPointer(T* ptr) { printf("[%p]\t", ptr); } + template GPUg() void printPointersKernel(std::tuple args) { @@ -692,35 +790,14 @@ GPUg() void printTrackletsLUTPerROF(const int layerId, } } -template -GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, - int* trackletsLookUpTable, - const int nTracklets) -{ - for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) { - atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1); - } -} - -// Decrease LUT entries corresponding to duplicated tracklets. NB: duplicate tracklets are removed separately (see const Tracklets*). -GPUg() void removeDuplicateTrackletsEntriesLUTKernel( - int* trackletsLookUpTable, - const Tracklet* tracklets, - const int* nTracklets, - const int layerIndex) +GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = 0) { - int id0{-1}, id1{-1}; - for (int iTracklet{0}; iTracklet < nTracklets[layerIndex]; ++iTracklet) { - auto& trk = tracklets[iTracklet]; - if (trk.firstClusterIndex == id0 && trk.secondClusterIndex == id1) { - trackletsLookUpTable[id0]--; - } else { - id0 = trk.firstClusterIndex; - id1 = trk.secondClusterIndex; + for (unsigned int iCell{0}; iCell < nCells; ++iCell) { + if (threadIdx.x == tId) { + seed[iCell].printCell(); } } } - } // namespace gpu template @@ -996,8 +1073,8 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - gpuCheckError(cudaPeekAtLastError()); - gpuCheckError(cudaDeviceSynchronize()); + // gpuCheckError(cudaPeekAtLastError()); + // gpuCheckError(cudaDeviceSynchronize()); void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr; size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0; gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage @@ -1062,29 +1139,182 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, gpuCheckError(cudaDeviceSynchronize()); } -void filterCellNeighboursHandler(std::vector& neighHost, - gpuPair* cellNeighbours, - unsigned int nNeigh) +int filterCellNeighboursHandler(std::vector& neighHost, // TODO: eventually remove this! + gpuPair* cellNeighbourPairs, + int* cellNeighbours, + unsigned int nNeigh) { - thrust::device_ptr> neighVector(cellNeighbours); + thrust::device_ptr> neighVectorPairs(cellNeighbourPairs); + thrust::device_ptr validNeighs(cellNeighbours); thrust::device_vector keys(nNeigh); // TODO: externally allocate. thrust::device_vector vals(nNeigh); // TODO: externally allocate. - thrust::copy(thrust::make_transform_iterator(neighVector, gpu::pair_to_second()), - thrust::make_transform_iterator(neighVector + nNeigh, gpu::pair_to_second()), + thrust::copy(thrust::make_transform_iterator(neighVectorPairs, gpu::pair_to_second()), + thrust::make_transform_iterator(neighVectorPairs + nNeigh, gpu::pair_to_second()), keys.begin()); thrust::sequence(vals.begin(), vals.end()); thrust::sort_by_key(keys.begin(), keys.end(), vals.begin()); thrust::device_vector> sortedNeigh(nNeigh); - thrust::copy(thrust::make_permutation_iterator(neighVector, vals.begin()), - thrust::make_permutation_iterator(neighVector, vals.end()), + thrust::copy(thrust::make_permutation_iterator(neighVectorPairs, vals.begin()), + thrust::make_permutation_iterator(neighVectorPairs, vals.end()), sortedNeigh.begin()); discardResult(cudaDeviceSynchronize()); auto trimmedBegin = thrust::find_if(sortedNeigh.begin(), sortedNeigh.end(), gpu::is_valid_pair()); // trim leading -1s auto trimmedSize = sortedNeigh.end() - trimmedBegin; - thrust::device_vector validNeigh(trimmedSize); neighHost.resize(trimmedSize); - thrust::transform(trimmedBegin, sortedNeigh.end(), validNeigh.begin(), gpu::pair_to_first()); - gpuCheckError(cudaMemcpy(neighHost.data(), thrust::raw_pointer_cast(validNeigh.data()), trimmedSize * sizeof(int), cudaMemcpyDeviceToHost)); + thrust::transform(trimmedBegin, sortedNeigh.end(), validNeighs, gpu::pair_to_first()); + gpuCheckError(cudaMemcpy(neighHost.data(), cellNeighbours, trimmedSize * sizeof(int), cudaMemcpyDeviceToHost)); + + return trimmedSize; +} + +template +void processNeighboursHandler(const int startLayer, + const int startLevel, + CellSeed** allCellSeeds, + CellSeed* currentCellSeeds, + const unsigned int nCurrentCells, + const unsigned char** usedClusters, + int* neighbours, + gsl::span neighboursDeviceLUTs, + const TrackingFrameInfo** foundTrackingFrameInfo, + const float bz, + const float MaxChi2ClusterAttachment, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType, + const std::vector& lastCellIdHost, // temporary host vector + const std::vector& lastCellSeedHost, // temporary host vector + std::vector& updatedCellIdHost, // temporary host vector + std::vector& updatedCellSeedHost, // temporary host vector + const int nBlocks, + const int nThreads) +{ + thrust::device_vector foundSeedsTable(nCurrentCells + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this. + thrust::device_vector lastCellIds(lastCellIdHost); + thrust::device_vector lastCellSeed(lastCellSeedHost); + gpu::processNeighboursKernel<<>>(startLayer, + startLevel, + allCellSeeds, + lastCellIdHost.empty() ? currentCellSeeds : thrust::raw_pointer_cast(&lastCellSeed[0]), // lastCellSeeds + lastCellIdHost.empty() ? nullptr : thrust::raw_pointer_cast(&lastCellIds[0]), // lastCellIds, + lastCellIdHost.empty() ? nCurrentCells : lastCellSeedHost.size(), + nullptr, // updatedCellSeeds, + nullptr, // updatedCellsIds, + thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration + usedClusters, // Used clusters + neighbours, + neighboursDeviceLUTs[startLayer - 1], + foundTrackingFrameInfo, + bz, + MaxChi2ClusterAttachment, + propagator, + matCorrType); + void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr; + size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCurrentCells + 1, // num_items + 0)); + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCurrentCells + 1, // num_items + 0)); + + thrust::device_vector updatedCellIds(foundSeedsTable.back()) /*, lastCellIds(foundSeedsTable.back())*/; + thrust::device_vector updatedCellSeeds(foundSeedsTable.back()) /*, lastCellSeeds(foundSeedsTable.back())*/; + + gpu::processNeighboursKernel<<>>(startLayer, + startLevel, + allCellSeeds, + lastCellIdHost.empty() ? currentCellSeeds : thrust::raw_pointer_cast(&lastCellSeed[0]), // lastCellSeeds + lastCellIdHost.empty() ? nullptr : thrust::raw_pointer_cast(&lastCellIds[0]), // lastCellIds, + lastCellIdHost.empty() ? nCurrentCells : lastCellSeedHost.size(), + thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds + thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds + thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration + usedClusters, // Used clusters + neighbours, + neighboursDeviceLUTs[startLayer - 1], + foundTrackingFrameInfo, + bz, + MaxChi2ClusterAttachment, + propagator, + matCorrType); + + // Temporary copyback to host to validate the kernel + updatedCellIdHost.resize(updatedCellIds.size()); + updatedCellSeedHost.resize(updatedCellSeeds.size()); + thrust::copy(updatedCellIds.begin(), updatedCellIds.end(), updatedCellIdHost.begin()); + thrust::copy(updatedCellSeeds.begin(), updatedCellSeeds.end(), updatedCellSeedHost.begin()); + + // int level = startLevel; + // for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { + // --level; + // lastCellSeeds.swap(updatedCellSeeds); + // lastCellIds.swap(updatedCellIds); + // foundSeedsTable.resize(lastCellSeeds.size() + 1); + // thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0); + + // gpu::processNeighboursKernel<<<1, 1>>>(iLayer, + // level, + // allCellSeeds, + // thrust::raw_pointer_cast(&lastCellSeeds[0]), + // thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds, + // lastCellSeeds.size(), + // nullptr, // updatedCellSeeds, + // nullptr, // updatedCellsIds, + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration + // usedClusters, // Used clusters + // neighbours, + // neighboursDeviceLUTs[iLayer - 1], + // foundTrackingFrameInfo, + // bz, + // MaxChi2ClusterAttachment, + // propagator, + // matCorrType); + + // gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage + // temp_storage_bytes, // temp_storage_bytes + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + // foundSeedsTable.size(), // num_items + // 0)); + // discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + // gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage + // temp_storage_bytes_2, // temp_storage_bytes + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + // foundSeedsTable.size(), // num_items + // 0)); + // updatedCellIds.resize(foundSeedsTable.back(), 0); + // updatedCellSeeds.resize(foundSeedsTable.back(), CellSeed()); + + // gpu::processNeighboursKernel<<<1, 1>>>(iLayer, + // level, + // allCellSeeds, + // thrust::raw_pointer_cast(&lastCellSeeds[0]), + // thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds, + // lastCellSeeds.size(), + // thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds + // thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration + // usedClusters, // Used clusters + // neighbours, + // neighboursDeviceLUTs[iLayer - 1], + // foundTrackingFrameInfo, + // bz, + // MaxChi2ClusterAttachment, + // propagator, + // matCorrType); + // gpu::printCellSeeds<<<1, 1>>>(thrust::raw_pointer_cast(&updatedCellSeeds[0]), updatedCellSeeds.size()); + // } + + gpuCheckError(cudaFree(d_temp_storage)); + gpuCheckError(cudaFree(d_temp_storage_2)); } void trackSeedHandler(CellSeed* trackSeeds, @@ -1092,7 +1322,7 @@ void trackSeedHandler(CellSeed* trackSeeds, o2::its::TrackITSExt* tracks, std::vector& minPtsHost, const unsigned int nSeeds, - const float Bz, + const float bz, const int startLevel, float maxChi2ClusterAttachment, float maxChi2NDF, @@ -1108,7 +1338,7 @@ void trackSeedHandler(CellSeed* trackSeeds, tracks, // TrackITSExt* thrust::raw_pointer_cast(&minPts[0]), // const float* minPts, nSeeds, // const unsigned int - Bz, // const float + bz, // const float startLevel, // const int maxChi2ClusterAttachment, // float maxChi2NDF, // float @@ -1179,4 +1409,24 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, std::vector& mulScatAng, const int nBlocks, const int nThreads); + +template void processNeighboursHandler<7>(const int startLayer, + const int startLevel, + CellSeed** allCellSeeds, + CellSeed* currentCellSeeds, + const unsigned int nCurrentCells, + const unsigned char** usedClusters, + int* neighbours, + gsl::span neighboursDeviceLUTs, + const TrackingFrameInfo** foundTrackingFrameInfo, + const float bz, + const float MaxChi2ClusterAttachment, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType, + const std::vector& lastCellIdHost, // temporary host vector + const std::vector& lastCellSeedHost, // temporary host vector + std::vector& updatedCellIdHost, // temporary host vector + std::vector& updatedCellSeedHost, // temporary host vector + const int nBlocks, + const int nThreads); } // namespace o2::its \ No newline at end of file diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h index cb9f28665cf07..fa0473ae88462 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h @@ -15,10 +15,10 @@ #ifndef TRACKINGITSU_INCLUDE_CACELL_H_ #define TRACKINGITSU_INCLUDE_CACELL_H_ - #ifndef GPUCA_GPUCODE_DEVICE #include #include +#include #endif #include "GPUCommonDef.h" @@ -104,6 +104,7 @@ class CellSeed final : public o2::track::TrackParCovF GPUhd() int* getLevelPtr() { return &mLevel; } GPUhd() int* getClusters() { return mClusters; } GPUhd() int getCluster(int i) const { return mClusters[i]; } + GPUhdi() void printCell() const; private: int mClusters[7] = {-1, -1, -1, -1, -1, -1, -1}; @@ -112,6 +113,11 @@ class CellSeed final : public o2::track::TrackParCovF float mChi2 = 0.f; }; +GPUhdi() void CellSeed::printCell() const +{ + printf("trkl: %d, %d\t lvl: %d\t chi2: %f\n", mTracklets[0], mTracklets[1], mLevel, mChi2); +} + } // namespace its } // namespace o2 #endif /* TRACKINGITSU_INCLUDE_CACELL_H_ */