diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index 37f392ebbd3a7..4ac22607a580b 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -76,9 +76,10 @@ class TimeFrameGPU : public TimeFrame
   void createCellsBuffers(const int);
   void createCellsDevice();
   void createCellsLUTDevice();
-  void createNeighboursDevice();
+  void createNeighboursIndexTablesDevice();
   void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
   void createNeighboursLUTDevice(const int, const unsigned int);
+  void createNeighboursDeviceArray();
   void createTrackITSExtDevice(std::vector<CellSeed>&);
   void downloadTrackITSExtDevice(std::vector<CellSeed>&);
   void downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>&, const int);
@@ -113,7 +114,10 @@ class TimeFrameGPU : public TimeFrame
   Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
   TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
   int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
-  gpuPair<int, int>* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
+  gsl::span<int*> getDeviceNeighboursLUTs() { return mNeighboursLUTDevice; }
+  gpuPair<int, int>* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; }
+  int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
+  int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; }
   TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
   const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
   const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
@@ -195,7 +199,9 @@ class TimeFrameGPU : public TimeFrame
 
   Road<nLayers - 2>* mRoadsDevice;
   TrackITSExt* mTrackITSExtDevice;
-  std::array<gpuPair<int, int>*, nLayers - 2> mNeighboursDevice;
+  std::array<gpuPair<int, int>*, nLayers - 2> mNeighbourPairsDevice;
+  std::array<int*, nLayers - 2> mNeighboursDevice;
+  int** mNeighboursDeviceArray;
   std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
   const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;
 
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
index b0db34199df09..f50a11a83805f 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -176,9 +176,31 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
                                   const int nBlocks,
                                   const int nThreads);
 
-void filterCellNeighboursHandler(std::vector<int>&,
-                                 gpuPair<int, int>*,
-                                 unsigned int);
+int filterCellNeighboursHandler(std::vector<int>&,
+                                gpuPair<int, int>*,
+                                int*,
+                                unsigned int);
+
+template <int nLayers = 7>
+void processNeighboursHandler(const int startLayer,
+                              const int startLevel,
+                              CellSeed** allCellSeeds,
+                              CellSeed* currentCellSeeds,
+                              const unsigned int nCurrentCells,
+                              const unsigned char** usedClusters,
+                              int* neighbours,
+                              gsl::span<int*> neighboursDeviceLUTs,
+                              const TrackingFrameInfo** foundTrackingFrameInfo,
+                              const float bz,
+                              const float MaxChi2ClusterAttachment,
+                              const o2::base::Propagator* propagator,
+                              const o2::base::PropagatorF::MatCorrType matCorrType,
+                              const std::vector<int>& lastCellIdHost,        // temporary host vector
+                              const std::vector<CellSeed>& lastCellSeedHost, // temporary host vector
+                              std::vector<int>& updatedCellIdHost,           // temporary host vector
+                              std::vector<CellSeed>& updatedCellSeedHost,    // temporary host vector
+                              const int nBlocks,
+                              const int nThreads);
 
 void trackSeedHandler(CellSeed* trackSeeds,
                       const TrackingFrameInfo** foundTrackingFrameInfo,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index 4bd15c0203d81..fd067b9930fd0 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -306,17 +306,28 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createNeighboursDevice()
+void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds");
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
+  // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
+  allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
+  checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
+  checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
     checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
   }
-  allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
+  LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
+  checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -400,19 +411,20 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, st
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
   mCellsNeighbours[layer].clear();
   mCellsNeighbours[layer].resize(neighbours.size());
+  LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
+  allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
+  checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
   LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
+void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
-  LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
-  checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator());
+  checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -459,7 +471,7 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vecto
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
   LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
   // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
-  checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+  checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
 }
 
 template <int nLayers>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
index 4e34600562ae7..395aab3a470ac 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -91,13 +91,12 @@ template <int nLayers>
 void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex)
 {
   auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
-  // TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex);
   mTimeFrameGPU->createTrackletsLUTDevice(iteration);
 
   const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f);
   gsl::span<const Vertex> diamondSpan(&diamondVert, 1);
   int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
-  int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};
+  int endROF{o2::gpu::CAMath::Min(mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())};
 
   countTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
                                        mTimeFrameGPU->getDeviceMultCutMask(),
@@ -214,7 +213,7 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
 {
-  mTimeFrameGPU->createNeighboursDevice();
+  mTimeFrameGPU->createNeighboursIndexTablesDevice();
   auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
   std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
   for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
@@ -228,17 +227,16 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
       continue;
     }
 
-    int layerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer].size())};
     mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
     countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
                                mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
                                mTimeFrameGPU->getDeviceArrayCellsLUT(),
-                               mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                               mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
                                mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
                                mTrkParams[0].MaxChi2ClusterAttachment,
                                mBz,
                                iLayer,
-                               layerCellsNum,
+                               mTimeFrameGPU->getNCells()[iLayer],
                                nextLayerCellsNum,
                                1e2,
                                conf.nBlocks,
@@ -250,12 +248,12 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
     computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
                                  mTimeFrameGPU->getDeviceNeighboursLUT(iLayer),
                                  mTimeFrameGPU->getDeviceArrayCellsLUT(),
-                                 mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                                 mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
                                  mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
                                  mTrkParams[0].MaxChi2ClusterAttachment,
                                  mBz,
                                  iLayer,
-                                 layerCellsNum,
+                                 mTimeFrameGPU->getNCells()[iLayer],
                                  nextLayerCellsNum,
                                  1e2,
                                  conf.nBlocks,
@@ -264,9 +262,11 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
     mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size());
 
     filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer],
+                                mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
                                 mTimeFrameGPU->getDeviceNeighbours(iLayer),
                                 cellsNeighboursLayer[iLayer].size());
   }
+  mTimeFrameGPU->createNeighboursDeviceArray();
   mTimeFrameGPU->downloadCellsDevice();
   mTimeFrameGPU->unregisterRest();
 };
@@ -274,6 +274,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
 {
+  auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
   for (int startLevel{mTrkParams[iteration].CellsPerRoad()}; startLevel >= mTrkParams[iteration].CellMinimumLevel(); --startLevel) {
     const int minimumLayer{startLevel - 1};
     std::vector<CellSeed> trackSeeds;
@@ -284,7 +285,25 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       std::vector<int> lastCellId, updatedCellId;
       std::vector<CellSeed> lastCellSeed, updatedCellSeed;
 
-      processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId);
+      processNeighboursHandler<nLayers>(startLayer,
+                                        startLevel,
+                                        mTimeFrameGPU->getDeviceArrayCells(),
+                                        mTimeFrameGPU->getDeviceCells()[startLayer],
+                                        mTimeFrameGPU->getNCells()[startLayer],
+                                        mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                        mTimeFrameGPU->getDeviceNeighbours(startLayer - 1),
+                                        mTimeFrameGPU->getDeviceNeighboursLUTs(),
+                                        mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
+                                        mBz,
+                                        mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
+                                        mTimeFrameGPU->getDevicePropagator(),
+                                        mCorrType,
+                                        lastCellId,      // temporary host vector
+                                        lastCellSeed,    // temporary host vector
+                                        updatedCellId,   // temporary host vectors
+                                        updatedCellSeed, // temporary host vectors
+                                        conf.nBlocks,
+                                        conf.nThreads);
 
       int level = startLevel;
       for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
@@ -292,7 +311,25 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
         lastCellId.swap(updatedCellId);
         std::vector<CellSeed>().swap(updatedCellSeed); /// tame the memory peaks
         updatedCellId.clear();
-        processNeighbours(iLayer, --level, lastCellSeed, lastCellId, updatedCellSeed, updatedCellId);
+        processNeighboursHandler<nLayers>(iLayer,
+                                          --level,
+                                          mTimeFrameGPU->getDeviceArrayCells(),
+                                          mTimeFrameGPU->getDeviceCells()[iLayer],
+                                          mTimeFrameGPU->getNCells()[iLayer],
+                                          mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                          mTimeFrameGPU->getDeviceNeighbours(iLayer - 1),
+                                          mTimeFrameGPU->getDeviceNeighboursLUTs(),
+                                          mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
+                                          mBz,
+                                          mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
+                                          mTimeFrameGPU->getDevicePropagator(),
+                                          mCorrType,
+                                          lastCellId,      // temporary host vector
+                                          lastCellSeed,    // temporary host vector
+                                          updatedCellId,   // temporary host vectors
+                                          updatedCellSeed, // temporary host vectors
+                                          conf.nBlocks,
+                                          conf.nThreads);
       }
       for (auto& seed : updatedCellSeed) {
         if (seed.getQ2Pt() > 1.e3 || seed.getChi2() > mTrkParams[0].MaxChi2NDF * ((startLevel + 2) * 2 - 5)) {
@@ -307,16 +344,16 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
     }
     mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
     mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds);
-    auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
-    trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),             // CellSeed* trackSeeds,
-                     mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo,
-                     mTimeFrameGPU->getDeviceTrackITSExt(),            // o2::its::TrackITSExt* tracks,
+
+    trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),             // CellSeed* trackSeeds
+                     mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo
+                     mTimeFrameGPU->getDeviceTrackITSExt(),            // o2::its::TrackITSExt* tracks
                      mTrkParams[iteration].MinPt,                      // std::vector<float>& minPtsHost,
-                     trackSeeds.size(),                                // const size_t nSeeds,
-                     mBz,                                              // const float Bz,
+                     trackSeeds.size(),                                // const size_t nSeeds
+                     mBz,                                              // const float Bz
                      startLevel,                                       // const int startLevel,
-                     mTrkParams[0].MaxChi2ClusterAttachment,           // float maxChi2ClusterAttachment,
-                     mTrkParams[0].MaxChi2NDF,                         // float maxChi2NDF,
+                     mTrkParams[0].MaxChi2ClusterAttachment,           // float maxChi2ClusterAttachment
+                     mTrkParams[0].MaxChi2NDF,                         // float maxChi2NDF
                      mTimeFrameGPU->getDevicePropagator(),             // const o2::base::Propagator* propagator
                      mCorrType,                                        // o2::base::PropagatorImpl<float>::MatCorrType
                      conf.nBlocks,
@@ -371,8 +408,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       }
       mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
     }
+    mTimeFrameGPU->loadUsedClustersDevice();
   }
-  mTimeFrameGPU->loadUsedClustersDevice();
   if (iteration == mTrkParams.size() - 1) {
     mTimeFrameGPU->unregisterHostMemory(0);
   }
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index a723f36c17dd0..67a515df1c730 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -112,7 +112,7 @@ GPUd() bool fitTrack(TrackITSExt& track,
                      float chi2ndfcut,
                      float maxQoverPt,
                      int nCl,
-                     float Bz,
+                     float bz,
                      const TrackingFrameInfo** tfInfos,
                      const o2::base::Propagator* prop,
                      o2::base::PropagatorF::MatCorrType matCorrType)
@@ -128,7 +128,7 @@ GPUd() bool fitTrack(TrackITSExt& track,
 
     if (!prop->propagateToX(track,
                             trackingHit.xTrackingFrame,
-                            Bz,
+                            bz,
                             o2::base::PropagatorImpl<float>::MAX_SIN_PHI,
                             o2::base::PropagatorImpl<float>::MAX_STEP,
                             matCorrType)) {
@@ -192,9 +192,6 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1,
                              0.f, 0.f, 0.f, 0.f, sg2q2pt});
 }
 
-// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); };
-// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; };
-
 struct sort_tracklets {
   GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }
 };
@@ -269,7 +266,7 @@ GPUg() void fitTrackSeedsKernel(
   o2::its::TrackITSExt* tracks,
   const float* minPts,
   const unsigned int nSeeds,
-  const float Bz,
+  const float bz,
   const int startLevel,
   float maxChi2ClusterAttachment,
   float maxChi2NDF,
@@ -295,7 +292,7 @@ GPUg() void fitTrackSeedsKernel(
                                maxChi2NDF,                   // float maxChi2NDF,
                                o2::constants::math::VeryBig, // float maxQoverPt,
                                0,                            // nCl,
-                               Bz,                           // float Bz,
+                               bz,                           // float bz,
                                foundTrackingFrameInfo,       // TrackingFrameInfo** trackingFrameInfo,
                                propagator,                   // const o2::base::Propagator* propagator,
                                matCorrType);                 // o2::base::PropagatorF::MatCorrType matCorrType
@@ -314,7 +311,7 @@ GPUg() void fitTrackSeedsKernel(
                           maxChi2NDF,               // float maxChi2NDF,
                           50.f,                     // float maxQoverPt,
                           0,                        // nCl,
-                          Bz,                       // float Bz,
+                          bz,                       // float bz,
                           foundTrackingFrameInfo,   // TrackingFrameInfo** trackingFrameInfo,
                           propagator,               // const o2::base::Propagator* propagator,
                           matCorrType);             // o2::base::PropagatorF::MatCorrType matCorrType
@@ -368,7 +365,6 @@ GPUg() void computeLayerCellNeighboursKernel(
         // FIXME: this is prone to race conditions: check on level is not atomic
         const int currentCellLevel{currentCellSeed.getLevel()};
         if (currentCellLevel >= nextCellSeed.getLevel()) {
-          // atomicExch(cellSeedArray[layerIndex + 1][iNextCell].getLevelPtr(), currentCellLevel + 1); // Update level on corresponding cell
           cellSeedArray[layerIndex + 1][iNextCell].setLevel(currentCellLevel + 1);
         }
       }
@@ -575,6 +571,107 @@ GPUg() void computeLayerTrackletsMultiROFKernel(
   }
 }
 
+template <int nLayers = 7>
+GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets,
+                                              int* trackletsLookUpTable,
+                                              const int nTracklets)
+{
+  for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) {
+    atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1);
+  }
+}
+
+template <bool dryRun, bool debug = false, int nLayers = 7>
+GPUg() void processNeighboursKernel(const int layer,
+                                    const int level,
+                                    CellSeed** allCellSeeds,
+                                    CellSeed* currentCellSeeds,
+                                    const int* currentCellIds,
+                                    const unsigned int nCurrentCells,
+                                    CellSeed* updatedCellSeeds,
+                                    int* updatedCellsIds,
+                                    int* foundSeedsTable,               // auxiliary only in GPU code to compute the number of cells per iteration
+                                    const unsigned char** usedClusters, // Used clusters
+                                    int* neighbours,
+                                    int* neighboursLUT,
+                                    const TrackingFrameInfo** foundTrackingFrameInfo,
+                                    const float bz,
+                                    const float MaxChi2ClusterAttachment,
+                                    const o2::base::Propagator* propagator,
+                                    const o2::base::PropagatorF::MatCorrType matCorrType)
+{
+  constexpr float radl = 9.36f;                                                           // Radiation length of Si [cm].
+  constexpr float rho = 2.33f;                                                            // Density of Si [g/cm^3].
+  constexpr float layerxX0[7] = {5.e-3f, 5.e-3f, 5.e-3f, 1.e-2f, 1.e-2f, 1.e-2f, 1.e-2f}; // Hardcoded here for the moment.
+  for (unsigned int iCurrentCell = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCell < nCurrentCells; iCurrentCell += blockDim.x * gridDim.x) {
+    int foundSeeds{0};
+    const auto& currentCell{currentCellSeeds[iCurrentCell]};
+    if (currentCell.getLevel() != level) {
+      continue;
+    }
+    if (currentCellIds == nullptr && (usedClusters[layer][currentCell.getFirstClusterIndex()] ||
+                                      usedClusters[layer + 1][currentCell.getSecondClusterIndex()] ||
+                                      usedClusters[layer + 2][currentCell.getThirdClusterIndex()])) {
+      continue;
+    }
+    const int cellId = currentCellIds == nullptr ? iCurrentCell : currentCellIds[iCurrentCell];
+
+    const int startNeighbourId{cellId ? neighboursLUT[cellId - 1] : 0};
+    const int endNeighbourId{neighboursLUT[cellId]};
+
+    for (int iNeighbourCell{startNeighbourId}; iNeighbourCell < endNeighbourId; ++iNeighbourCell) {
+      const int neighbourCellId = neighbours[iNeighbourCell];
+      const CellSeed& neighbourCell = allCellSeeds[layer - 1][neighbourCellId];
+
+      if (neighbourCell.getSecondTrackletIndex() != currentCell.getFirstTrackletIndex()) {
+        continue;
+      }
+      if (usedClusters[layer - 1][neighbourCell.getFirstClusterIndex()]) {
+        continue;
+      }
+      if (currentCell.getLevel() - 1 != neighbourCell.getLevel()) {
+        continue;
+      }
+      CellSeed seed{currentCell};
+      auto& trHit = foundTrackingFrameInfo[layer - 1][neighbourCell.getFirstClusterIndex()];
+
+      if (!seed.rotate(trHit.alphaTrackingFrame)) {
+        continue;
+      }
+
+      if (!propagator->propagateToX(seed, trHit.xTrackingFrame, bz, o2::base::PropagatorImpl<float>::MAX_SIN_PHI, o2::base::PropagatorImpl<float>::MAX_STEP, matCorrType)) {
+        continue;
+      }
+
+      if (matCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) {
+        if (!seed.correctForMaterial(layerxX0[layer - 1], layerxX0[layer - 1] * radl * rho, true)) {
+          continue;
+        }
+      }
+
+      auto predChi2{seed.getPredictedChi2Quiet(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)};
+      if ((predChi2 > MaxChi2ClusterAttachment) || predChi2 < 0.f) {
+        continue;
+      }
+      seed.setChi2(seed.getChi2() + predChi2);
+      if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) {
+        continue;
+      }
+      seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex();
+      seed.setLevel(neighbourCell.getLevel());
+      seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex());
+      seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex());
+      if constexpr (dryRun) {
+        foundSeedsTable[iCurrentCell]++;
+      } else {
+        updatedCellsIds[foundSeedsTable[iCurrentCell] + foundSeeds] = neighbourCellId;
+        updatedCellSeeds[foundSeedsTable[iCurrentCell] + foundSeeds] = seed;
+      }
+      foundSeeds++;
+    }
+  }
+}
+
 /////////////////////////////////////////
 // Debug Kernels
 /////////////////////////////////////////
@@ -584,6 +681,7 @@ GPUd() void pPointer(T* ptr)
 {
   printf("[%p]\t", ptr);
 }
+
 template <typename... Args>
 GPUg() void printPointersKernel(std::tuple<Args...> args)
 {
@@ -692,35 +790,14 @@ GPUg() void printTrackletsLUTPerROF(const int layerId,
   }
 }
 
-template <int nLayers = 7>
-GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets,
-                                              int* trackletsLookUpTable,
-                                              const int nTracklets)
-{
-  for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) {
-    atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1);
-  }
-}
-
-// Decrease LUT entries corresponding to duplicated tracklets. NB: duplicate tracklets are removed separately (see const Tracklets*).
-GPUg() void removeDuplicateTrackletsEntriesLUTKernel(
-  int* trackletsLookUpTable,
-  const Tracklet* tracklets,
-  const int* nTracklets,
-  const int layerIndex)
+GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = 0)
 {
-  int id0{-1}, id1{-1};
-  for (int iTracklet{0}; iTracklet < nTracklets[layerIndex]; ++iTracklet) {
-    auto& trk = tracklets[iTracklet];
-    if (trk.firstClusterIndex == id0 && trk.secondClusterIndex == id1) {
-      trackletsLookUpTable[id0]--;
-    } else {
-      id0 = trk.firstClusterIndex;
-      id1 = trk.secondClusterIndex;
+  for (unsigned int iCell{0}; iCell < nCells; ++iCell) {
+    if (threadIdx.x == tId) {
+      seed[iCell].printCell();
     }
   }
 }
-
 } // namespace gpu
 
 template <int nLayers>
@@ -996,8 +1073,8 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
     layerIndex,
     nCells,
     maxCellNeighbours);
-  gpuCheckError(cudaPeekAtLastError());
-  gpuCheckError(cudaDeviceSynchronize());
+  // gpuCheckError(cudaPeekAtLastError());
+  // gpuCheckError(cudaDeviceSynchronize());
   void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr;
   size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0;
   gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage,     // d_temp_storage
@@ -1062,29 +1139,182 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
   gpuCheckError(cudaDeviceSynchronize());
 }
 
-void filterCellNeighboursHandler(std::vector<int>& neighHost,
-                                 gpuPair<int, int>* cellNeighbours,
-                                 unsigned int nNeigh)
+int filterCellNeighboursHandler(std::vector<int>& neighHost, // TODO: eventually remove this!
+                                gpuPair<int, int>* cellNeighbourPairs,
+                                int* cellNeighbours,
+                                unsigned int nNeigh)
 {
-  thrust::device_ptr<gpuPair<int, int>> neighVector(cellNeighbours);
+  thrust::device_ptr<gpuPair<int, int>> neighVectorPairs(cellNeighbourPairs);
+  thrust::device_ptr<int> validNeighs(cellNeighbours);
   thrust::device_vector<int> keys(nNeigh); // TODO: externally allocate.
   thrust::device_vector<int> vals(nNeigh); // TODO: externally allocate.
-  thrust::copy(thrust::make_transform_iterator(neighVector, gpu::pair_to_second<int, int>()),
-               thrust::make_transform_iterator(neighVector + nNeigh, gpu::pair_to_second<int, int>()),
+  thrust::copy(thrust::make_transform_iterator(neighVectorPairs, gpu::pair_to_second<int, int>()),
+               thrust::make_transform_iterator(neighVectorPairs + nNeigh, gpu::pair_to_second<int, int>()),
                keys.begin());
   thrust::sequence(vals.begin(), vals.end());
   thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
   thrust::device_vector<gpuPair<int, int>> sortedNeigh(nNeigh);
-  thrust::copy(thrust::make_permutation_iterator(neighVector, vals.begin()),
-               thrust::make_permutation_iterator(neighVector, vals.end()),
+  thrust::copy(thrust::make_permutation_iterator(neighVectorPairs, vals.begin()),
+               thrust::make_permutation_iterator(neighVectorPairs, vals.end()),
                sortedNeigh.begin());
   discardResult(cudaDeviceSynchronize());
   auto trimmedBegin = thrust::find_if(sortedNeigh.begin(), sortedNeigh.end(), gpu::is_valid_pair<int, int>()); // trim leading -1s
   auto trimmedSize = sortedNeigh.end() - trimmedBegin;
-  thrust::device_vector<int> validNeigh(trimmedSize);
   neighHost.resize(trimmedSize);
-  thrust::transform(trimmedBegin, sortedNeigh.end(), validNeigh.begin(), gpu::pair_to_first<int, int>());
-  gpuCheckError(cudaMemcpy(neighHost.data(), thrust::raw_pointer_cast(validNeigh.data()), trimmedSize * sizeof(int), cudaMemcpyDeviceToHost));
+  thrust::transform(trimmedBegin, sortedNeigh.end(), validNeighs, gpu::pair_to_first<int, int>());
+  gpuCheckError(cudaMemcpy(neighHost.data(), cellNeighbours, trimmedSize * sizeof(int), cudaMemcpyDeviceToHost));
+
+  return trimmedSize;
+}
+
+template <int nLayers>
+void processNeighboursHandler(const int startLayer,
+                              const int startLevel,
+                              CellSeed** allCellSeeds,
+                              CellSeed* currentCellSeeds,
+                              const unsigned int nCurrentCells,
+                              const unsigned char** usedClusters,
+                              int* neighbours,
+                              gsl::span<int*> neighboursDeviceLUTs,
+                              const TrackingFrameInfo** foundTrackingFrameInfo,
+                              const float bz,
+                              const float MaxChi2ClusterAttachment,
+                              const o2::base::Propagator* propagator,
+                              const o2::base::PropagatorF::MatCorrType matCorrType,
+                              const std::vector<int>& lastCellIdHost,        // temporary host vector
+                              const std::vector<CellSeed>& lastCellSeedHost, // temporary host vector
+                              std::vector<int>& updatedCellIdHost,           // temporary host vector
+                              std::vector<CellSeed>& updatedCellSeedHost,    // temporary host vector
+                              const int nBlocks,
+                              const int nThreads)
+{
+  thrust::device_vector<int> foundSeedsTable(nCurrentCells + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this.
+  thrust::device_vector<int> lastCellIds(lastCellIdHost);
+  thrust::device_vector<CellSeed> lastCellSeed(lastCellSeedHost);
+  gpu::processNeighboursKernel<true><<<nBlocks, nThreads>>>(startLayer,
+                                                            startLevel,
+                                                            allCellSeeds,
+                                                            lastCellIdHost.empty() ? currentCellSeeds : thrust::raw_pointer_cast(&lastCellSeed[0]), // lastCellSeeds
+                                                            lastCellIdHost.empty() ? nullptr : thrust::raw_pointer_cast(&lastCellIds[0]),           // lastCellIds,
+                                                            lastCellIdHost.empty() ? nCurrentCells : lastCellSeedHost.size(),
+                                                            nullptr,                                       // updatedCellSeeds,
+                                                            nullptr,                                       // updatedCellsIds,
+                                                            thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
+                                                            usedClusters,                                  // Used clusters
+                                                            neighbours,
+                                                            neighboursDeviceLUTs[startLayer - 1],
+                                                            foundTrackingFrameInfo,
+                                                            bz,
+                                                            MaxChi2ClusterAttachment,
+                                                            propagator,
+                                                            matCorrType);
+  void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr;
+  size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0;
+  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,                                // d_temp_storage
+                                              temp_storage_bytes,                            // temp_storage_bytes
+                                              thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+                                              thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+                                              nCurrentCells + 1,                             // num_items
+                                              0));
+  discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,                                // d_temp_storage
+                                              temp_storage_bytes,                            // temp_storage_bytes
+                                              thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+                                              thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+                                              nCurrentCells + 1,                             // num_items
+                                              0));
+
+  thrust::device_vector<int> updatedCellIds(foundSeedsTable.back()) /*, lastCellIds(foundSeedsTable.back())*/;
+  thrust::device_vector<CellSeed> updatedCellSeeds(foundSeedsTable.back()) /*, lastCellSeeds(foundSeedsTable.back())*/;
+
+  gpu::processNeighboursKernel<false><<<nBlocks, nThreads>>>(startLayer,
+                                                             startLevel,
+                                                             allCellSeeds,
+                                                             lastCellIdHost.empty() ? currentCellSeeds : thrust::raw_pointer_cast(&lastCellSeed[0]), // lastCellSeeds
+                                                             lastCellIdHost.empty() ? nullptr : thrust::raw_pointer_cast(&lastCellIds[0]),           // lastCellIds,
+                                                             lastCellIdHost.empty() ? nCurrentCells : lastCellSeedHost.size(),
+                                                             thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds
+                                                             thrust::raw_pointer_cast(&updatedCellIds[0]),   // updatedCellsIds
+                                                             thrust::raw_pointer_cast(&foundSeedsTable[0]),  // auxiliary only in GPU code to compute the number of cells per iteration
+                                                             usedClusters,                                   // Used clusters
+                                                             neighbours,
+                                                             neighboursDeviceLUTs[startLayer - 1],
+                                                             foundTrackingFrameInfo,
+                                                             bz,
+                                                             MaxChi2ClusterAttachment,
+                                                             propagator,
+                                                             matCorrType);
+
+  // Temporary copyback to host to validate the kernel
+  updatedCellIdHost.resize(updatedCellIds.size());
+  updatedCellSeedHost.resize(updatedCellSeeds.size());
+  thrust::copy(updatedCellIds.begin(), updatedCellIds.end(), updatedCellIdHost.begin());
+  thrust::copy(updatedCellSeeds.begin(), updatedCellSeeds.end(), updatedCellSeedHost.begin());
+
+  // int level = startLevel;
+  // for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
+  //   --level;
+  //   lastCellSeeds.swap(updatedCellSeeds);
+  //   lastCellIds.swap(updatedCellIds);
+  //   foundSeedsTable.resize(lastCellSeeds.size() + 1);
+  //   thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0);
+
+  //   gpu::processNeighboursKernel<true, false><<<1, 1>>>(iLayer,
+  //                                                       level,
+  //                                                       allCellSeeds,
+  //                                                       thrust::raw_pointer_cast(&lastCellSeeds[0]),
+  //                                                       thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds,
+  //                                                       lastCellSeeds.size(),
+  //                                                       nullptr,                                       // updatedCellSeeds,
+  //                                                       nullptr,                                       // updatedCellsIds,
+  //                                                       thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
+  //                                                       usedClusters,                                  // Used clusters
+  //                                                       neighbours,
+  //                                                       neighboursDeviceLUTs[iLayer - 1],
+  //                                                       foundTrackingFrameInfo,
+  //                                                       bz,
+  //                                                       MaxChi2ClusterAttachment,
+  //                                                       propagator,
+  //                                                       matCorrType);
+
+  //   gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,                              // d_temp_storage
+  //                                               temp_storage_bytes,                            // temp_storage_bytes
+  //                                               thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+  //                                               thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+  //                                               foundSeedsTable.size(),                        // num_items
+  //                                               0));
+  //   discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+  //   gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,                              // d_temp_storage
+  //                                               temp_storage_bytes_2,                          // temp_storage_bytes
+  //                                               thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+  //                                               thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+  //                                               foundSeedsTable.size(),                        // num_items
+  //                                               0));
+  //   updatedCellIds.resize(foundSeedsTable.back(), 0);
+  //   updatedCellSeeds.resize(foundSeedsTable.back(), CellSeed());
+
+  //   gpu::processNeighboursKernel<false><<<1, 1>>>(iLayer,
+  //                                                 level,
+  //                                                 allCellSeeds,
+  //                                                 thrust::raw_pointer_cast(&lastCellSeeds[0]),
+  //                                                 thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds,
+  //                                                 lastCellSeeds.size(),
+  //                                                 thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds
+  //                                                 thrust::raw_pointer_cast(&updatedCellIds[0]),   // updatedCellsIds
+  //                                                 thrust::raw_pointer_cast(&foundSeedsTable[0]),  // auxiliary only in GPU code to compute the number of cells per iteration
+  //                                                 usedClusters,                                   // Used clusters
+  //                                                 neighbours,
+  //                                                 neighboursDeviceLUTs[iLayer - 1],
+  //                                                 foundTrackingFrameInfo,
+  //                                                 bz,
+  //                                                 MaxChi2ClusterAttachment,
+  //                                                 propagator,
+  //                                                 matCorrType);
+  //   gpu::printCellSeeds<<<1, 1>>>(thrust::raw_pointer_cast(&updatedCellSeeds[0]), updatedCellSeeds.size());
+  // }
+
+  gpuCheckError(cudaFree(d_temp_storage));
+  gpuCheckError(cudaFree(d_temp_storage_2));
 }
 
 void trackSeedHandler(CellSeed* trackSeeds,
@@ -1092,7 +1322,7 @@ void trackSeedHandler(CellSeed* trackSeeds,
                       o2::its::TrackITSExt* tracks,
                       std::vector<float>& minPtsHost,
                       const unsigned int nSeeds,
-                      const float Bz,
+                      const float bz,
                       const int startLevel,
                       float maxChi2ClusterAttachment,
                       float maxChi2NDF,
@@ -1108,7 +1338,7 @@ void trackSeedHandler(CellSeed* trackSeeds,
     tracks,                               // TrackITSExt*
     thrust::raw_pointer_cast(&minPts[0]), // const float* minPts,
     nSeeds,                               // const unsigned int
-    Bz,                                   // const float
+    bz,                                   // const float
     startLevel,                           // const int
     maxChi2ClusterAttachment,             // float
     maxChi2NDF,                           // float
@@ -1179,4 +1409,24 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils,
                                                std::vector<float>& mulScatAng,
                                                const int nBlocks,
                                                const int nThreads);
+
+template void processNeighboursHandler<7>(const int startLayer,
+                                          const int startLevel,
+                                          CellSeed** allCellSeeds,
+                                          CellSeed* currentCellSeeds,
+                                          const unsigned int nCurrentCells,
+                                          const unsigned char** usedClusters,
+                                          int* neighbours,
+                                          gsl::span<int*> neighboursDeviceLUTs,
+                                          const TrackingFrameInfo** foundTrackingFrameInfo,
+                                          const float bz,
+                                          const float MaxChi2ClusterAttachment,
+                                          const o2::base::Propagator* propagator,
+                                          const o2::base::PropagatorF::MatCorrType matCorrType,
+                                          const std::vector<int>& lastCellIdHost,        // temporary host vector
+                                          const std::vector<CellSeed>& lastCellSeedHost, // temporary host vector
+                                          std::vector<int>& updatedCellIdHost,           // temporary host vector
+                                          std::vector<CellSeed>& updatedCellSeedHost,    // temporary host vector
+                                          const int nBlocks,
+                                          const int nThreads);
 } // namespace o2::its
\ No newline at end of file
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h
index cb9f28665cf07..fa0473ae88462 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h
@@ -15,10 +15,10 @@
 
 #ifndef TRACKINGITSU_INCLUDE_CACELL_H_
 #define TRACKINGITSU_INCLUDE_CACELL_H_
-
 #ifndef GPUCA_GPUCODE_DEVICE
 #include <array>
 #include <vector>
+#include <iostream>
 #endif
 
 #include "GPUCommonDef.h"
@@ -104,6 +104,7 @@ class CellSeed final : public o2::track::TrackParCovF
   GPUhd() int* getLevelPtr() { return &mLevel; }
   GPUhd() int* getClusters() { return mClusters; }
   GPUhd() int getCluster(int i) const { return mClusters[i]; }
+  GPUhdi() void printCell() const;
 
  private:
   int mClusters[7] = {-1, -1, -1, -1, -1, -1, -1};
@@ -112,6 +113,11 @@ class CellSeed final : public o2::track::TrackParCovF
   float mChi2 = 0.f;
 };
 
+GPUhdi() void CellSeed::printCell() const
+{
+  printf("trkl: %d, %d\t lvl: %d\t chi2: %f\n", mTracklets[0], mTracklets[1], mLevel, mChi2);
+}
+
 } // namespace its
 } // namespace o2
 #endif /* TRACKINGITSU_INCLUDE_CACELL_H_ */