From 8ba8566b0f4f757bcbd3ef255e2840bc4cc96f3c Mon Sep 17 00:00:00 2001
From: Matti Kortelainen
Date: Tue, 29 Dec 2020 06:29:28 -0800
Subject: [PATCH 1/6] [cudacompat] Copy cuda to cudacompat

---
 src/cudacompat/CUDACore/AtomicPairCounter.h | 58 +
 .../CUDACore/CachingDeviceAllocator.h | 747 ++++++++++++
 .../CUDACore/CachingHostAllocator.h | 648 +++++++++++
 src/cudacompat/CUDACore/ContextState.cc | 17 +
 src/cudacompat/CUDACore/ContextState.h | 61 +
 src/cudacompat/CUDACore/ESProduct.h | 102 ++
 src/cudacompat/CUDACore/EventCache.cc | 68 ++
 src/cudacompat/CUDACore/EventCache.h | 57 +
 src/cudacompat/CUDACore/HistoContainer.h | 323 ++++++
 src/cudacompat/CUDACore/HostAllocator.h | 55 +
 src/cudacompat/CUDACore/HostProduct.h | 29 +
 src/cudacompat/CUDACore/Product.h | 60 +
 src/cudacompat/CUDACore/ProductBase.cc | 29 +
 src/cudacompat/CUDACore/ProductBase.h | 93 ++
 src/cudacompat/CUDACore/ScopedContext.cc | 116 ++
 src/cudacompat/CUDACore/ScopedContext.h | 241 ++++
 src/cudacompat/CUDACore/ScopedSetDevice.h | 30 +
 src/cudacompat/CUDACore/SharedEventPtr.h | 18 +
 src/cudacompat/CUDACore/SharedStreamPtr.h | 18 +
 src/cudacompat/CUDACore/SimpleVector.h | 141 +++
 src/cudacompat/CUDACore/StreamCache.cc | 43 +
 src/cudacompat/CUDACore/StreamCache.h | 50 +
 src/cudacompat/CUDACore/VecArray.h | 106 ++
 src/cudacompat/CUDACore/allocate_device.cc | 39 +
 src/cudacompat/CUDACore/allocate_device.h | 16 +
 src/cudacompat/CUDACore/allocate_host.cc | 36 +
 src/cudacompat/CUDACore/allocate_host.h | 16 +
 src/cudacompat/CUDACore/chooseDevice.cc | 14 +
 src/cudacompat/CUDACore/chooseDevice.h | 10 +
 src/cudacompat/CUDACore/copyAsync.h | 69 ++
 src/cudacompat/CUDACore/cudaCheck.h | 61 +
 src/cudacompat/CUDACore/cudaCompat.cc | 17 +
 src/cudacompat/CUDACore/cudaCompat.h | 112 ++
 src/cudacompat/CUDACore/cuda_assert.h | 18 +
 src/cudacompat/CUDACore/cuda_cxx17.h | 63 ++
 src/cudacompat/CUDACore/cudastdAlgorithm.h | 70 ++
 src/cudacompat/CUDACore/currentDevice.h | 18 +
 .../CUDACore/deviceAllocatorStatus.cc | 7 +
 .../CUDACore/deviceAllocatorStatus.h | 23 +
 src/cudacompat/CUDACore/deviceCount.h | 18 +
 src/cudacompat/CUDACore/device_unique_ptr.h | 101 ++
 src/cudacompat/CUDACore/eigenSoA.h | 55 +
 .../CUDACore/eventWorkHasCompleted.h | 32 +
 .../CUDACore/getCachingDeviceAllocator.h | 77 ++
 .../CUDACore/getCachingHostAllocator.h | 46 +
 .../CUDACore/host_noncached_unique_ptr.h | 74 ++
 src/cudacompat/CUDACore/host_unique_ptr.h | 80 ++
 src/cudacompat/CUDACore/launch.h | 147 +++
 src/cudacompat/CUDACore/memsetAsync.h | 33 +
 src/cudacompat/CUDACore/prefixScan.h | 188 +++
 src/cudacompat/CUDACore/radixSort.h | 277 +++++
 src/cudacompat/CUDACore/requireDevices.cc | 30 +
 src/cudacompat/CUDACore/requireDevices.h | 17 +
 src/cudacompat/CUDADataFormats/BeamSpotCUDA.h | 33 +
 .../CUDADataFormats/HeterogeneousSoA.h | 189 ++++
 .../CUDADataFormats/PixelTrackHeterogeneous.h | 74 ++
 .../CUDADataFormats/SiPixelClustersCUDA.cc | 21 +
 .../CUDADataFormats/SiPixelClustersCUDA.h | 73 ++
 .../CUDADataFormats/SiPixelDigiErrorsCUDA.cc | 42 +
 .../CUDADataFormats/SiPixelDigiErrorsCUDA.h | 41 +
 .../CUDADataFormats/SiPixelDigisCUDA.cc | 50 +
 .../CUDADataFormats/SiPixelDigisCUDA.h | 98 ++
 .../CUDADataFormats/TrackingRecHit2DCUDA.cc | 43 +
 .../CUDADataFormats/TrackingRecHit2DCUDA.h | 1 +
 .../TrackingRecHit2DHeterogeneous.h | 155 +++
 .../CUDADataFormats/TrackingRecHit2DSOAView.h | 101 ++
 .../CUDADataFormats/TrajectoryStateSoA.h | 59 +
 .../CUDADataFormats/ZVertexHeterogeneous.h | 14 +
 src/cudacompat/CUDADataFormats/ZVertexSoA.h | 26 +
 .../CUDADataFormats/gpuClusteringConstants.h | 32 +
src/cudacompat/CondFormats/PixelCPEFast.cc | 85 ++ src/cudacompat/CondFormats/PixelCPEFast.h | 43 + .../CondFormats/SiPixelFedCablingMapGPU.h | 26 + .../SiPixelFedCablingMapGPUWrapper.cc | 54 + .../SiPixelFedCablingMapGPUWrapper.h | 46 + src/cudacompat/CondFormats/SiPixelFedIds.h | 17 + .../SiPixelGainCalibrationForHLTGPU.cc | 38 + .../SiPixelGainCalibrationForHLTGPU.h | 28 + .../CondFormats/SiPixelGainForHLTonGPU.h | 74 ++ src/cudacompat/CondFormats/pixelCPEforGPU.h | 344 ++++++ src/cudacompat/DataFormats/BeamSpotPOD.h | 21 + src/cudacompat/DataFormats/DigiClusterCount.h | 17 + src/cudacompat/DataFormats/FEDHeader.cc | 46 + src/cudacompat/DataFormats/FEDHeader.h | 59 + src/cudacompat/DataFormats/FEDNumbering.cc | 111 ++ src/cudacompat/DataFormats/FEDNumbering.h | 132 +++ src/cudacompat/DataFormats/FEDRawData.cc | 34 + src/cudacompat/DataFormats/FEDRawData.h | 55 + .../DataFormats/FEDRawDataCollection.cc | 17 + .../DataFormats/FEDRawDataCollection.h | 38 + src/cudacompat/DataFormats/FEDTrailer.cc | 47 + src/cudacompat/DataFormats/FEDTrailer.h | 62 + src/cudacompat/DataFormats/PixelErrors.h | 21 + src/cudacompat/DataFormats/SOARotation.h | 140 +++ src/cudacompat/DataFormats/SiPixelDigisSoA.cc | 12 + src/cudacompat/DataFormats/SiPixelDigisSoA.h | 33 + .../DataFormats/SiPixelRawDataError.cc | 101 ++ .../DataFormats/SiPixelRawDataError.h | 57 + src/cudacompat/DataFormats/TrackCount.h | 14 + src/cudacompat/DataFormats/VertexCount.h | 14 + src/cudacompat/DataFormats/approx_atan2.h | 290 +++++ src/cudacompat/DataFormats/fed_header.h | 66 ++ src/cudacompat/DataFormats/fed_trailer.h | 76 ++ src/cudacompat/Framework/EDGetToken.h | 85 ++ src/cudacompat/Framework/EDProducer.h | 53 + src/cudacompat/Framework/EDPutToken.h | 89 ++ src/cudacompat/Framework/ESPluginFactory.cc | 34 + src/cudacompat/Framework/ESPluginFactory.h | 60 + src/cudacompat/Framework/ESProducer.h | 16 + src/cudacompat/Framework/EmptyWaitingTask.h | 27 + src/cudacompat/Framework/Event.h | 56 + src/cudacompat/Framework/EventSetup.h | 56 + src/cudacompat/Framework/FunctorTask.h | 52 + src/cudacompat/Framework/PluginFactory.cc | 34 + src/cudacompat/Framework/PluginFactory.h | 58 + src/cudacompat/Framework/ProductRegistry.h | 73 ++ .../Framework/ReusableObjectHolder.h | 167 +++ src/cudacompat/Framework/RunningAverage.h | 53 + src/cudacompat/Framework/WaitingTask.h | 93 ++ src/cudacompat/Framework/WaitingTaskHolder.h | 90 ++ src/cudacompat/Framework/WaitingTaskList.cc | 172 +++ src/cudacompat/Framework/WaitingTaskList.h | 175 +++ .../Framework/WaitingTaskWithArenaHolder.cc | 96 ++ .../Framework/WaitingTaskWithArenaHolder.h | 100 ++ src/cudacompat/Framework/Worker.cc | 25 + src/cudacompat/Framework/Worker.h | 112 ++ src/cudacompat/Framework/hardware_pause.h | 33 + src/cudacompat/Geometry/phase1PixelTopology.h | 174 +++ src/cudacompat/Makefile | 147 +++ src/cudacompat/Makefile.deps | 12 + src/cudacompat/bin/EventProcessor.cc | 45 + src/cudacompat/bin/EventProcessor.h | 39 + src/cudacompat/bin/PluginManager.cc | 39 + src/cudacompat/bin/PluginManager.h | 26 + src/cudacompat/bin/SharedLibrary.cc | 82 ++ src/cudacompat/bin/SharedLibrary.h | 53 + src/cudacompat/bin/Source.cc | 100 ++ src/cudacompat/bin/Source.h | 40 + src/cudacompat/bin/StreamSchedule.cc | 94 ++ src/cudacompat/bin/StreamSchedule.h | 51 + src/cudacompat/bin/main.cc | 171 +++ .../BeamSpotESProducer.cc | 28 + .../plugin-BeamSpotProducer/BeamSpotToCUDA.cc | 44 + .../PixelTrackSoAFromCUDA.cc | 65 ++ .../plugin-PixelTriplets/BrokenLine.h | 565 +++++++++ .../BrokenLineFitOnGPU.cc | 68 
++ .../BrokenLineFitOnGPU.cu | 85 ++ .../plugin-PixelTriplets/BrokenLineFitOnGPU.h | 185 +++ .../plugin-PixelTriplets/CAConstants.h | 69 ++ .../plugin-PixelTriplets/CAHitNtupletCUDA.cc | 44 + .../CAHitNtupletGeneratorKernels.cc | 184 +++ .../CAHitNtupletGeneratorKernels.cu | 306 +++++ .../CAHitNtupletGeneratorKernels.h | 207 ++++ .../CAHitNtupletGeneratorKernelsAlloc.cc | 1 + .../CAHitNtupletGeneratorKernelsAlloc.cu | 1 + .../CAHitNtupletGeneratorKernelsAlloc.h | 39 + .../CAHitNtupletGeneratorKernelsImpl.h | 605 ++++++++++ .../CAHitNtupletGeneratorOnGPU.cc | 167 +++ .../CAHitNtupletGeneratorOnGPU.h | 56 + .../plugin-PixelTriplets/CircleEq.h | 107 ++ .../plugin-PixelTriplets/FitResult.h | 65 ++ .../plugin-PixelTriplets/FitUtils.h | 246 ++++ .../plugin-PixelTriplets/GPUCACell.h | 348 ++++++ .../plugin-PixelTriplets/HelixFitOnGPU.cc | 16 + .../plugin-PixelTriplets/HelixFitOnGPU.h | 68 ++ .../plugin-PixelTriplets/RiemannFit.h | 1005 +++++++++++++++++ .../plugin-PixelTriplets/RiemannFitOnGPU.cc | 110 ++ .../plugin-PixelTriplets/RiemannFitOnGPU.cu | 131 +++ .../plugin-PixelTriplets/RiemannFitOnGPU.h | 187 +++ .../plugin-PixelTriplets/choleskyInversion.h | 349 ++++++ .../plugin-PixelTriplets/gpuFishbone.h | 93 ++ .../plugin-PixelTriplets/gpuPixelDoublets.h | 130 +++ .../gpuPixelDoubletsAlgos.h | 244 ++++ .../PixelVertexProducerCUDA.cc | 90 ++ .../PixelVertexSoAFromCUDA.cc | 49 + .../gpuClusterTracksByDensity.h | 234 ++++ .../gpuClusterTracksDBSCAN.h | 242 ++++ .../gpuClusterTracksIterative.h | 213 ++++ .../gpuFitVertices.h | 113 ++ .../plugin-PixelVertexFinding/gpuSortByPt2.h | 73 ++ .../gpuSplitVertices.h | 139 +++ .../gpuVertexFinder.cc | 1 + .../gpuVertexFinder.cu | 1 + .../gpuVertexFinder.h | 83 ++ .../gpuVertexFinderImpl.h | 173 +++ .../plugin-SiPixelClusterizer/ErrorChecker.cc | 91 ++ .../plugin-SiPixelClusterizer/ErrorChecker.h | 33 + ...iPixelFedCablingMapGPUWrapperESProducer.cc | 43 + ...PixelGainCalibrationForHLTGPUESProducer.cc | 32 + .../SiPixelRawToClusterCUDA.cc | 176 +++ .../SiPixelRawToClusterGPUKernel.cu | 679 +++++++++++ .../SiPixelRawToClusterGPUKernel.h | 223 ++++ .../plugin-SiPixelClusterizer/gpuCalibPixel.h | 69 ++ .../gpuClusterChargeCut.h | 125 ++ .../plugin-SiPixelClusterizer/gpuClustering.h | 306 +++++ .../gpuClusteringConstants.h | 6 + .../SiPixelDigisSoAFromCUDA.cc | 72 ++ .../PixelCPEFastESProducer.cc | 23 + .../plugin-SiPixelRecHits/PixelRecHits.cu | 78 ++ .../plugin-SiPixelRecHits/PixelRecHits.h | 33 + .../SiPixelRecHitCUDA.cc | 61 + .../plugin-SiPixelRecHits/gpuPixelRecHits.h | 222 ++++ .../plugin-Validation/CountValidator.cc | 149 +++ .../plugin-Validation/HistoValidator.cc | 192 ++++ .../plugin-Validation/SimpleAtomicHisto.h | 60 + src/cudacompat/plugins.txt | 14 + src/cudacompat/test/AtomicPairCounter_t.cu | 67 ++ src/cudacompat/test/HistoContainer_t.cu | 155 +++ src/cudacompat/test/HistoContainer_t_cpu.cc | 146 +++ src/cudacompat/test/OneHistoContainer_t.cu | 142 +++ src/cudacompat/test/OneToManyAssoc_cpu_t.cc | 1 + src/cudacompat/test/OneToManyAssoc_t.cu | 1 + src/cudacompat/test/OneToManyAssoc_t.h | 304 +++++ src/cudacompat/test/TrackingRecHit2DCUDA_t.cu | 56 + .../test/TrajectoryStateSOA_cpu_t.cc | 1 + src/cudacompat/test/TrajectoryStateSOA_t.cu | 1 + src/cudacompat/test/TrajectoryStateSOA_t.h | 75 ++ src/cudacompat/test/VertexFinder_t.h | 350 ++++++ src/cudacompat/test/cpuClustering_t.cc | 1 + src/cudacompat/test/cpuVertexFinder_t.cc | 1 + src/cudacompat/test/cudastdAlgorithm_t.cu | 30 + src/cudacompat/test/cudastdAlgorithm_t_cpu.cc | 34 + 
src/cudacompat/test/eigenSoA_t.cu | 1 + src/cudacompat/test/eigenSoA_t.h | 101 ++ src/cudacompat/test/eigenSoA_t_cpu.cc | 1 + src/cudacompat/test/gpuClustering_t.cu | 1 + src/cudacompat/test/gpuClustering_t.h | 401 +++++++ src/cudacompat/test/gpuVertexFinder_t.cu | 1 + src/cudacompat/test/histo.cc | 54 + src/cudacompat/test/prefixScan_t.cu | 148 +++ src/cudacompat/test/radixSort_t.cu | 204 ++++ src/cudacompat/test/testBrokenLineFit.cc | 2 + src/cudacompat/test/testEigenGPU.cu | 341 ++++++ src/cudacompat/test/testEigenGPUNoFit.cu | 248 ++++ src/cudacompat/test/testRiemannFit.cc | 153 +++ src/cudacompat/test/test_GPUSimpleVector.cu | 83 ++ src/cudacompat/test/test_common.h | 47 + 237 files changed, 24093 insertions(+) create mode 100644 src/cudacompat/CUDACore/AtomicPairCounter.h create mode 100644 src/cudacompat/CUDACore/CachingDeviceAllocator.h create mode 100644 src/cudacompat/CUDACore/CachingHostAllocator.h create mode 100644 src/cudacompat/CUDACore/ContextState.cc create mode 100644 src/cudacompat/CUDACore/ContextState.h create mode 100644 src/cudacompat/CUDACore/ESProduct.h create mode 100644 src/cudacompat/CUDACore/EventCache.cc create mode 100644 src/cudacompat/CUDACore/EventCache.h create mode 100644 src/cudacompat/CUDACore/HistoContainer.h create mode 100644 src/cudacompat/CUDACore/HostAllocator.h create mode 100644 src/cudacompat/CUDACore/HostProduct.h create mode 100644 src/cudacompat/CUDACore/Product.h create mode 100644 src/cudacompat/CUDACore/ProductBase.cc create mode 100644 src/cudacompat/CUDACore/ProductBase.h create mode 100644 src/cudacompat/CUDACore/ScopedContext.cc create mode 100644 src/cudacompat/CUDACore/ScopedContext.h create mode 100644 src/cudacompat/CUDACore/ScopedSetDevice.h create mode 100644 src/cudacompat/CUDACore/SharedEventPtr.h create mode 100644 src/cudacompat/CUDACore/SharedStreamPtr.h create mode 100644 src/cudacompat/CUDACore/SimpleVector.h create mode 100644 src/cudacompat/CUDACore/StreamCache.cc create mode 100644 src/cudacompat/CUDACore/StreamCache.h create mode 100644 src/cudacompat/CUDACore/VecArray.h create mode 100644 src/cudacompat/CUDACore/allocate_device.cc create mode 100644 src/cudacompat/CUDACore/allocate_device.h create mode 100644 src/cudacompat/CUDACore/allocate_host.cc create mode 100644 src/cudacompat/CUDACore/allocate_host.h create mode 100644 src/cudacompat/CUDACore/chooseDevice.cc create mode 100644 src/cudacompat/CUDACore/chooseDevice.h create mode 100644 src/cudacompat/CUDACore/copyAsync.h create mode 100644 src/cudacompat/CUDACore/cudaCheck.h create mode 100644 src/cudacompat/CUDACore/cudaCompat.cc create mode 100644 src/cudacompat/CUDACore/cudaCompat.h create mode 100644 src/cudacompat/CUDACore/cuda_assert.h create mode 100644 src/cudacompat/CUDACore/cuda_cxx17.h create mode 100644 src/cudacompat/CUDACore/cudastdAlgorithm.h create mode 100644 src/cudacompat/CUDACore/currentDevice.h create mode 100644 src/cudacompat/CUDACore/deviceAllocatorStatus.cc create mode 100644 src/cudacompat/CUDACore/deviceAllocatorStatus.h create mode 100644 src/cudacompat/CUDACore/deviceCount.h create mode 100644 src/cudacompat/CUDACore/device_unique_ptr.h create mode 100644 src/cudacompat/CUDACore/eigenSoA.h create mode 100644 src/cudacompat/CUDACore/eventWorkHasCompleted.h create mode 100644 src/cudacompat/CUDACore/getCachingDeviceAllocator.h create mode 100644 src/cudacompat/CUDACore/getCachingHostAllocator.h create mode 100644 src/cudacompat/CUDACore/host_noncached_unique_ptr.h create mode 100644 src/cudacompat/CUDACore/host_unique_ptr.h create 
mode 100644 src/cudacompat/CUDACore/launch.h create mode 100644 src/cudacompat/CUDACore/memsetAsync.h create mode 100644 src/cudacompat/CUDACore/prefixScan.h create mode 100644 src/cudacompat/CUDACore/radixSort.h create mode 100644 src/cudacompat/CUDACore/requireDevices.cc create mode 100644 src/cudacompat/CUDACore/requireDevices.h create mode 100644 src/cudacompat/CUDADataFormats/BeamSpotCUDA.h create mode 100644 src/cudacompat/CUDADataFormats/HeterogeneousSoA.h create mode 100644 src/cudacompat/CUDADataFormats/PixelTrackHeterogeneous.h create mode 100644 src/cudacompat/CUDADataFormats/SiPixelClustersCUDA.cc create mode 100644 src/cudacompat/CUDADataFormats/SiPixelClustersCUDA.h create mode 100644 src/cudacompat/CUDADataFormats/SiPixelDigiErrorsCUDA.cc create mode 100644 src/cudacompat/CUDADataFormats/SiPixelDigiErrorsCUDA.h create mode 100644 src/cudacompat/CUDADataFormats/SiPixelDigisCUDA.cc create mode 100644 src/cudacompat/CUDADataFormats/SiPixelDigisCUDA.h create mode 100644 src/cudacompat/CUDADataFormats/TrackingRecHit2DCUDA.cc create mode 100644 src/cudacompat/CUDADataFormats/TrackingRecHit2DCUDA.h create mode 100644 src/cudacompat/CUDADataFormats/TrackingRecHit2DHeterogeneous.h create mode 100644 src/cudacompat/CUDADataFormats/TrackingRecHit2DSOAView.h create mode 100644 src/cudacompat/CUDADataFormats/TrajectoryStateSoA.h create mode 100644 src/cudacompat/CUDADataFormats/ZVertexHeterogeneous.h create mode 100644 src/cudacompat/CUDADataFormats/ZVertexSoA.h create mode 100644 src/cudacompat/CUDADataFormats/gpuClusteringConstants.h create mode 100644 src/cudacompat/CondFormats/PixelCPEFast.cc create mode 100644 src/cudacompat/CondFormats/PixelCPEFast.h create mode 100644 src/cudacompat/CondFormats/SiPixelFedCablingMapGPU.h create mode 100644 src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.cc create mode 100644 src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h create mode 100644 src/cudacompat/CondFormats/SiPixelFedIds.h create mode 100644 src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc create mode 100644 src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.h create mode 100644 src/cudacompat/CondFormats/SiPixelGainForHLTonGPU.h create mode 100644 src/cudacompat/CondFormats/pixelCPEforGPU.h create mode 100644 src/cudacompat/DataFormats/BeamSpotPOD.h create mode 100644 src/cudacompat/DataFormats/DigiClusterCount.h create mode 100644 src/cudacompat/DataFormats/FEDHeader.cc create mode 100644 src/cudacompat/DataFormats/FEDHeader.h create mode 100644 src/cudacompat/DataFormats/FEDNumbering.cc create mode 100644 src/cudacompat/DataFormats/FEDNumbering.h create mode 100644 src/cudacompat/DataFormats/FEDRawData.cc create mode 100644 src/cudacompat/DataFormats/FEDRawData.h create mode 100644 src/cudacompat/DataFormats/FEDRawDataCollection.cc create mode 100644 src/cudacompat/DataFormats/FEDRawDataCollection.h create mode 100644 src/cudacompat/DataFormats/FEDTrailer.cc create mode 100644 src/cudacompat/DataFormats/FEDTrailer.h create mode 100644 src/cudacompat/DataFormats/PixelErrors.h create mode 100644 src/cudacompat/DataFormats/SOARotation.h create mode 100644 src/cudacompat/DataFormats/SiPixelDigisSoA.cc create mode 100644 src/cudacompat/DataFormats/SiPixelDigisSoA.h create mode 100644 src/cudacompat/DataFormats/SiPixelRawDataError.cc create mode 100644 src/cudacompat/DataFormats/SiPixelRawDataError.h create mode 100644 src/cudacompat/DataFormats/TrackCount.h create mode 100644 src/cudacompat/DataFormats/VertexCount.h create mode 100644 
src/cudacompat/DataFormats/approx_atan2.h create mode 100644 src/cudacompat/DataFormats/fed_header.h create mode 100644 src/cudacompat/DataFormats/fed_trailer.h create mode 100644 src/cudacompat/Framework/EDGetToken.h create mode 100644 src/cudacompat/Framework/EDProducer.h create mode 100644 src/cudacompat/Framework/EDPutToken.h create mode 100644 src/cudacompat/Framework/ESPluginFactory.cc create mode 100644 src/cudacompat/Framework/ESPluginFactory.h create mode 100644 src/cudacompat/Framework/ESProducer.h create mode 100644 src/cudacompat/Framework/EmptyWaitingTask.h create mode 100644 src/cudacompat/Framework/Event.h create mode 100644 src/cudacompat/Framework/EventSetup.h create mode 100644 src/cudacompat/Framework/FunctorTask.h create mode 100644 src/cudacompat/Framework/PluginFactory.cc create mode 100644 src/cudacompat/Framework/PluginFactory.h create mode 100644 src/cudacompat/Framework/ProductRegistry.h create mode 100644 src/cudacompat/Framework/ReusableObjectHolder.h create mode 100644 src/cudacompat/Framework/RunningAverage.h create mode 100644 src/cudacompat/Framework/WaitingTask.h create mode 100644 src/cudacompat/Framework/WaitingTaskHolder.h create mode 100644 src/cudacompat/Framework/WaitingTaskList.cc create mode 100644 src/cudacompat/Framework/WaitingTaskList.h create mode 100644 src/cudacompat/Framework/WaitingTaskWithArenaHolder.cc create mode 100644 src/cudacompat/Framework/WaitingTaskWithArenaHolder.h create mode 100644 src/cudacompat/Framework/Worker.cc create mode 100644 src/cudacompat/Framework/Worker.h create mode 100644 src/cudacompat/Framework/hardware_pause.h create mode 100644 src/cudacompat/Geometry/phase1PixelTopology.h create mode 100644 src/cudacompat/Makefile create mode 100644 src/cudacompat/Makefile.deps create mode 100644 src/cudacompat/bin/EventProcessor.cc create mode 100644 src/cudacompat/bin/EventProcessor.h create mode 100644 src/cudacompat/bin/PluginManager.cc create mode 100644 src/cudacompat/bin/PluginManager.h create mode 100644 src/cudacompat/bin/SharedLibrary.cc create mode 100644 src/cudacompat/bin/SharedLibrary.h create mode 100644 src/cudacompat/bin/Source.cc create mode 100644 src/cudacompat/bin/Source.h create mode 100644 src/cudacompat/bin/StreamSchedule.cc create mode 100644 src/cudacompat/bin/StreamSchedule.h create mode 100644 src/cudacompat/bin/main.cc create mode 100644 src/cudacompat/plugin-BeamSpotProducer/BeamSpotESProducer.cc create mode 100644 src/cudacompat/plugin-BeamSpotProducer/BeamSpotToCUDA.cc create mode 100644 src/cudacompat/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc create mode 100644 src/cudacompat/plugin-PixelTriplets/BrokenLine.h create mode 100644 src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.cc create mode 100644 src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.cu create mode 100644 src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.h create mode 100644 src/cudacompat/plugin-PixelTriplets/CAConstants.h create mode 100644 src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc create mode 100644 src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc create mode 100644 src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu create mode 100644 src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h create mode 100644 src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.cc create mode 100644 src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.cu create mode 100644 
src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h create mode 100644 src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h create mode 100644 src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc create mode 100644 src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h create mode 100644 src/cudacompat/plugin-PixelTriplets/CircleEq.h create mode 100644 src/cudacompat/plugin-PixelTriplets/FitResult.h create mode 100644 src/cudacompat/plugin-PixelTriplets/FitUtils.h create mode 100644 src/cudacompat/plugin-PixelTriplets/GPUCACell.h create mode 100644 src/cudacompat/plugin-PixelTriplets/HelixFitOnGPU.cc create mode 100644 src/cudacompat/plugin-PixelTriplets/HelixFitOnGPU.h create mode 100644 src/cudacompat/plugin-PixelTriplets/RiemannFit.h create mode 100644 src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.cc create mode 100644 src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.cu create mode 100644 src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.h create mode 100644 src/cudacompat/plugin-PixelTriplets/choleskyInversion.h create mode 100644 src/cudacompat/plugin-PixelTriplets/gpuFishbone.h create mode 100644 src/cudacompat/plugin-PixelTriplets/gpuPixelDoublets.h create mode 100644 src/cudacompat/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h create mode 100644 src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc create mode 100644 src/cudacompat/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksIterative.h create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuFitVertices.h create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuSortByPt2.h create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuSplitVertices.h create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.cc create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.cu create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.h create mode 100644 src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinderImpl.h create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/ErrorChecker.cc create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/ErrorChecker.h create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/SiPixelFedCablingMapGPUWrapperESProducer.cc create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/SiPixelGainCalibrationForHLTGPUESProducer.cc create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h create mode 100644 src/cudacompat/plugin-SiPixelClusterizer/gpuClusteringConstants.h create mode 100644 src/cudacompat/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc create mode 100644 src/cudacompat/plugin-SiPixelRecHits/PixelCPEFastESProducer.cc create mode 100644 src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu create mode 100644 
src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h create mode 100644 src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc create mode 100644 src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h create mode 100644 src/cudacompat/plugin-Validation/CountValidator.cc create mode 100644 src/cudacompat/plugin-Validation/HistoValidator.cc create mode 100644 src/cudacompat/plugin-Validation/SimpleAtomicHisto.h create mode 100644 src/cudacompat/plugins.txt create mode 100644 src/cudacompat/test/AtomicPairCounter_t.cu create mode 100644 src/cudacompat/test/HistoContainer_t.cu create mode 100644 src/cudacompat/test/HistoContainer_t_cpu.cc create mode 100644 src/cudacompat/test/OneHistoContainer_t.cu create mode 100644 src/cudacompat/test/OneToManyAssoc_cpu_t.cc create mode 100644 src/cudacompat/test/OneToManyAssoc_t.cu create mode 100644 src/cudacompat/test/OneToManyAssoc_t.h create mode 100644 src/cudacompat/test/TrackingRecHit2DCUDA_t.cu create mode 100644 src/cudacompat/test/TrajectoryStateSOA_cpu_t.cc create mode 100644 src/cudacompat/test/TrajectoryStateSOA_t.cu create mode 100644 src/cudacompat/test/TrajectoryStateSOA_t.h create mode 100644 src/cudacompat/test/VertexFinder_t.h create mode 100644 src/cudacompat/test/cpuClustering_t.cc create mode 100644 src/cudacompat/test/cpuVertexFinder_t.cc create mode 100644 src/cudacompat/test/cudastdAlgorithm_t.cu create mode 100644 src/cudacompat/test/cudastdAlgorithm_t_cpu.cc create mode 100644 src/cudacompat/test/eigenSoA_t.cu create mode 100644 src/cudacompat/test/eigenSoA_t.h create mode 100644 src/cudacompat/test/eigenSoA_t_cpu.cc create mode 100644 src/cudacompat/test/gpuClustering_t.cu create mode 100644 src/cudacompat/test/gpuClustering_t.h create mode 100644 src/cudacompat/test/gpuVertexFinder_t.cu create mode 100644 src/cudacompat/test/histo.cc create mode 100644 src/cudacompat/test/prefixScan_t.cu create mode 100644 src/cudacompat/test/radixSort_t.cu create mode 100644 src/cudacompat/test/testBrokenLineFit.cc create mode 100644 src/cudacompat/test/testEigenGPU.cu create mode 100644 src/cudacompat/test/testEigenGPUNoFit.cu create mode 100644 src/cudacompat/test/testRiemannFit.cc create mode 100644 src/cudacompat/test/test_GPUSimpleVector.cu create mode 100644 src/cudacompat/test/test_common.h diff --git a/src/cudacompat/CUDACore/AtomicPairCounter.h b/src/cudacompat/CUDACore/AtomicPairCounter.h new file mode 100644 index 000000000..19b2781e0 --- /dev/null +++ b/src/cudacompat/CUDACore/AtomicPairCounter.h @@ -0,0 +1,58 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h +#define HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h + +#include + +#include "CUDACore/cudaCompat.h" + +namespace cms { + namespace cuda { + + class AtomicPairCounter { + public: + using c_type = unsigned long long int; + + AtomicPairCounter() {} + AtomicPairCounter(c_type i) { counter.ac = i; } + + __device__ __host__ AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } + + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; + + union Atomic2 { + Counters counters; + c_type ac; + }; + + static constexpr c_type incr = 1UL << 32; + + __device__ __host__ Counters get() const { return counter.counters; } + + // increment n by 1 and m by i. 
return previous value + __host__ __device__ __forceinline__ Counters add(uint32_t i) { + c_type c = i; + c += incr; + Atomic2 ret; +#ifdef __CUDA_ARCH__ + ret.ac = atomicAdd(&counter.ac, c); +#else + ret.ac = counter.ac; + counter.ac += c; +#endif + return ret.counters; + } + + private: + Atomic2 counter; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/src/cudacompat/CUDACore/CachingDeviceAllocator.h b/src/cudacompat/CUDACore/CachingDeviceAllocator.h new file mode 100644 index 000000000..50c1ebdb2 --- /dev/null +++ b/src/cudacompat/CUDACore/CachingDeviceAllocator.h @@ -0,0 +1,747 @@ +#ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h +#define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * Forked to CMSSW by Matti Kortelainen + */ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + +#include +#include +#include +#include + +#include "CUDACore/cudaCheck.h" +#include "CUDACore/deviceAllocatorStatus.h" + +/// CUB namespace +namespace notcub { + + /** + * \addtogroup UtilMgmt + * @{ + */ + + /****************************************************************************** + * CachingDeviceAllocator (host use) + ******************************************************************************/ + + /** + * \brief A simple caching allocator for device memory allocations. 
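// === Editor's illustration (not part of the patch) ==========================
// A minimal host-side sketch of how the cms::cuda::AtomicPairCounter defined
// above can be used. It assumes the cudacompat build environment introduced by
// this commit (so the include path resolves and the __host__/__device__ macros
// compile on the host); main() and the literal values are purely hypothetical.
#include <cassert>
#include "CUDACore/AtomicPairCounter.h"

int main() {
  cms::cuda::AtomicPairCounter apc(0);  // both packed 32-bit counters start at zero
  auto prev = apc.add(3);               // one bulk update; the header documents it as n += 1, m += 3
  assert(prev.n == 0 && prev.m == 0);   // add() returns the counters *before* the update
  auto now = apc.get();
  assert(now.n + now.m == 4);           // the packed pair advanced by 1 and by 3 in total
  return 0;
}
// ============================================================================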
+ * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing cached + * device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for + * reuse within other streams when all prior work submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. + * + * \par + * For example, the default-constructed CachingDeviceAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ + struct CachingDeviceAllocator { + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int)-1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t)-1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor { + void *d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + size_t bytesRequested; // CMS: requested allocatoin size (for monitoring only) + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer and device) + BlockDescriptor(void *d_ptr, int device) + : d_ptr(d_ptr), + bytes(0), + bytesRequested(0), // CMS + bin(INVALID_BIN), + device(device), + associated_stream(nullptr), + ready_event(nullptr) {} + + // Constructor (suitable for searching maps for a range of suitable blocks, given a device) + BlockDescriptor(int device) + : d_ptr(nullptr), + bytes(0), + bytesRequested(0), // CMS + bin(INVALID_BIN), + device(device), + 
associated_stream(nullptr), + ready_event(nullptr) {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { + if (a.device == b.device) + return (a.bytes < b.bytes); + else + return (a.device < b.device); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + // CMS: Moved TotalBytes to deviceAllocatorStatus.h + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each device + // CMS: Moved definition to deviceAllocatorStatus.h + using GpuCachedBytes = cms::cuda::allocator::GpuCachedBytes; + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow(unsigned int base, unsigned int exp) { + unsigned int retval = 1; + while (exp > 0) { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) { + power = 0; + rounded_bytes = 1; + + if (value * base < value) { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) { + rounded_bytes *= base; + power++; + } + } + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + // CMS: use std::mutex instead of cub::Mutex, declare mutable + mutable std::mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool + skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. 
+ */ + CachingDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool skip_cleanup = + false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingDeviceAllocator(bool skip_cleanup = false, bool debug = false) + : bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) { + // Lock + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex); + + if (debug) + // CMS: use raw printf + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock (redundant, kept for style uniformity) + mutex_locker.unlock(); + + return cudaSuccess; + } + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
+ */ + cudaError_t DeviceAllocate( + int device, ///< [in] Device on which to place the allocation + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation + { + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex, std::defer_lock); + *d_ptr = nullptr; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) { + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + device = entrypoint_device; + } + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key(device); + search_key.bytesRequested = bytes; // CMS + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } else { + // Search for a suitable cached allocation: lock + mutex_locker.lock(); + + if (search_key.bin < min_bin) { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks on the same device in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) && (block_itr->device == device) && + (block_itr->bin == search_key.bin)) { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + if ((active_stream == block_itr->associated_stream) || + (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) { + // Reuse existing cache block. Insert into live blocks. 
+ found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].live += search_key.bytes; + cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS + + if (debug) + // CMS: improved debug message + // CMS: use raw printf + printf( + "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously " + "associated with stream %lld, event %lld).\n", + device, + search_key.d_ptr, + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.ready_event, + (long long)block_itr->associated_stream, + (long long)block_itr->ready_event); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex_locker.unlock(); + } + + // Allocate the block if necessary + if (!found) { + // Set runtime's current device to specified device (entrypoint may not be set) + if (device != entrypoint_device) { + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + cudaCheck(error = cudaSetDevice(device)); + } + + // Attempt to allocate + // CMS: silently ignore errors and retry or pass them to the caller + if ((error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) + // CMS: use raw printf + printf( + "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + device, + (long long)search_key.bytes, + (long long)search_key.associated_stream); + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex_locker.lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key(device); + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. 
+ // CMS: silently ignore errors and pass them to the caller + if ((error = cudaFree(block_itr->d_ptr))) + break; + if ((error = cudaEventDestroy(block_itr->ready_event))) + break; + + // Reduce balance and erase entry + cached_bytes[device].free -= block_itr->bytes; + + if (debug) + // CMS: use raw printf + printf( + "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " + "(%lld bytes) outstanding.\n", + device, + (long long)block_itr->bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex_locker.unlock(); + + // Return under error + if (error) + return error; + + // Try to allocate again + // CMS: throw exception on error + cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)); + } + + // Create ready event + // CMS: throw exception on error + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); + + // Insert into live blocks + mutex_locker.lock(); + live_blocks.insert(search_key); + cached_bytes[device].live += search_key.bytes; + cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS + mutex_locker.unlock(); + + if (debug) + // CMS: improved debug message + // CMS: use raw printf + printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n", + device, + search_key.d_ptr, + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.ready_event); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); + } + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) + // CMS: use raw printf + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + + return error; + } + + /** + * \brief Provides a suitable allocation of device memory for the given size on the current device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation + { + return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); + } + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
+ */ + cudaError_t DeviceFree(int device, void *d_ptr) { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex, std::defer_lock); + + if (device == INVALID_DEVICE_ORDINAL) { + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + device = entrypoint_device; + } + + // Lock + mutex_locker.lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr, device); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].live -= search_key.bytes; + cached_bytes[device].liveRequested -= search_key.bytesRequested; // CMS + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + + if (debug) + // CMS: improved debug message + // CMS: use raw printf + printf( + "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " + "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + device, + (long long)search_key.bytes, + d_ptr, + (long long)search_key.associated_stream, + (long long)search_key.ready_event, + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + } + } + + // First set to specified device (entrypoint may not be set) + if (device != entrypoint_device) { + // CMS: throw exception on error + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + cudaCheck(error = cudaSetDevice(device)); + } + + if (recached) { + // Insert the ready event in the associated stream (must have current device set properly) + // CMS: throw exception on error + cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)); + } + + // Unlock + mutex_locker.unlock(); + + if (!recached) { + // Free the allocation from the runtime and cleanup the event. + // CMS: throw exception on error + cudaCheck(error = cudaFree(d_ptr)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); + + if (debug) + // CMS: improved debug message + printf( + "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " + "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, + (long long)search_key.bytes, + d_ptr, + (long long)search_key.associated_stream, + (long long)search_key.ready_event, + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); + } + + return error; + } + + /** + * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. 
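// === Editor's illustration (not part of the patch) ==========================
// A sketch of the call pattern the caching device allocator above is designed
// for, assuming a CUDA-capable build; allocateScratch() and the request size
// are hypothetical names chosen for this example. Errors surface either as the
// returned cudaError_t or via the allocator's internal cudaCheck() calls.
#include <cuda_runtime.h>
#include "CUDACore/CachingDeviceAllocator.h"

void allocateScratch(cudaStream_t stream) {
  // Same geometry as the default constructor: growth 8, bins 3..7 (512 B .. 2 MB).
  static notcub::CachingDeviceAllocator allocator(8, 3, 7);

  void *d_buf = nullptr;
  // The request is rounded up to its bin; a cached block is reused if it was
  // last associated with `stream` or if its ready event has already fired.
  if (allocator.DeviceAllocate(&d_buf, 100 * 1024, stream) != cudaSuccess)
    return;

  // ... enqueue kernels that use d_buf on `stream` ...

  // Returning the block records its ready event on `stream`; the block becomes
  // reusable on `stream` immediately and on other streams once that event completes.
  allocator.DeviceFree(d_buf);
}
// ============================================================================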
+ * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceFree(void *d_ptr) { return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); } + + /** + * \brief Frees all cached device allocations on all devices + */ + cudaError_t FreeAllCached() { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + // CMS: use RAII instead of (un)locking explicitly + std::unique_lock mutex_locker(mutex); + + while (!cached_blocks.empty()) { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) { + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaGetDevice(&entrypoint_device))) + break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) { + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaSetDevice(begin->device))) + break; + current_device = begin->device; + } + + // Free device memory + // CMS: silently ignore errors and pass them to the caller + if ((error = cudaFree(begin->d_ptr))) + break; + if ((error = cudaEventDestroy(begin->ready_event))) + break; + + // Reduce balance and erase entry + cached_bytes[current_device].free -= begin->bytes; + + if (debug) + printf( + "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " + "bytes) outstanding.\n", + current_device, + (long long)begin->bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes[current_device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[current_device].live); + + cached_blocks.erase(begin); + } + + mutex_locker.unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) { + // CMS: throw exception on error + cudaCheck(error = cudaSetDevice(entrypoint_device)); + } + + return error; + } + + // CMS: give access to cache allocation status + GpuCachedBytes CacheStatus() const { + std::unique_lock mutex_locker(mutex); + return cached_bytes; + } + + /** + * \brief Destructor + */ + // CMS: make the destructor not virtual + ~CachingDeviceAllocator() { + if (!skip_cleanup) + FreeAllCached(); + } + }; + + /** @} */ // end group UtilMgmt + +} // namespace notcub + +#endif diff --git a/src/cudacompat/CUDACore/CachingHostAllocator.h b/src/cudacompat/CUDACore/CachingHostAllocator.h new file mode 100644 index 000000000..a206b2da1 --- /dev/null +++ b/src/cudacompat/CUDACore/CachingHostAllocator.h @@ -0,0 +1,648 @@ +#ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h +#define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * Modified to cache pinned host allocations by Matti Kortelainen + */ + +/****************************************************************************** + * Simple caching allocator for pinned host memory allocations. The allocator is + * thread-safe. + ******************************************************************************/ + +#include +#include +#include +#include + +#include "CUDACore/cudaCheck.h" + +/// CUB namespace +namespace notcub { + + /** + * \addtogroup UtilMgmt + * @{ + */ + + /****************************************************************************** + * CachingHostAllocator (host use) + ******************************************************************************/ + + /** + * \brief A simple caching allocator pinned host memory allocations. + * + * \par Overview + * The allocator is thread-safe. It behaves as follows: + * + * I presume the CUDA stream-safeness is not useful as to read/write + * from/to the pinned host memory one needs to synchronize anyway. The + * difference wrt. device memory is that in the CPU all operations to + * the device memory are scheduled via the CUDA stream, while for the + * host memory one can perform operations directly. + * + * \par + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused host allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations will exceed + * \p max_cached_bytes, allocations are simply freed when they are + * deallocated instead of being returned to their bin-cache. 
+ * + * \par + * For example, the default-constructed CachingHostAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes + * + */ + struct CachingHostAllocator { + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int)-1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t)-1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for pinned host memory allocations + */ + struct BlockDescriptor { + void *d_ptr; // Host pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer) + BlockDescriptor(void *d_ptr) + : d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN), + device(INVALID_DEVICE_ORDINAL), + associated_stream(nullptr), + ready_event(nullptr) {} + + // Constructor (suitable for searching maps for a range of suitable blocks) + BlockDescriptor() + : d_ptr(nullptr), + bytes(0), + bin(INVALID_BIN), + device(INVALID_DEVICE_ORDINAL), + associated_stream(nullptr), + ready_event(nullptr) {} + + // Comparison functor for comparing host pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow(unsigned int base, unsigned int exp) { + unsigned int retval = 1; + while (exp > 0) { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) { + power = 0; + rounded_bytes = 1; + + if (value * base < value) { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + 
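A minimal, self-contained sketch of the bin mapping that NearestPowerOf (continued just below) performs for the default configuration (bin_growth = 8, min_bin = 3, max_bin = 7). The helper here is a simplified copy of that rounding loop and the printed values are for illustration only; it is not part of the patch:

// bin_rounding_sketch.cc -- illustrates which bin a request size falls into
#include <cstddef>
#include <cstdio>

static void nearestPowerOf(unsigned int &power, std::size_t &rounded, unsigned int base, std::size_t value) {
  power = 0;
  rounded = 1;
  while (rounded < value) {  // same geometric rounding as NearestPowerOf above
    rounded *= base;
    ++power;
  }
}

int main() {
  const unsigned int bin_growth = 8, min_bin = 3, max_bin = 7;
  for (std::size_t bytes : {10u, 5000u, 10u * 1024u * 1024u}) {
    unsigned int power;
    std::size_t rounded;
    nearestPowerOf(power, rounded, bin_growth, bytes);
    if (power < min_bin) {          // small requests are rounded up to the smallest bin
      power = min_bin;
      rounded = 512;                // = bin_growth ^ min_bin
    }
    const bool cached = power <= max_bin;  // requests above the largest bin bypass the cache
    std::printf("%zu B -> bin %u (%zu B), recached on free: %s\n",
                bytes, power, rounded, cached ? "yes" : "no");
  }
}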
while (rounded_bytes < value) { + rounded_bytes *= base; + power++; + } + } + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + std::mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes + + const bool + skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + TotalBytes cached_bytes; /// Aggregate cached bytes + CachedBlocks cached_blocks; /// Set of cached pinned host allocations available for reuse + BusyBlocks live_blocks; /// Set of live pinned host allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingHostAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes (default is no limit) + bool skip_cleanup = + false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes + */ + CachingHostAllocator(bool skip_cleanup = false, bool debug = false) + : bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). 
+ */ + void SetMaxCachedBytes(size_t max_cached_bytes) { + // Lock + std::unique_lock mutex_locker(mutex); + + if (debug) + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock (redundant, kept for style uniformity) + mutex_locker.unlock(); + } + + /** + * \brief Provides a suitable allocation of pinned host memory for the given size. + * + * Once freed, the allocation becomes available immediately for reuse. + */ + cudaError_t HostAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation + { + std::unique_lock mutex_locker(mutex, std::defer_lock); + *d_ptr = nullptr; + int device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + cudaCheck(error = cudaGetDevice(&device)); + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key; + search_key.device = device; + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } else { + // Search for a suitable cached allocation: lock + mutex_locker.lock(); + + if (search_key.bin < min_bin) { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) { + // To prevent races with reusing blocks returned by the host but still + // in use for transfers, only consider cached blocks that are from an idle stream + if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) { + // Reuse existing cache block. Insert into live blocks. 
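The cudaEventQuery() test just above is the key to stream safety: a cached block is only handed out again once the event recorded when it was freed has completed, and the check is non-blocking. A standalone sketch of that test (not part of the patch; any error other than cudaErrorNotReady is reported elsewhere, as in the allocator):

#include <cuda_runtime.h>

bool blockIsReadyForReuse(cudaEvent_t ready_event) {
  cudaError_t status = cudaEventQuery(ready_event);
  if (status == cudaSuccess)
    return true;   // all work captured by the event has finished
  if (status == cudaErrorNotReady)
    return false;  // the freeing stream is still using the block
  return true;     // other errors: do not treat as "pending", surface them later
}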
+ found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + if (search_key.device != device) { + // If "associated" device changes, need to re-create the event on the right device + cudaCheck(error = cudaSetDevice(search_key.device)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); + cudaCheck(error = cudaSetDevice(device)); + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); + search_key.device = device; + } + + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes.free -= search_key.bytes; + cached_bytes.live += search_key.bytes; + + if (debug) + printf( + "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld " + "(previously associated with stream %lld, event %lld).\n", + search_key.d_ptr, + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.ready_event, + (long long)search_key.device, + (long long)block_itr->associated_stream, + (long long)block_itr->ready_event); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex_locker.unlock(); + } + + // Allocate the block if necessary + if (!found) { + // Attempt to allocate + // TODO: eventually support allocation flags + if ((error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == + cudaErrorMemoryAllocation) { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) + printf( + "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached " + "allocations", + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.device); + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex_locker.lock(); + + // Iterate the range of free blocks + CachedBlocks::iterator block_itr = cached_blocks.begin(); + + while ((block_itr != cached_blocks.end())) { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free pinned host memory. 
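A usage sketch (not taken from the patch) of how a caller is expected to drive HostAllocate()/HostFree(); the allocator instance, buffer names and sizes are hypothetical, while the two member-function signatures match the ones defined above. Freeing right after enqueuing the copy is safe for recached blocks because reuse waits on the ready_event recorded by HostFree():

#include <cstddef>
#include <cstring>
#include <cuda_runtime.h>
#include "CUDACore/CachingHostAllocator.h"
#include "CUDACore/cudaCheck.h"

void copyToDevice(notcub::CachingHostAllocator &allocator,
                  const float *src, float *gpu_ptr, std::size_t n, cudaStream_t stream) {
  void *staging = nullptr;
  cudaCheck(allocator.HostAllocate(&staging, n * sizeof(float), stream));  // pinned, possibly reused from the cache
  std::memcpy(staging, src, n * sizeof(float));
  cudaCheck(cudaMemcpyAsync(gpu_ptr, staging, n * sizeof(float), cudaMemcpyHostToDevice, stream));
  cudaCheck(allocator.HostFree(staging));  // records ready_event on the stream; the block is recached
}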
+ if ((error = cudaFreeHost(block_itr->d_ptr))) + break; + if ((error = cudaEventDestroy(block_itr->ready_event))) + break; + + // Reduce balance and erase entry + cached_bytes.free -= block_itr->bytes; + + if (debug) + printf( + "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " + "bytes) outstanding.\n", + (long long)block_itr->bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex_locker.unlock(); + + // Return under error + if (error) + return error; + + // Try to allocate again + cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)); + } + + // Create ready event + cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); + + // Insert into live blocks + mutex_locker.lock(); + live_blocks.insert(search_key); + cached_bytes.live += search_key.bytes; + mutex_locker.unlock(); + + if (debug) + printf( + "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device " + "%lld).\n", + search_key.d_ptr, + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.ready_event, + (long long)search_key.device); + } + + // Copy host pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); + + return error; + } + + /** + * \brief Frees a live allocation of pinned host memory, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse. + */ + cudaError_t HostFree(void *d_ptr) { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + // Lock + std::unique_lock mutex_locker(mutex); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes.live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes.free += search_key.bytes; + + if (debug) + printf( + "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld " + "available blocks cached (%lld bytes), %lld live blocks outstanding. 
(%lld bytes)\n", + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.ready_event, + (long long)search_key.device, + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); + } + } + + cudaCheck(error = cudaGetDevice(&entrypoint_device)); + if (entrypoint_device != search_key.device) { + cudaCheck(error = cudaSetDevice(search_key.device)); + } + + if (recached) { + // Insert the ready event in the associated stream (must have current device set properly) + cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)); + } + + // Unlock + mutex_locker.unlock(); + + if (!recached) { + // Free the allocation from the runtime and cleanup the event. + cudaCheck(error = cudaFreeHost(d_ptr)); + cudaCheck(error = cudaEventDestroy(search_key.ready_event)); + + if (debug) + printf( + "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available " + "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)search_key.ready_event, + (long long)search_key.device, + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) { + cudaCheck(error = cudaSetDevice(entrypoint_device)); + } + + return error; + } + + /** + * \brief Frees all cached pinned host allocations + */ + cudaError_t FreeAllCached() { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + std::unique_lock mutex_locker(mutex); + + while (!cached_blocks.empty()) { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) { + if ((error = cudaGetDevice(&entrypoint_device))) + break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) { + if ((error = cudaSetDevice(begin->device))) + break; + current_device = begin->device; + } + + // Free host memory + if ((error = cudaFreeHost(begin->d_ptr))) + break; + if ((error = cudaEventDestroy(begin->ready_event))) + break; + + // Reduce balance and erase entry + cached_bytes.free -= begin->bytes; + + if (debug) + printf( + "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " + "bytes) outstanding.\n", + (long long)begin->bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); + + cached_blocks.erase(begin); + } + + mutex_locker.unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) { + cudaCheck(error = cudaSetDevice(entrypoint_device)); + } + + return error; + } + + /** + * \brief Destructor + */ + ~CachingHostAllocator() { + if (!skip_cleanup) + FreeAllCached(); + } + }; + + /** @} */ // end group UtilMgmt + +} // namespace notcub + +#endif diff --git a/src/cudacompat/CUDACore/ContextState.cc b/src/cudacompat/CUDACore/ContextState.cc new file mode 100644 index 000000000..7fa792333 --- /dev/null +++ b/src/cudacompat/CUDACore/ContextState.cc @@ -0,0 +1,17 @@ +#include 
"CUDACore/ContextState.h" + +#include + +namespace cms::cuda { + void ContextState::throwIfStream() const { + if (stream_) { + throw std::runtime_error("Trying to set ContextState, but it already had a valid state"); + } + } + + void ContextState::throwIfNoStream() const { + if (not stream_) { + throw std::runtime_error("Trying to get ContextState, but it did not have a valid state"); + } + } +} // namespace cms::cuda diff --git a/src/cudacompat/CUDACore/ContextState.h b/src/cudacompat/CUDACore/ContextState.h new file mode 100644 index 000000000..3c73054ab --- /dev/null +++ b/src/cudacompat/CUDACore/ContextState.h @@ -0,0 +1,61 @@ +#ifndef HeterogeneousCore_CUDACore_ContextState_h +#define HeterogeneousCore_CUDACore_ContextState_h + +#include "CUDACore/SharedStreamPtr.h" + +#include + +namespace cms { + namespace cuda { + /** + * The purpose of this class is to deliver the device and CUDA stream + * information from ExternalWork's acquire() to producer() via a + * member/StreamCache variable. + */ + class ContextState { + public: + ContextState() = default; + ~ContextState() = default; + + ContextState(const ContextState&) = delete; + ContextState& operator=(const ContextState&) = delete; + ContextState(ContextState&&) = delete; + ContextState& operator=(ContextState&& other) = delete; + + private: + friend class ScopedContextAcquire; + friend class ScopedContextProduce; + friend class ScopedContextTask; + + void set(int device, SharedStreamPtr stream) { + throwIfStream(); + device_ = device; + stream_ = std::move(stream); + } + + int device() const { return device_; } + + const SharedStreamPtr& streamPtr() const { + throwIfNoStream(); + return stream_; + } + + SharedStreamPtr releaseStreamPtr() { + throwIfNoStream(); + // This function needs to effectively reset stream_ (i.e. stream_ + // must be empty after this function). This behavior ensures that + // the SharedStreamPtr is not hold for inadvertedly long (i.e. to + // the next event), and is checked at run time. 
+ return std::move(stream_); + } + + void throwIfStream() const; + void throwIfNoStream() const; + + SharedStreamPtr stream_; + int device_; + }; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/ESProduct.h b/src/cudacompat/CUDACore/ESProduct.h new file mode 100644 index 000000000..0e8965a71 --- /dev/null +++ b/src/cudacompat/CUDACore/ESProduct.h @@ -0,0 +1,102 @@ +#ifndef HeterogeneousCore_CUDACore_ESProduct_h +#define HeterogeneousCore_CUDACore_ESProduct_h + +#include +#include +#include +#include + +#include "CUDACore/EventCache.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/deviceCount.h" +#include "CUDACore/currentDevice.h" +#include "CUDACore/eventWorkHasCompleted.h" + +namespace cms { + namespace cuda { + template + class ESProduct { + public: + ESProduct() : gpuDataPerDevice_(deviceCount()) { + for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { + gpuDataPerDevice_[i].m_event = getEventCache().get(); + } + } + ~ESProduct() = default; + + // transferAsync should be a function of (T&, cudaStream_t) + // which enqueues asynchronous transfers (possibly kernels as well) + // to the CUDA stream + template + const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const { + auto device = currentDevice(); + + auto& data = gpuDataPerDevice_[device]; + + // If GPU data has already been filled, we can return it + // immediately + if (not data.m_filled.load()) { + // It wasn't, so need to fill it + std::scoped_lock lk{data.m_mutex}; + + if (data.m_filled.load()) { + // Other thread marked it filled while we were locking the mutex, so we're free to return it + return data.m_data; + } + + if (data.m_fillingStream != nullptr) { + // Someone else is filling + + // Check first if the recorded event has occurred + if (eventWorkHasCompleted(data.m_event.get())) { + // It was, so data is accessible from all CUDA streams on + // the device. Set the 'filled' for all subsequent calls and + // return the value + auto should_be_false = data.m_filled.exchange(true); + assert(not should_be_false); + data.m_fillingStream = nullptr; + } else if (data.m_fillingStream != cudaStream) { + // Filling is still going on. For other CUDA stream, add + // wait on the CUDA stream and return the value. Subsequent + // work queued on the stream will wait for the event to + // occur (i.e. transfer to finish). + cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0), + "Failed to make a stream to wait for an event"); + } + // else: filling is still going on. But for the same CUDA + // stream (which would be a bit strange but fine), we can just + // return as all subsequent work should be enqueued to the + // same CUDA stream (or stream to be explicitly synchronized + // by the caller) + } else { + // Now we can be sure that the data is not yet on the GPU, and + // this thread is the first to try that. 
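A hypothetical usage sketch of ESProduct: GPUConstants, the wrapper class, and the member data are illustrative placeholders (device memory ownership and cleanup are left to the real wrapper), while dataForCurrentDeviceAsync() follows the interface above. The lambda runs at most once per device and only enqueues asynchronous work on the given stream:

#include <cuda_runtime.h>
#include "CUDACore/ESProduct.h"
#include "CUDACore/cudaCheck.h"

struct GPUConstants {
  float *params = nullptr;  // device pointer, owned and freed by the real wrapper
};

class ConstantsWrapper {
public:
  GPUConstants const &getGPUData(cudaStream_t stream) const {
    return product_.dataForCurrentDeviceAsync(stream, [this](GPUConstants &data, cudaStream_t s) {
      // executed only for the first caller on each device
      cudaCheck(cudaMalloc(&data.params, sizeof(float) * 16));
      cudaCheck(cudaMemcpyAsync(data.params, hostParams_, sizeof(float) * 16, cudaMemcpyHostToDevice, s));
    });
  }

private:
  float hostParams_[16] = {};                  // pinned host memory in real code
  cms::cuda::ESProduct<GPUConstants> product_;
};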
+ transferAsync(data.m_data, cudaStream); + assert(data.m_fillingStream == nullptr); + data.m_fillingStream = cudaStream; + // Now the filling has been enqueued to the cudaStream, so we + // can return the GPU data immediately, since all subsequent + // work must be either enqueued to the cudaStream, or the cudaStream + // must be synchronized by the caller + } + } + + return data.m_data; + } + + private: + struct Item { + mutable std::mutex m_mutex; + mutable SharedEventPtr m_event; // guarded by m_mutex + // non-null if some thread is already filling (cudaStream_t is just a pointer) + mutable cudaStream_t m_fillingStream = nullptr; // guarded by m_mutex + mutable std::atomic m_filled = false; // easy check if data has been filled already or not + mutable T m_data; // guarded by m_mutex + }; + + std::vector gpuDataPerDevice_; + }; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/EventCache.cc b/src/cudacompat/CUDACore/EventCache.cc new file mode 100644 index 000000000..3f2d4419a --- /dev/null +++ b/src/cudacompat/CUDACore/EventCache.cc @@ -0,0 +1,68 @@ +#include "CUDACore/EventCache.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/currentDevice.h" +#include "CUDACore/deviceCount.h" +#include "CUDACore/eventWorkHasCompleted.h" +#include "CUDACore/ScopedSetDevice.h" + +namespace cms::cuda { + void EventCache::Deleter::operator()(cudaEvent_t event) const { + if (device_ != -1) { + ScopedSetDevice deviceGuard{device_}; + cudaCheck(cudaEventDestroy(event)); + } + } + + // EventCache should be constructed by the first call to + // getEventCache() only if we have CUDA devices present + EventCache::EventCache() : cache_(deviceCount()) {} + + SharedEventPtr EventCache::get() { + const auto dev = currentDevice(); + auto event = makeOrGet(dev); + // captured work has completed, or a just-created event + if (eventWorkHasCompleted(event.get())) { + return event; + } + + // Got an event with incomplete captured work. Try again until we + // get a completed (or a just-created) event. Need to keep all + // incomplete events until a completed event is found in order to + // avoid ping-pong with an incomplete event. + std::vector ptrs{std::move(event)}; + bool completed; + do { + event = makeOrGet(dev); + completed = eventWorkHasCompleted(event.get()); + if (not completed) { + ptrs.emplace_back(std::move(event)); + } + } while (not completed); + return event; + } + + SharedEventPtr EventCache::makeOrGet(int dev) { + return cache_[dev].makeOrGet([dev]() { + cudaEvent_t event; + // it should be a bit faster to ignore timings + cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + return std::unique_ptr(event, Deleter{dev}); + }); + } + + void EventCache::clear() { + // Reset the contents of the caches, but leave an + // edm::ReusableObjectHolder alive for each device. This is needed + // mostly for the unit tests, where the function-static + // EventCache lives through multiple tests (and go through + // multiple shutdowns of the framework). 
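A short sketch of the EventCache contract described above: get() only hands out events whose previously captured work has completed, so the caller can immediately re-record them; when the last SharedEventPtr reference goes away the event returns to the per-device cache. The function name here is illustrative:

#include <cuda_runtime.h>
#include "CUDACore/EventCache.h"
#include "CUDACore/cudaCheck.h"

void markStreamPoint(cudaStream_t stream) {
  cms::cuda::SharedEventPtr event = cms::cuda::getEventCache().get();  // guaranteed "completed" or fresh
  cudaCheck(cudaEventRecord(event.get(), stream));
  // ... keep "event" alive for as long as consumers may need to wait on it ...
}  // last reference dropped: the event goes back to the cache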
+ cache_.clear(); + cache_.resize(deviceCount()); + } + + EventCache& getEventCache() { + // the public interface is thread safe + static EventCache cache; + return cache; + } +} // namespace cms::cuda diff --git a/src/cudacompat/CUDACore/EventCache.h b/src/cudacompat/CUDACore/EventCache.h new file mode 100644 index 000000000..fc887ade1 --- /dev/null +++ b/src/cudacompat/CUDACore/EventCache.h @@ -0,0 +1,57 @@ +#ifndef HeterogeneousCore_CUDAUtilities_EventCache_h +#define HeterogeneousCore_CUDAUtilities_EventCache_h + +#include + +#include + +#include "Framework/ReusableObjectHolder.h" +#include "CUDACore/SharedEventPtr.h" + +class CUDAService; + +namespace cms { + namespace cuda { + class EventCache { + public: + using BareEvent = SharedEventPtr::element_type; + + EventCache(); + + // Gets a (cached) CUDA event for the current device. The event + // will be returned to the cache by the shared_ptr destructor. The + // returned event is guaranteed to be in the state where all + // captured work has completed, i.e. cudaEventQuery() == cudaSuccess. + // + // This function is thread safe + SharedEventPtr get(); + + private: + friend class ::CUDAService; + + // thread safe + SharedEventPtr makeOrGet(int dev); + + // not thread safe, intended to be called only from CUDAService destructor + void clear(); + + class Deleter { + public: + Deleter() = default; + Deleter(int d) : device_{d} {} + void operator()(cudaEvent_t event) const; + + private: + int device_ = -1; + }; + + std::vector> cache_; + }; + + // Gets the global instance of a EventCache + // This function is thread safe + EventCache& getEventCache(); + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/HistoContainer.h b/src/cudacompat/CUDACore/HistoContainer.h new file mode 100644 index 000000000..c2ac3308d --- /dev/null +++ b/src/cudacompat/CUDACore/HistoContainer.h @@ -0,0 +1,323 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h +#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h + +#include +#ifndef __CUDA_ARCH__ +#include +#endif // __CUDA_ARCH__ +#include +#include +#include + +#include "CUDACore/AtomicPairCounter.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/cuda_assert.h" +#include "CUDACore/cudastdAlgorithm.h" +#include "CUDACore/prefixScan.h" + +namespace cms { + namespace cuda { + + template + __global__ void countFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).count(v[i], ih); + } + } + + template + __global__ void fillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).fill(v[i], i, ih); + } + } + + template + inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + uint32_t *poff = (uint32_t *)((char *)(h) + 
offsetof(Histo, off)); + int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); + assert(size >= int(sizeof(uint32_t) * Histo::totbins())); +#ifdef __CUDACC__ + cudaCheck(cudaMemsetAsync(poff, 0, size, stream)); +#else + ::memset(poff, 0, size); +#endif + } + + template + inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { +#ifdef __CUDACC__ + uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); + auto nthreads = 1024; + auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>( + poff, poff, Histo::totbins(), ppsws); + cudaCheck(cudaGetLastError()); +#else + h->finalize(); +#endif + } + + template + inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + int nthreads, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + launchZero(h, stream); +#ifdef __CUDACC__ + auto nblocks = (totSize + nthreads - 1) / nthreads; + countFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + launchFinalize(h, stream); + fillFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); +#else + countFromVector(h, nh, v, offsets); + h->finalize(); + fillFromVector(h, nh, v, offsets); +#endif + } + + template + __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { + assoc->bulkFinalizeFill(*apc); + } + + // iteratate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + template + class HistoContainer { + public: + using Counter = uint32_t; + + using CountersOnly = HistoContainer; + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr uint32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr uint32_t capacity() { return SIZE; } + + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } + + __host__ __device__ void zero() { + for (auto &i : off) + i = 0; + 
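A host-only illustration (not the class itself) of the scheme that fillManyFromVector() above implements: first count how many values fall into each bin, then prefix-scan the counts into offsets, then place each value's index using its bin's cursor. HistoContainer's exact index conventions (atomic decrement from the bin end) differ slightly; this only shows the three phases:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint32_t nbins = 4;
  std::vector<uint32_t> values{0, 3, 1, 3, 0, 2, 3};

  std::vector<uint32_t> off(nbins + 1, 0);
  for (uint32_t v : values)               // phase 1: count (countFromVector)
    ++off[v + 1];
  for (uint32_t b = 0; b < nbins; ++b)    // phase 2: prefix scan (launchFinalize)
    off[b + 1] += off[b];

  std::vector<uint32_t> bins(values.size());
  std::vector<uint32_t> cursor(off.begin(), off.end());
  for (uint32_t i = 0; i < values.size(); ++i)  // phase 3: fill (fillFromVector)
    bins[cursor[values[i]]++] = i;              // store the index of the value, like index_type bins[]

  for (uint32_t b = 0; b < nbins; ++b)
    std::printf("bin %u: indices [%u, %u)\n", b, off[b], off[b + 1]);
}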
} + + __host__ __device__ __forceinline__ void add(CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { +#ifdef __CUDA_ARCH__ + atomicAdd(off + i, co.off[i]); +#else + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; +#endif + } + } + + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicAdd(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a++; +#endif + } + + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicSub(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a--; +#endif + } + + __host__ __device__ __forceinline__ void countDirect(T b) { + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! + off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void count(T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + int32_t psws; // prefix-scan working space + index_type bins[capacity()]; + }; + + template + using OneToManyAssoc = HistoContainer; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/src/cudacompat/CUDACore/HostAllocator.h b/src/cudacompat/CUDACore/HostAllocator.h new file mode 
100644 index 000000000..19c86e31f --- /dev/null +++ b/src/cudacompat/CUDACore/HostAllocator.h @@ -0,0 +1,55 @@ +#ifndef HeterogeneousCore_CUDAUtilities_HostAllocator_h +#define HeterogeneousCore_CUDAUtilities_HostAllocator_h + +#include +#include +#include + +namespace cms { + namespace cuda { + + class bad_alloc : public std::bad_alloc { + public: + bad_alloc(cudaError_t error) noexcept : error_(error) {} + + const char* what() const noexcept override { return cudaGetErrorString(error_); } + + private: + cudaError_t error_; + }; + + template + class HostAllocator { + public: + using value_type = T; + + template + struct rebind { + using other = HostAllocator; + }; + + T* allocate(std::size_t n) const __attribute__((warn_unused_result)) __attribute__((malloc)) + __attribute__((returns_nonnull)) { + void* ptr = nullptr; + cudaError_t status = cudaMallocHost(&ptr, n * sizeof(T), FLAGS); + if (status != cudaSuccess) { + throw bad_alloc(status); + } + if (ptr == nullptr) { + throw std::bad_alloc(); + } + return static_cast(ptr); + } + + void deallocate(T* p, std::size_t n) const { + cudaError_t status = cudaFreeHost(p); + if (status != cudaSuccess) { + throw bad_alloc(status); + } + } + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_HostAllocator_h diff --git a/src/cudacompat/CUDACore/HostProduct.h b/src/cudacompat/CUDACore/HostProduct.h new file mode 100644 index 000000000..6aaa38e3e --- /dev/null +++ b/src/cudacompat/CUDACore/HostProduct.h @@ -0,0 +1,29 @@ +#ifndef CUDADataFormatsCommonHostProduct_H +#define CUDADataFormatsCommonHostProduct_H + +#include "CUDACore/host_unique_ptr.h" + +// a heterogeneous unique pointer... +template +class HostProduct { +public: + HostProduct() = default; // make root happy + ~HostProduct() = default; + HostProduct(HostProduct&&) = default; + HostProduct& operator=(HostProduct&&) = default; + + explicit HostProduct(cms::cuda::host::unique_ptr&& p) : hm_ptr(std::move(p)) {} + explicit HostProduct(std::unique_ptr&& p) : std_ptr(std::move(p)) {} + + auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); } + + auto const& operator*() const { return *get(); } + + auto const* operator->() const { return get(); } + +private: + cms::cuda::host::unique_ptr hm_ptr; //! + std::unique_ptr std_ptr; //! +}; + +#endif diff --git a/src/cudacompat/CUDACore/Product.h b/src/cudacompat/CUDACore/Product.h new file mode 100644 index 000000000..c60e994f0 --- /dev/null +++ b/src/cudacompat/CUDACore/Product.h @@ -0,0 +1,60 @@ +#ifndef CUDADataFormats_Common_Product_h +#define CUDADataFormats_Common_Product_h + +#include + +#include "CUDACore/ProductBase.h" + +namespace edm { + template + class Wrapper; +} + +namespace cms { + namespace cuda { + namespace impl { + class ScopedContextGetterBase; + } + + /** + * The purpose of this class is to wrap CUDA data to edm::Event in a + * way which forces correct use of various utilities. + * + * The non-default construction has to be done with cms::cuda::ScopedContext + * (in order to properly register the CUDA event). + * + * The default constructor is needed only for the ROOT dictionary generation. + * + * The CUDA event is in practice needed only for stream-stream + * synchronization, but someone with long-enough lifetime has to own + * it. Here is a somewhat natural place. If overhead is too much, we + * can use them only where synchronization between streams is needed. 
+ */ + template + class Product : public ProductBase { + public: + Product() = default; // Needed only for ROOT dictionary generation + + Product(const Product&) = delete; + Product& operator=(const Product&) = delete; + Product(Product&&) = default; + Product& operator=(Product&&) = default; + + private: + friend class impl::ScopedContextGetterBase; + friend class ScopedContextProduce; + friend class edm::Wrapper>; + + explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data) + : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} + + template + explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args) + : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} + + T data_; //! + }; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/ProductBase.cc b/src/cudacompat/CUDACore/ProductBase.cc new file mode 100644 index 000000000..4abb4ff71 --- /dev/null +++ b/src/cudacompat/CUDACore/ProductBase.cc @@ -0,0 +1,29 @@ +#include "CUDACore/ProductBase.h" +#include "CUDACore/eventWorkHasCompleted.h" + +namespace cms::cuda { + bool ProductBase::isAvailable() const { + // if default-constructed, the product is not available + if (not event_) { + return false; + } + return eventWorkHasCompleted(event_.get()); + } + + ProductBase::~ProductBase() { + // Make sure that the production of the product in the GPU is + // complete before destructing the product. This is to make sure + // that the EDM stream does not move to the next event before all + // asynchronous processing of the current is complete. + + // TODO: a callback notifying a WaitingTaskHolder (or similar) + // would avoid blocking the CPU, but would also require more work. + // + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. + if (event_) { + cudaEventSynchronize(event_.get()); + } + } +} // namespace cms::cuda diff --git a/src/cudacompat/CUDACore/ProductBase.h b/src/cudacompat/CUDACore/ProductBase.h new file mode 100644 index 000000000..cb3fd4db9 --- /dev/null +++ b/src/cudacompat/CUDACore/ProductBase.h @@ -0,0 +1,93 @@ +#ifndef CUDADataFormats_Common_ProductBase_h +#define CUDADataFormats_Common_ProductBase_h + +#include +#include + +#include "CUDACore/SharedStreamPtr.h" +#include "CUDACore/SharedEventPtr.h" + +namespace cms { + namespace cuda { + namespace impl { + class ScopedContextBase; + } + + /** + * Base class for all instantiations of CUDA to hold the + * non-T-dependent members. 
+ */ + class ProductBase { + public: + ProductBase() = default; // Needed only for ROOT dictionary generation + ~ProductBase(); + + ProductBase(const ProductBase&) = delete; + ProductBase& operator=(const ProductBase&) = delete; + ProductBase(ProductBase&& other) + : stream_{std::move(other.stream_)}, + event_{std::move(other.event_)}, + mayReuseStream_{other.mayReuseStream_.load()}, + device_{other.device_} {} + ProductBase& operator=(ProductBase&& other) { + stream_ = std::move(other.stream_); + event_ = std::move(other.event_); + mayReuseStream_ = other.mayReuseStream_.load(); + device_ = other.device_; + return *this; + } + + bool isValid() const { return stream_.get() != nullptr; } + bool isAvailable() const; + + int device() const { return device_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the cms::cuda::ScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + + // cudaEvent_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the cms::cuda::ScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaEvent_t event() const { return event_.get(); } + + protected: + explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event) + : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} + + private: + friend class impl::ScopedContextBase; + friend class ScopedContextProduce; + + // The following function is intended to be used only from ScopedContext + const SharedStreamPtr& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + + // The cudaStream_t is really shared among edm::Event products, so + // using shared_ptr also here + SharedStreamPtr stream_; //! + // shared_ptr because of caching in cms::cuda::EventCache + SharedEventPtr event_; //! + + // This flag tells whether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; //! + + // The CUDA device associated with this product + int device_ = -1; //! 
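A standalone sketch of the mayReuseStream() idea above: a one-shot flag flipped with compare_exchange_strong, so only the first consumer of a product may keep queueing work on the producer's CUDA stream, and every later consumer gets a fresh stream. The struct and function names are illustrative:

#include <atomic>
#include <cstdio>

struct StreamReuseFlag {
  mutable std::atomic<bool> mayReuse{true};
  bool tryClaim() const {
    bool expected = true;
    // true only for the first caller; subsequent calls see "false" and fail
    return mayReuse.compare_exchange_strong(expected, false);
  }
};

int main() {
  StreamReuseFlag flag;
  std::printf("consumer 1 reuses stream: %d\n", flag.tryClaim());  // 1
  std::printf("consumer 2 reuses stream: %d\n", flag.tryClaim());  // 0
}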
+ }; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/ScopedContext.cc b/src/cudacompat/CUDACore/ScopedContext.cc new file mode 100644 index 000000000..14bff04eb --- /dev/null +++ b/src/cudacompat/CUDACore/ScopedContext.cc @@ -0,0 +1,116 @@ +#include "CUDACore/ScopedContext.h" + +#include "CUDACore/StreamCache.h" +#include "CUDACore/cudaCheck.h" + +#include "chooseDevice.h" + +namespace { + struct CallbackData { + edm::WaitingTaskWithArenaHolder holder; + int device; + }; + + void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { + std::unique_ptr guard{reinterpret_cast(data)}; + edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; + int device = guard->device; + if (status == cudaSuccess) { + //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream " + // << streamId << std::endl; + waitingTaskHolder.doneWaiting(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw std::runtime_error("Callback of CUDA stream " + + std::to_string(reinterpret_cast(streamId)) + " in device " + + std::to_string(device) + " error " + std::string(error) + ": " + std::string(message)); + } catch (std::exception&) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } + } + } +} // namespace + +namespace cms::cuda { + namespace impl { + ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) { + cudaCheck(cudaSetDevice(currentDevice_)); + stream_ = getStreamCache().get(); + } + + ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) { + cudaCheck(cudaSetDevice(currentDevice_)); + if (data.mayReuseStream()) { + stream_ = data.streamPtr(); + } else { + stream_ = getStreamCache().get(); + } + } + + ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream) + : currentDevice_(device), stream_(std::move(stream)) { + cudaCheck(cudaSetDevice(currentDevice_)); + } + + //////////////////// + + void ScopedContextGetterBase::synchronizeStreams(int dataDevice, + cudaStream_t dataStream, + bool available, + cudaEvent_t dataEvent) { + if (dataDevice != device()) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw std::runtime_error("Handling data from multiple devices is not yet supported"); + } + + if (dataStream != stream()) { + // Different streams, need to synchronize + if (not available) { + // Event not yet occurred, so need to add synchronization + // here. Sychronization is done by making the CUDA stream to + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). 
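A standalone sketch of the stream-to-stream synchronization that synchronizeStreams() performs just below: the producer records an event on its stream and the consumer stream waits on that event, so later work queued on the consumer stream starts only after the data product is ready, without blocking the host. Function and parameter names are illustrative:

#include <cuda_runtime.h>
#include "CUDACore/cudaCheck.h"

void chainStreams(cudaStream_t producer, cudaStream_t consumer, cudaEvent_t ready) {
  cudaCheck(cudaEventRecord(ready, producer));         // marks "data product available"
  cudaCheck(cudaStreamWaitEvent(consumer, ready, 0));  // consumer work queued after this waits for it
}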
+ cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); + } + } + } + + void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { + cudaCheck( + cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); + } + } // namespace impl + + //////////////////// + + ScopedContextAcquire::~ScopedContextAcquire() { + holderHelper_.enqueueCallback(device(), stream()); + if (contextState_) { + contextState_->set(device(), streamPtr()); + } + } + + void ScopedContextAcquire::throwNoState() { + throw std::runtime_error( + "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with " + "ContextState, but that was not the case"); + } + + //////////////////// + + ScopedContextProduce::~ScopedContextProduce() { + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. + cudaEventRecord(event_.get(), stream()); + } + + //////////////////// + + ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } +} // namespace cms::cuda diff --git a/src/cudacompat/CUDACore/ScopedContext.h b/src/cudacompat/CUDACore/ScopedContext.h new file mode 100644 index 000000000..4f6669883 --- /dev/null +++ b/src/cudacompat/CUDACore/ScopedContext.h @@ -0,0 +1,241 @@ +#ifndef HeterogeneousCore_CUDACore_ScopedContext_h +#define HeterogeneousCore_CUDACore_ScopedContext_h + +#include + +#include "CUDACore/Product.h" +#include "Framework/WaitingTaskWithArenaHolder.h" +#include "Framework/Event.h" +#include "Framework/EDGetToken.h" +#include "Framework/EDPutToken.h" +#include "CUDACore/ContextState.h" +#include "CUDACore/EventCache.h" +#include "CUDACore/SharedEventPtr.h" +#include "CUDACore/SharedStreamPtr.h" + +namespace cms { + namespace cudatest { + class TestScopedContext; + } + + namespace cuda { + + namespace impl { + // This class is intended to be derived by other ScopedContext*, not for general use + class ScopedContextBase { + public: + int device() const { return currentDevice_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the ScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + const SharedStreamPtr& streamPtr() const { return stream_; } + + protected: + // The constructors set the current device, but the device + // is not set back to the previous value at the destructor. This + // should be sufficient (and tiny bit faster) as all CUDA API + // functions relying on the current device should be called from + // the scope where this context is. The current device doesn't + // really matter between modules (or across TBB tasks). 
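A minimal sketch of the callback mechanism used by ScopedContextHolderHelper::enqueueCallback() above: a host function is enqueued behind all work currently in the stream, and the runtime invokes it with the stream's status once that work has drained. Here it merely flips an atomic flag; in the patch it calls doneWaiting() on the WaitingTaskWithArenaHolder instead. Names are illustrative:

#include <atomic>
#include <cuda_runtime.h>
#include "CUDACore/cudaCheck.h"

namespace {
  void CUDART_CB markDone(cudaStream_t /*stream*/, cudaError_t status, void *data) {
    if (status == cudaSuccess)
      static_cast<std::atomic<bool> *>(data)->store(true);
  }
}  // namespace

void notifyWhenStreamDrains(cudaStream_t stream, std::atomic<bool> &done) {
  cudaCheck(cudaStreamAddCallback(stream, markDone, &done, 0));
}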
+ explicit ScopedContextBase(edm::StreamID streamID); + + explicit ScopedContextBase(const ProductBase& data); + + explicit ScopedContextBase(int device, SharedStreamPtr stream); + + private: + int currentDevice_; + SharedStreamPtr stream_; + }; + + class ScopedContextGetterBase : public ScopedContextBase { + public: + template + const T& get(const Product& data) { + synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + return data.data_; + } + + template + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + return get(iEvent.get(token)); + } + + protected: + template + ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {} + + void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); + }; + + class ScopedContextHolderHelper { + public: + ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : waitingTaskHolder_{std::move(waitingTaskHolder)} {} + + template + void pushNextTask(F&& f, ContextState const* state); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(int device, cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + }; + } // namespace impl + + /** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class ScopedContextAcquire : public impl::ScopedContextGetterBase { + public: + /// Constructor to create a new CUDA stream (no need for context beyond acquire()) + explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to create a new CUDA stream, and the context is needed after acquire() + explicit ScopedContextAcquire(edm::StreamID streamID, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + ContextState& state) + : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) + explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() + explicit ScopedContextAcquire(const ProductBase& data, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + ContextState& state) + : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + ~ScopedContextAcquire(); + + template + void pushNextTask(F&& f) { + if (contextState_ == nullptr) + throwNoState(); + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + private: + void throwNoState(); + + impl::ScopedContextHolderHelper holderHelper_; + ContextState* contextState_ = nullptr; + }; + + /** + * The aim of this class is to do 
necessary per-event "initialization" in ExternalWork produce() or normal produce(): + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class ScopedContextProduce : public impl::ScopedContextGetterBase { + public: + /// Constructor to create a new CUDA stream (non-ExternalWork module) + explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {} + + /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module) + explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {} + + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit ScopedContextProduce(ContextState& state) + : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + + /// Record the CUDA event, all asynchronous work must have been queued before the destructor + ~ScopedContextProduce(); + + template + std::unique_ptr> wrap(T data) { + // make_unique doesn't work because of private constructor + return std::unique_ptr>(new Product(device(), streamPtr(), event_, std::move(data))); + } + + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { + return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); + } + + private: + friend class cudatest::TestScopedContext; + + // This construcor is only meant for testing + explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event) + : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} + + // create the CUDA Event upfront to catch possible errors from its creation + SharedEventPtr event_ = getEventCache().get(); + }; + + /** + * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class ScopedContextTask : public impl::ScopedContextBase { + public: + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards + holderHelper_{std::move(waitingTaskHolder)}, + contextState_{state} {} + + ~ScopedContextTask(); + + template + void pushNextTask(F&& f) { + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + private: + impl::ScopedContextHolderHelper holderHelper_; + ContextState const* contextState_; + }; + + /** + * The aim of this class is to do necessary per-event "initialization" in analyze() + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. 
+ */ + class ScopedContextAnalyze : public impl::ScopedContextGetterBase { + public: + /// Constructor to (possibly) re-use a CUDA stream + explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {} + }; + + namespace impl { + template + void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ + edm::make_waiting_task_with_holder(tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(ScopedContextTask{state, std::move(h)}); + })}); + } + } // namespace impl + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/ScopedSetDevice.h b/src/cudacompat/CUDACore/ScopedSetDevice.h new file mode 100644 index 000000000..68b252c59 --- /dev/null +++ b/src/cudacompat/CUDACore/ScopedSetDevice.h @@ -0,0 +1,30 @@ +#ifndef HeterogeneousCore_CUDAUtilities_ScopedSetDevice_h +#define HeterogeneousCore_CUDAUtilities_ScopedSetDevice_h + +#include "CUDACore/cudaCheck.h" + +#include + +namespace cms { + namespace cuda { + class ScopedSetDevice { + public: + explicit ScopedSetDevice(int newDevice) { + cudaCheck(cudaGetDevice(&prevDevice_)); + cudaCheck(cudaSetDevice(newDevice)); + } + + ~ScopedSetDevice() { + // Intentionally don't check the return value to avoid + // exceptions to be thrown. If this call fails, the process is + // doomed anyway. + cudaSetDevice(prevDevice_); + } + + private: + int prevDevice_; + }; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/SharedEventPtr.h b/src/cudacompat/CUDACore/SharedEventPtr.h new file mode 100644 index 000000000..7aa10327a --- /dev/null +++ b/src/cudacompat/CUDACore/SharedEventPtr.h @@ -0,0 +1,18 @@ +#ifndef HeterogeneousCore_CUDAUtilities_SharedEventPtr_h +#define HeterogeneousCore_CUDAUtilities_SharedEventPtr_h + +#include +#include + +#include + +namespace cms { + namespace cuda { + // cudaEvent_t itself is a typedef for a pointer, for the use with + // edm::ReusableObjectHolder the pointed-to type is more interesting + // to avoid extra layer of indirection + using SharedEventPtr = std::shared_ptr>; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/SharedStreamPtr.h b/src/cudacompat/CUDACore/SharedStreamPtr.h new file mode 100644 index 000000000..14f54e35f --- /dev/null +++ b/src/cudacompat/CUDACore/SharedStreamPtr.h @@ -0,0 +1,18 @@ +#ifndef HeterogeneousCore_CUDAUtilities_SharedStreamPtr_h +#define HeterogeneousCore_CUDAUtilities_SharedStreamPtr_h + +#include +#include + +#include + +namespace cms { + namespace cuda { + // cudaStream_t itself is a typedef for a pointer, for the use with + // edm::ReusableObjectHolder the pointed-to type is more interesting + // to avoid extra layer of indirection + using SharedStreamPtr = std::shared_ptr>; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/SimpleVector.h b/src/cudacompat/CUDACore/SimpleVector.h new file mode 100644 index 000000000..f21f51cf8 --- /dev/null +++ b/src/cudacompat/CUDACore/SimpleVector.h @@ -0,0 +1,141 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h +#define HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 + +#include +#include + +#include "CUDACore/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // 
ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + __device__ inline T &back() { return m_data[m_size - 1]; } + + __device__ inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + // thread safe version of resize + __device__ int extend(int size = 1) { + auto previousSize = atomicAdd(&m_size, size); + if (previousSize < m_capacity) { + return previousSize; + } else { + atomicSub(&m_size, size); + return -1; + } + } + + __device__ int shrink(int size = 1) { + auto previousSize = atomicSub(&m_size, size); + if (previousSize >= size) { + return previousSize - size; + } else { + atomicAdd(&m_size, size); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h diff --git a/src/cudacompat/CUDACore/StreamCache.cc b/src/cudacompat/CUDACore/StreamCache.cc new file mode 100644 index 000000000..c6811683c --- /dev/null +++ b/src/cudacompat/CUDACore/StreamCache.cc @@ -0,0 +1,43 @@ +#include "CUDACore/StreamCache.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/currentDevice.h" +#include "CUDACore/deviceCount.h" +#include "CUDACore/ScopedSetDevice.h" + +namespace cms::cuda { + 
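Returning to SimpleVector above, before the StreamCache implementation continues: it is a fixed-capacity vector view over caller-owned storage, whose push_back variants return the insertion index or -1 once the capacity is exhausted. A small host-side sketch (buffer size and values are illustrative):

#include <cassert>

#include "CUDACore/SimpleVector.h"

int main() {
  int storage[4];                                          // ownership stays with the caller
  auto v = cms::cuda::make_SimpleVector<int>(4, storage);  // size() == 0, capacity() == 4
  assert(v.push_back_unsafe(42) == 0);                     // returns the index of the new element
  assert(v.size() == 1 && v[0] == 42);
  v.resize(4);
  assert(v.full() && v.push_back_unsafe(7) == -1);         // full: element dropped, -1 returned
  // In device code the thread-safe push_back()/extend() use atomicAdd on m_size instead.
  return 0;
}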
void StreamCache::Deleter::operator()(cudaStream_t stream) const { + if (device_ != -1) { + ScopedSetDevice deviceGuard{device_}; + cudaCheck(cudaStreamDestroy(stream)); + } + } + + // StreamCache should be constructed by the first call to + // getStreamCache() only if we have CUDA devices present + StreamCache::StreamCache() : cache_(deviceCount()) {} + + SharedStreamPtr StreamCache::get() { + const auto dev = currentDevice(); + return cache_[dev].makeOrGet([dev]() { + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + return std::unique_ptr(stream, Deleter{dev}); + }); + } + + void StreamCache::clear() { + // Reset the contents of the caches, but leave an + // edm::ReusableObjectHolder alive for each device. This is needed + // mostly for the unit tests, where the function-static + // StreamCache lives through multiple tests (and go through + // multiple shutdowns of the framework). + cache_.clear(); + cache_.resize(deviceCount()); + } + + StreamCache& getStreamCache() { + // the public interface is thread safe + static StreamCache cache; + return cache; + } +} // namespace cms::cuda diff --git a/src/cudacompat/CUDACore/StreamCache.h b/src/cudacompat/CUDACore/StreamCache.h new file mode 100644 index 000000000..af705b295 --- /dev/null +++ b/src/cudacompat/CUDACore/StreamCache.h @@ -0,0 +1,50 @@ +#ifndef HeterogeneousCore_CUDAUtilities_StreamCache_h +#define HeterogeneousCore_CUDAUtilities_StreamCache_h + +#include + +#include + +#include "Framework/ReusableObjectHolder.h" +#include "CUDACore/SharedStreamPtr.h" + +class CUDAService; + +namespace cms { + namespace cuda { + class StreamCache { + public: + using BareStream = SharedStreamPtr::element_type; + + StreamCache(); + + // Gets a (cached) CUDA stream for the current device. The stream + // will be returned to the cache by the shared_ptr destructor. + // This function is thread safe + SharedStreamPtr get(); + + private: + friend class ::CUDAService; + // not thread safe, intended to be called only from CUDAService destructor + void clear(); + + class Deleter { + public: + Deleter() = default; + Deleter(int d) : device_{d} {} + void operator()(cudaStream_t stream) const; + + private: + int device_ = -1; + }; + + std::vector> cache_; + }; + + // Gets the global instance of a StreamCache + // This function is thread safe + StreamCache& getStreamCache(); + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/VecArray.h b/src/cudacompat/CUDACore/VecArray.h new file mode 100644 index 000000000..595238ecd --- /dev/null +++ b/src/cudacompat/CUDACore/VecArray.h @@ -0,0 +1,106 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_VecArray_h +#define HeterogeneousCore_CUDAUtilities_interface_VecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include "CUDACore/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... 
args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_VecArray_h diff --git a/src/cudacompat/CUDACore/allocate_device.cc b/src/cudacompat/CUDACore/allocate_device.cc new file mode 100644 index 000000000..ec55d0dc0 --- /dev/null +++ b/src/cudacompat/CUDACore/allocate_device.cc @@ -0,0 +1,39 @@ +#include + +#include "CUDACore/ScopedSetDevice.h" +#include "CUDACore/allocate_device.h" +#include "CUDACore/cudaCheck.h" + +#include "getCachingDeviceAllocator.h" + +namespace { + const size_t maxAllocationSize = + notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin); +} + +namespace cms::cuda { + void *allocate_device(int dev, size_t nbytes, cudaStream_t stream) { + void *ptr = nullptr; + if constexpr (allocator::useCaching) { + if (nbytes > maxAllocationSize) { + throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) + + " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); + } + cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream)); + } else { + ScopedSetDevice setDeviceForThisScope(dev); + cudaCheck(cudaMalloc(&ptr, nbytes)); + } + return ptr; + } + + void free_device(int device, void *ptr) { + if constexpr (allocator::useCaching) { + cudaCheck(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr)); + } else { + ScopedSetDevice setDeviceForThisScope(device); + cudaCheck(cudaFree(ptr)); + } + } + +} // namespace cms::cuda diff --git a/src/cudacompat/CUDACore/allocate_device.h b/src/cudacompat/CUDACore/allocate_device.h new file mode 100644 
index 000000000..9c271fc2f --- /dev/null +++ b/src/cudacompat/CUDACore/allocate_device.h @@ -0,0 +1,16 @@ +#ifndef HeterogeneousCore_CUDAUtilities_allocate_device_h +#define HeterogeneousCore_CUDAUtilities_allocate_device_h + +#include + +namespace cms { + namespace cuda { + // Allocate device memory + void *allocate_device(int dev, size_t nbytes, cudaStream_t stream); + + // Free device memory (to be called from unique_ptr) + void free_device(int device, void *ptr); + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/allocate_host.cc b/src/cudacompat/CUDACore/allocate_host.cc new file mode 100644 index 000000000..2f6a79bb5 --- /dev/null +++ b/src/cudacompat/CUDACore/allocate_host.cc @@ -0,0 +1,36 @@ +#include + +#include "CUDACore/allocate_host.h" +#include "CUDACore/cudaCheck.h" + +#include "getCachingHostAllocator.h" + +namespace { + const size_t maxAllocationSize = + notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin); +} + +namespace cms::cuda { + void *allocate_host(size_t nbytes, cudaStream_t stream) { + void *ptr = nullptr; + if constexpr (allocator::useCaching) { + if (nbytes > maxAllocationSize) { + throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) + + " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); + } + cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream)); + } else { + cudaCheck(cudaMallocHost(&ptr, nbytes)); + } + return ptr; + } + + void free_host(void *ptr) { + if constexpr (allocator::useCaching) { + cudaCheck(allocator::getCachingHostAllocator().HostFree(ptr)); + } else { + cudaCheck(cudaFreeHost(ptr)); + } + } + +} // namespace cms::cuda diff --git a/src/cudacompat/CUDACore/allocate_host.h b/src/cudacompat/CUDACore/allocate_host.h new file mode 100644 index 000000000..1bba45800 --- /dev/null +++ b/src/cudacompat/CUDACore/allocate_host.h @@ -0,0 +1,16 @@ +#ifndef HeterogeneousCore_CUDAUtilities_allocate_host_h +#define HeterogeneousCore_CUDAUtilities_allocate_host_h + +#include + +namespace cms { + namespace cuda { + // Allocate pinned host memory (to be called from unique_ptr) + void *allocate_host(size_t nbytes, cudaStream_t stream); + + // Free pinned host memory (to be called from unique_ptr) + void free_host(void *ptr); + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/chooseDevice.cc b/src/cudacompat/CUDACore/chooseDevice.cc new file mode 100644 index 000000000..fd3ab4a07 --- /dev/null +++ b/src/cudacompat/CUDACore/chooseDevice.cc @@ -0,0 +1,14 @@ +#include "chooseDevice.h" +#include "deviceCount.h" + +namespace cms::cuda { + int chooseDevice(edm::StreamID id) { + // For startes we "statically" assign the device based on + // edm::Stream number. This is suboptimal if the number of + // edm::Streams is not a multiple of the number of CUDA devices + // (and even then there is no load balancing). 
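A standalone sketch of the round-robin rule described above and implemented just below as id % deviceCount() (the device count is hard-coded here purely for illustration; the real code queries the CUDA runtime):

#include <cstdio>

// illustrative stand-in for cms::cuda::chooseDevice(edm::StreamID)
int chooseDeviceSketch(int streamId, int nDevices) { return streamId % nDevices; }

int main() {
  for (int id = 0; id < 5; ++id)
    std::printf("edm::Stream %d -> CUDA device %d\n", id, chooseDeviceSketch(id, 2));
  // With 3 edm::Streams and 2 devices, device 0 serves streams {0, 2} while
  // device 1 serves only {1}: the load imbalance the comment above refers to.
  return 0;
}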
+ // + // TODO: improve the "assignment" logic + return id % deviceCount(); + } +} // namespace cms::cuda diff --git a/src/cudacompat/CUDACore/chooseDevice.h b/src/cudacompat/CUDACore/chooseDevice.h new file mode 100644 index 000000000..cd64d7e0b --- /dev/null +++ b/src/cudacompat/CUDACore/chooseDevice.h @@ -0,0 +1,10 @@ +#ifndef HeterogeneousCore_CUDACore_chooseDevice_h +#define HeterogeneousCore_CUDACore_chooseDevice_h + +#include "Framework/Event.h" + +namespace cms::cuda { + int chooseDevice(edm::StreamID id); +} + +#endif diff --git a/src/cudacompat/CUDACore/copyAsync.h b/src/cudacompat/CUDACore/copyAsync.h new file mode 100644 index 000000000..47e55c74a --- /dev/null +++ b/src/cudacompat/CUDACore/copyAsync.h @@ -0,0 +1,69 @@ +#ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h +#define HeterogeneousCore_CUDAUtilities_copyAsync_h + +#include "CUDACore/cudaCheck.h" +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_noncached_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" + +#include + +namespace cms { + namespace cuda { + + // Single element + + template + inline void copyAsync(device::unique_ptr& dst, const host::unique_ptr& src, cudaStream_t stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array::value == false, + "For array types, use the other overload with the size parameter"); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); + } + + template + inline void copyAsync(device::unique_ptr& dst, const host::noncached::unique_ptr& src, cudaStream_t stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array::value == false, + "For array types, use the other overload with the size parameter"); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); + } + + template + inline void copyAsync(host::unique_ptr& dst, const device::unique_ptr& src, cudaStream_t stream) { + static_assert(std::is_array::value == false, + "For array types, use the other overload with the size parameter"); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyDeviceToHost, stream)); + } + + // Multiple elements + + template + inline void copyAsync(device::unique_ptr& dst, + const host::unique_ptr& src, + size_t nelements, + cudaStream_t stream) { + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); + } + + template + inline void copyAsync(device::unique_ptr& dst, + const host::noncached::unique_ptr& src, + size_t nelements, + cudaStream_t stream) { + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); + } + + template + inline void copyAsync(host::unique_ptr& dst, + const device::unique_ptr& src, + size_t nelements, + cudaStream_t stream) { + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyDeviceToHost, stream)); + } + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/cudaCheck.h b/src/cudacompat/CUDACore/cudaCheck.h new file mode 100644 index 000000000..821bfcff2 --- /dev/null +++ b/src/cudacompat/CUDACore/cudaCheck.h @@ -0,0 +1,61 @@ +#ifndef HeterogeneousCore_CUDAUtilities_cudaCheck_h +#define HeterogeneousCore_CUDAUtilities_cudaCheck_h + +// C++ standard headers +#include +#include +#include + +// CUDA headers +#include 
+#include + +namespace cms { + namespace cuda { + + [[noreturn]] inline void abortOnCudaError(const char* file, + int line, + const char* cmd, + const char* error, + const char* message, + const char* description = nullptr) { + std::ostringstream out; + out << "\n"; + out << file << ", line " << line << ":\n"; + out << "cudaCheck(" << cmd << ");\n"; + out << error << ": " << message << "\n"; + if (description) + out << description << "\n"; + throw std::runtime_error(out.str()); + } + + inline bool cudaCheck_( + const char* file, int line, const char* cmd, CUresult result, const char* description = nullptr) { + if (result == CUDA_SUCCESS) + return true; + + const char* error; + const char* message; + cuGetErrorName(result, &error); + cuGetErrorString(result, &message); + abortOnCudaError(file, line, cmd, error, message, description); + return false; + } + + inline bool cudaCheck_( + const char* file, int line, const char* cmd, cudaError_t result, const char* description = nullptr) { + if (result == cudaSuccess) + return true; + + const char* error = cudaGetErrorName(result); + const char* message = cudaGetErrorString(result); + abortOnCudaError(file, line, cmd, error, message, description); + return false; + } + + } // namespace cuda +} // namespace cms + +#define cudaCheck(ARG, ...) (cms::cuda::cudaCheck_(__FILE__, __LINE__, #ARG, (ARG), ##__VA_ARGS__)) + +#endif // HeterogeneousCore_CUDAUtilities_cudaCheck_h diff --git a/src/cudacompat/CUDACore/cudaCompat.cc b/src/cudacompat/CUDACore/cudaCompat.cc new file mode 100644 index 000000000..e6bb8069d --- /dev/null +++ b/src/cudacompat/CUDACore/cudaCompat.cc @@ -0,0 +1,17 @@ +#include "CUDACore/cudaCompat.h" + +namespace cms { + namespace cudacompat { + thread_local dim3 blockIdx; + thread_local dim3 gridDim; + } // namespace cudacompat +} // namespace cms + +namespace { + struct InitGrid { + InitGrid() { cms::cudacompat::resetGrid(); } + }; + + const InitGrid initGrid; + +} // namespace diff --git a/src/cudacompat/CUDACore/cudaCompat.h b/src/cudacompat/CUDACore/cudaCompat.h new file mode 100644 index 000000000..f9b4b2f8a --- /dev/null +++ b/src/cudacompat/CUDACore/cudaCompat.h @@ -0,0 +1,112 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h +#define HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h + +/* + * Everything you need to run cuda code in plain sequential c++ code + */ + +#ifndef __CUDACC__ + +#include +#include +#include + +#include + +namespace cms { + namespace cudacompat { + +#ifndef __CUDA_RUNTIME_H__ + struct dim3 { + uint32_t x, y, z; + }; +#endif + const dim3 threadIdx = {0, 0, 0}; + const dim3 blockDim = {1, 1, 1}; + + extern thread_local dim3 blockIdx; + extern thread_local dim3 gridDim; + + template + T1 atomicCAS(T1* address, T1 compare, T2 val) { + T1 old = *address; + *address = old == compare ? 
val : old; + return old; + } + + template + T1 atomicInc(T1* a, T2 b) { + auto ret = *a; + if ((*a) < T1(b)) + (*a)++; + return ret; + } + + template + T1 atomicAdd(T1* a, T2 b) { + auto ret = *a; + (*a) += b; + return ret; + } + + template + T1 atomicSub(T1* a, T2 b) { + auto ret = *a; + (*a) -= b; + return ret; + } + + template + T1 atomicMin(T1* a, T2 b) { + auto ret = *a; + *a = std::min(*a, T1(b)); + return ret; + } + template + T1 atomicMax(T1* a, T2 b) { + auto ret = *a; + *a = std::max(*a, T1(b)); + return ret; + } + + inline void __syncthreads() {} + inline void __threadfence() {} + inline bool __syncthreads_or(bool x) { return x; } + inline bool __syncthreads_and(bool x) { return x; } + template + inline T __ldg(T const* x) { + return *x; + } + + inline void resetGrid() { + blockIdx = {0, 0, 0}; + gridDim = {1, 1, 1}; + } + + } // namespace cudacompat +} // namespace cms + +// some not needed as done by cuda runtime... +#ifndef __CUDA_RUNTIME_H__ +#define __host__ +#define __device__ +#define __global__ +#define __shared__ +#define __forceinline__ +#endif + +// make sure function are inlined to avoid multiple definition +#ifndef __CUDA_ARCH__ +#undef __global__ +#define __global__ inline __attribute__((always_inline)) +#undef __forceinline__ +#define __forceinline__ inline __attribute__((always_inline)) +#endif + +#ifndef __CUDA_ARCH__ +using namespace cms::cudacompat; +#endif + +#endif + +#endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h diff --git a/src/cudacompat/CUDACore/cuda_assert.h b/src/cudacompat/CUDACore/cuda_assert.h new file mode 100644 index 000000000..f3f452dc3 --- /dev/null +++ b/src/cudacompat/CUDACore/cuda_assert.h @@ -0,0 +1,18 @@ +// The omission of #include guards is on purpose: it does make sense to #include +// this file multiple times, setting a different value of GPU_DEBUG beforehand. 
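Before the assert handling below, a sketch of why the cudaCompat fallback above works: kernels in this code base are written with the canonical grid-strided loop, and with blockDim = gridDim = 1 that loop degenerates into a plain serial loop. The kernel here is a hypothetical example, not part of this patch:

#include "CUDACore/cudaCompat.h"

// Compiled by nvcc this is an ordinary CUDA kernel; compiled as plain C++ the
// macros above turn __global__ into an inline host function, threadIdx.x == 0,
// blockDim.x == gridDim.x == 1, so the loop runs i = 0 .. n-1 serially and
// atomicAdd falls back to the sequential cms::cudacompat version.
__global__ void countAbove(float const* x, int n, float cut, int* result) {
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
  for (int i = first; i < n; i += stride) {
    if (x[i] > cut)
      atomicAdd(result, 1);
  }
}

// Host-only usage is then a plain function call, no <<<...>>> launch:
//   int nAbove = 0;
//   countAbove(data, n, 0.5f, &nAbove);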
+ +#ifdef __CUDA_ARCH__ +#ifndef GPU_DEBUG +// disable asserts +#ifndef NDEBUG +#define NDEBUG +#endif +#else +// enable asserts +#ifdef NDEBUG +#undef NDEBUG +#endif +#endif +#endif // __CUDA_ARCH__ + +#include diff --git a/src/cudacompat/CUDACore/cuda_cxx17.h b/src/cudacompat/CUDACore/cuda_cxx17.h new file mode 100644 index 000000000..89f131edd --- /dev/null +++ b/src/cudacompat/CUDACore/cuda_cxx17.h @@ -0,0 +1,63 @@ +#ifndef HeterogeneousCore_CUDAUtilities_cuda_cxx17_h +#define HeterogeneousCore_CUDAUtilities_cuda_cxx17_h + +#include + +// CUDA does not support C++17 yet, so we define here some of the missing library functions +#if __cplusplus <= 201402L + +namespace std { + + // from https://en.cppreference.com/w/cpp/iterator/size + template + constexpr auto size(const C& c) -> decltype(c.size()) { + return c.size(); + } + + template + constexpr std::size_t size(const T (&array)[N]) noexcept { + return N; + } + + // from https://en.cppreference.com/w/cpp/iterator/empty + template + constexpr auto empty(const C& c) -> decltype(c.empty()) { + return c.empty(); + } + + template + constexpr bool empty(const T (&array)[N]) noexcept { + return false; + } + + template + constexpr bool empty(std::initializer_list il) noexcept { + return il.size() == 0; + } + + // from https://en.cppreference.com/w/cpp/iterator/data + template + constexpr auto data(C& c) -> decltype(c.data()) { + return c.data(); + } + + template + constexpr auto data(const C& c) -> decltype(c.data()) { + return c.data(); + } + + template + constexpr T* data(T (&array)[N]) noexcept { + return array; + } + + template + constexpr const E* data(std::initializer_list il) noexcept { + return il.begin(); + } + +} // namespace std + +#endif + +#endif // HeterogeneousCore_CUDAUtilities_cuda_cxx17_h diff --git a/src/cudacompat/CUDACore/cudastdAlgorithm.h b/src/cudacompat/CUDACore/cudastdAlgorithm.h new file mode 100644 index 000000000..4ff01d75a --- /dev/null +++ b/src/cudacompat/CUDACore/cudastdAlgorithm.h @@ -0,0 +1,70 @@ +#ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h +#define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h + +#include + +#include + +// reimplementation of std algorithms able to compile with CUDA and run on GPUs, +// mostly by declaringthem constexpr + +namespace cuda_std { + + template + struct less { + __host__ __device__ constexpr bool operator()(const T &lhs, const T &rhs) const { return lhs < rhs; } + }; + + template <> + struct less { + template + __host__ __device__ constexpr bool operator()(const T &lhs, const U &rhs) const { + return lhs < rhs; + } + }; + + template > + __host__ __device__ constexpr RandomIt lower_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) { + auto count = last - first; + + while (count > 0) { + auto it = first; + auto step = count / 2; + it += step; + if (comp(*it, value)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + + template > + __host__ __device__ constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) { + auto count = last - first; + + while (count > 0) { + auto it = first; + auto step = count / 2; + it += step; + if (!comp(value, *it)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + + template > + __host__ __device__ constexpr RandomIt binary_find(RandomIt first, RandomIt last, const T &value, Compare comp = {}) { + first = cuda_std::lower_bound(first, last, value, comp); + return first != last && 
!comp(value, *first) ? first : last; + } + +} // namespace cuda_std + +#endif // HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h diff --git a/src/cudacompat/CUDACore/currentDevice.h b/src/cudacompat/CUDACore/currentDevice.h new file mode 100644 index 000000000..ee50102fa --- /dev/null +++ b/src/cudacompat/CUDACore/currentDevice.h @@ -0,0 +1,18 @@ +#ifndef HeterogenousCore_CUDAUtilities_currentDevice_h +#define HeterogenousCore_CUDAUtilities_currentDevice_h + +#include "CUDACore/cudaCheck.h" + +#include + +namespace cms { + namespace cuda { + inline int currentDevice() { + int dev; + cudaCheck(cudaGetDevice(&dev)); + return dev; + } + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/deviceAllocatorStatus.cc b/src/cudacompat/CUDACore/deviceAllocatorStatus.cc new file mode 100644 index 000000000..5d4a0ca09 --- /dev/null +++ b/src/cudacompat/CUDACore/deviceAllocatorStatus.cc @@ -0,0 +1,7 @@ +#include "CUDACore/deviceAllocatorStatus.h" + +#include "getCachingDeviceAllocator.h" + +namespace cms::cuda { + allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); } +} // namespace cms::cuda diff --git a/src/cudacompat/CUDACore/deviceAllocatorStatus.h b/src/cudacompat/CUDACore/deviceAllocatorStatus.h new file mode 100644 index 000000000..92f9f87e8 --- /dev/null +++ b/src/cudacompat/CUDACore/deviceAllocatorStatus.h @@ -0,0 +1,23 @@ +#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h +#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h + +#include + +namespace cms { + namespace cuda { + namespace allocator { + struct TotalBytes { + size_t free; + size_t live; + size_t liveRequested; // CMS: monitor also requested amount + TotalBytes() { free = live = liveRequested = 0; } + }; + /// Map type of device ordinals to the number of cached bytes cached by each device + using GpuCachedBytes = std::map; + } // namespace allocator + + allocator::GpuCachedBytes deviceAllocatorStatus(); + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/deviceCount.h b/src/cudacompat/CUDACore/deviceCount.h new file mode 100644 index 000000000..51396e88f --- /dev/null +++ b/src/cudacompat/CUDACore/deviceCount.h @@ -0,0 +1,18 @@ +#ifndef HeterogenousCore_CUDAUtilities_deviceCount_h +#define HeterogenousCore_CUDAUtilities_deviceCount_h + +#include "CUDACore/cudaCheck.h" + +#include + +namespace cms { + namespace cuda { + inline int deviceCount() { + int ndevices; + cudaCheck(cudaGetDeviceCount(&ndevices)); + return ndevices; + } + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/device_unique_ptr.h b/src/cudacompat/CUDACore/device_unique_ptr.h new file mode 100644 index 000000000..2c78117c8 --- /dev/null +++ b/src/cudacompat/CUDACore/device_unique_ptr.h @@ -0,0 +1,101 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_device_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_device_unique_ptr_h + +#include +#include + +#include "CUDACore/allocate_device.h" +#include "CUDACore/currentDevice.h" + +namespace cms { + namespace cuda { + namespace device { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class DeviceDeleter { + public: + DeviceDeleter() = default; // for edm::Wrapper + DeviceDeleter(int device) : device_{device} {} + + void operator()(void *ptr) { + if (device_ >= 0) { + free_device(device_, ptr); + } + } + + private: + int device_ = -1; + }; + } // namespace impl + + 
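The cuda_std algorithms above are drop-in binary searches usable from both host and device code; binary_find returns last when the value is absent. A host-side sketch (compiled with nvcc, or in any setup where the __host__/__device__ macros are defined):

#include <cassert>

#include "CUDACore/cudastdAlgorithm.h"

int main() {
  constexpr int a[] = {1, 3, 3, 7, 9};
  auto lb = cuda_std::lower_bound(a, a + 5, 3);    // first element >= 3 -> a + 1
  auto ub = cuda_std::upper_bound(a, a + 5, 3);    // first element >  3 -> a + 3
  auto hit = cuda_std::binary_find(a, a + 5, 7);   // found     -> a + 3
  auto miss = cuda_std::binary_find(a, a + 5, 4);  // not found -> a + 5 (== last)
  assert(lb == a + 1 && ub == a + 3 && hit == a + 3 && miss == a + 5);
  return 0;
}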
template + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_device_unique_selector { + using non_array = cms::cuda::device::unique_ptr; + }; + template + struct make_device_unique_selector { + using unbounded_array = cms::cuda::device::unique_ptr; + }; + template + struct make_device_unique_selector { + struct bounded_array {}; + }; + } // namespace impl + } // namespace device + + template + typename device::impl::make_device_unique_selector::non_array make_device_unique(cudaStream_t stream) { + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the device memory is not supported"); + int dev = currentDevice(); + void *mem = allocate_device(dev, sizeof(T), stream); + return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), + device::impl::DeviceDeleter{dev}}; + } + + template + typename device::impl::make_device_unique_selector::unbounded_array make_device_unique(size_t n, + cudaStream_t stream) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the device memory is not supported"); + int dev = currentDevice(); + void *mem = allocate_device(dev, n * sizeof(element_type), stream); + return typename device::impl::make_device_unique_selector::unbounded_array{ + reinterpret_cast(mem), device::impl::DeviceDeleter{dev}}; + } + + template + typename device::impl::make_device_unique_selector::bounded_array make_device_unique(Args &&...) = delete; + + // No check for the trivial constructor, make it clear in the interface + template + typename device::impl::make_device_unique_selector::non_array make_device_unique_uninitialized( + cudaStream_t stream) { + int dev = currentDevice(); + void *mem = allocate_device(dev, sizeof(T), stream); + return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), + device::impl::DeviceDeleter{dev}}; + } + + template + typename device::impl::make_device_unique_selector::unbounded_array make_device_unique_uninitialized( + size_t n, cudaStream_t stream) { + using element_type = typename std::remove_extent::type; + int dev = currentDevice(); + void *mem = allocate_device(dev, n * sizeof(element_type), stream); + return typename device::impl::make_device_unique_selector::unbounded_array{ + reinterpret_cast(mem), device::impl::DeviceDeleter{dev}}; + } + + template + typename device::impl::make_device_unique_selector::bounded_array make_device_unique_uninitialized(Args &&...) 
= + delete; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/eigenSoA.h b/src/cudacompat/CUDACore/eigenSoA.h new file mode 100644 index 000000000..4b9672f75 --- /dev/null +++ b/src/cudacompat/CUDACore/eigenSoA.h @@ -0,0 +1,55 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h +#define HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h + +#include +#include +#include + +#include + +#include "CUDACore/cudaCompat.h" + +namespace eigenSoA { + + constexpr bool isPowerOf2(int32_t v) { return v && !(v & (v - 1)); } + + template + class alignas(128) ScalarSoA { + public: + using Scalar = T; + + __host__ __device__ constexpr Scalar& operator()(int32_t i) { return data_[i]; } + __device__ constexpr const Scalar operator()(int32_t i) const { return __ldg(data_ + i); } + __host__ __device__ constexpr Scalar& operator[](int32_t i) { return data_[i]; } + __device__ constexpr const Scalar operator[](int32_t i) const { return __ldg(data_ + i); } + + __host__ __device__ constexpr Scalar* data() { return data_; } + __host__ __device__ constexpr Scalar const* data() const { return data_; } + + private: + Scalar data_[S]; + static_assert(isPowerOf2(S), "SoA stride not a power of 2"); + static_assert(sizeof(data_) % 128 == 0, "SoA size not a multiple of 128"); + }; + + template + class alignas(128) MatrixSoA { + public: + using Scalar = typename M::Scalar; + using Map = Eigen::Map >; + using CMap = Eigen::Map >; + + __host__ __device__ constexpr Map operator()(int32_t i) { return Map(data_ + i); } + __host__ __device__ constexpr CMap operator()(int32_t i) const { return CMap(data_ + i); } + __host__ __device__ constexpr Map operator[](int32_t i) { return Map(data_ + i); } + __host__ __device__ constexpr CMap operator[](int32_t i) const { return CMap(data_ + i); } + + private: + Scalar data_[S * M::RowsAtCompileTime * M::ColsAtCompileTime]; + static_assert(isPowerOf2(S), "SoA stride not a power of 2"); + static_assert(sizeof(data_) % 128 == 0, "SoA size not a multiple of 128"); + }; + +} // namespace eigenSoA + +#endif // HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h diff --git a/src/cudacompat/CUDACore/eventWorkHasCompleted.h b/src/cudacompat/CUDACore/eventWorkHasCompleted.h new file mode 100644 index 000000000..1d814acdf --- /dev/null +++ b/src/cudacompat/CUDACore/eventWorkHasCompleted.h @@ -0,0 +1,32 @@ +#ifndef HeterogeneousCore_CUDAUtilities_eventWorkHasCompleted_h +#define HeterogeneousCore_CUDAUtilities_eventWorkHasCompleted_h + +#include "CUDACore/cudaCheck.h" + +#include + +namespace cms { + namespace cuda { + /** + * Returns true if the work captured by the event (=queued to the + * CUDA stream at the point of cudaEventRecord()) has completed. + * + * Returns false if any captured work is incomplete. + * + * In case of errors, throws an exception. 
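A sketch of the non-blocking poll this enables (the stream and the queued work are placeholders; in this code base events normally come from the EventCache rather than being created ad hoc):

cudaEvent_t event;
cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
// ... queue asynchronous work on `stream` ...
cudaCheck(cudaEventRecord(event, stream));

if (cms::cuda::eventWorkHasCompleted(event)) {
  // everything recorded before the event has finished: results are usable
} else {
  // still running: check again later instead of blocking in cudaEventSynchronize()
}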
+ */ + inline bool eventWorkHasCompleted(cudaEvent_t event) { + const auto ret = cudaEventQuery(event); + if (ret == cudaSuccess) { + return true; + } else if (ret == cudaErrorNotReady) { + return false; + } + // leave error case handling to cudaCheck + cudaCheck(ret); + return false; // to keep compiler happy + } + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/getCachingDeviceAllocator.h b/src/cudacompat/CUDACore/getCachingDeviceAllocator.h new file mode 100644 index 000000000..b467d4b90 --- /dev/null +++ b/src/cudacompat/CUDACore/getCachingDeviceAllocator.h @@ -0,0 +1,77 @@ +#ifndef HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator +#define HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator + +#include +#include + +#include "CUDACore/cudaCheck.h" +#include "CUDACore/deviceCount.h" +#include "CachingDeviceAllocator.h" + +namespace cms::cuda::allocator { + // Use caching or not + constexpr bool useCaching = true; + // Growth factor (bin_growth in cub::CachingDeviceAllocator + constexpr unsigned int binGrowth = 2; + // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator + constexpr unsigned int minBin = 8; + // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. + constexpr unsigned int maxBin = 30; + // Total storage for the allocator. 0 means no limit. + constexpr size_t maxCachedBytes = 0; + // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. + constexpr double maxCachedFraction = 0.8; + constexpr bool debug = false; + + inline size_t minCachedBytes() { + size_t ret = std::numeric_limits::max(); + int currentDevice; + cudaCheck(cudaGetDevice(¤tDevice)); + const int numberOfDevices = deviceCount(); + for (int i = 0; i < numberOfDevices; ++i) { + size_t freeMemory, totalMemory; + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + ret = std::min(ret, static_cast(maxCachedFraction * freeMemory)); + } + cudaCheck(cudaSetDevice(currentDevice)); + if (maxCachedBytes > 0) { + ret = std::min(ret, maxCachedBytes); + } + return ret; + } + + inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() { + if (debug) { + std::cout << "cub::CachingDeviceAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + + // the public interface is thread safe + static notcub::CachingDeviceAllocator allocator{binGrowth, + minBin, + maxBin, + minCachedBytes(), + false, // do not skip cleanup + debug}; + return 
allocator; + } +} // namespace cms::cuda::allocator + +#endif diff --git a/src/cudacompat/CUDACore/getCachingHostAllocator.h b/src/cudacompat/CUDACore/getCachingHostAllocator.h new file mode 100644 index 000000000..d29080795 --- /dev/null +++ b/src/cudacompat/CUDACore/getCachingHostAllocator.h @@ -0,0 +1,46 @@ +#ifndef HeterogeneousCore_CUDACore_src_getCachingHostAllocator +#define HeterogeneousCore_CUDACore_src_getCachingHostAllocator + +#include +#include + +#include "CUDACore/cudaCheck.h" +#include "CachingHostAllocator.h" + +#include "getCachingDeviceAllocator.h" + +namespace cms::cuda::allocator { + inline notcub::CachingHostAllocator& getCachingHostAllocator() { + if (debug) { + std::cout << "cub::CachingHostAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + + // the public interface is thread safe + static notcub::CachingHostAllocator allocator{binGrowth, + minBin, + maxBin, + minCachedBytes(), + false, // do not skip cleanup + debug}; + return allocator; + } +} // namespace cms::cuda::allocator + +#endif diff --git a/src/cudacompat/CUDACore/host_noncached_unique_ptr.h b/src/cudacompat/CUDACore/host_noncached_unique_ptr.h new file mode 100644 index 000000000..276e83778 --- /dev/null +++ b/src/cudacompat/CUDACore/host_noncached_unique_ptr.h @@ -0,0 +1,74 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h + +#include + +#include + +#include "CUDACore/cudaCheck.h" + +namespace cms { + namespace cuda { + namespace host { + namespace noncached { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class HostDeleter { + public: + void operator()(void *ptr) { cudaCheck(cudaFreeHost(ptr)); } + }; + } // namespace impl + + template + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_host_unique_selector { + using non_array = cms::cuda::host::noncached::unique_ptr; + }; + template + struct make_host_unique_selector { + using unbounded_array = cms::cuda::host::noncached::unique_ptr; + }; + template + struct make_host_unique_selector { + struct bounded_array {}; + }; + } // namespace impl + } // namespace noncached + } // namespace host + + /** + * The difference wrt. make_host_unique is that these + * do not cache, so they should not be called per-event. 
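For the caching allocators configured above, the bin arithmetic works out to power-of-two bins from 2^8 = 256 bytes up to 2^30 = 1 GiB; requests above the largest bin make allocate_device()/allocate_host() throw, as seen earlier in this patch. A small standalone check (binBytes is an illustrative reimplementation of IntPow, not part of the patch):

constexpr unsigned long long binBytes(unsigned int growth, unsigned int bin) {
  unsigned long long r = 1;
  for (unsigned int i = 0; i < bin; ++i)
    r *= growth;
  return r;
}
static_assert(binBytes(2, 8) == 256, "smallest cached bin");
static_assert(binBytes(2, 30) == (1ull << 30), "largest cached bin; larger requests fail");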
+ */ + template + typename host::noncached::impl::make_host_unique_selector::non_array make_host_noncached_unique( + unsigned int flags = cudaHostAllocDefault) { + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem; + cudaCheck(cudaHostAlloc(&mem, sizeof(T), flags)); + return typename host::noncached::impl::make_host_unique_selector::non_array(reinterpret_cast(mem)); + } + + template + typename host::noncached::impl::make_host_unique_selector::unbounded_array make_host_noncached_unique( + size_t n, unsigned int flags = cudaHostAllocDefault) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem; + cudaCheck(cudaHostAlloc(&mem, n * sizeof(element_type), flags)); + return typename host::noncached::impl::make_host_unique_selector::unbounded_array( + reinterpret_cast(mem)); + } + + template + typename host::noncached::impl::make_host_unique_selector::bounded_array make_host_noncached_unique(Args &&...) = + delete; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/host_unique_ptr.h b/src/cudacompat/CUDACore/host_unique_ptr.h new file mode 100644 index 000000000..f34798da3 --- /dev/null +++ b/src/cudacompat/CUDACore/host_unique_ptr.h @@ -0,0 +1,80 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h + +#include +#include + +#include "CUDACore/allocate_host.h" + +namespace cms { + namespace cuda { + namespace host { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class HostDeleter { + public: + void operator()(void *ptr) { cms::cuda::free_host(ptr); } + }; + } // namespace impl + + template + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_host_unique_selector { + using non_array = cms::cuda::host::unique_ptr; + }; + template + struct make_host_unique_selector { + using unbounded_array = cms::cuda::host::unique_ptr; + }; + template + struct make_host_unique_selector { + struct bounded_array {}; + }; + } // namespace impl + } // namespace host + + // Allocate pinned host memory + template + typename host::impl::make_host_unique_selector::non_array make_host_unique(cudaStream_t stream) { + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem = allocate_host(sizeof(T), stream); + return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; + } + + template + typename host::impl::make_host_unique_selector::unbounded_array make_host_unique(size_t n, cudaStream_t stream) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem = allocate_host(n * sizeof(element_type), stream); + return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; + } + + template + typename host::impl::make_host_unique_selector::bounded_array make_host_unique(Args &&...) 
= delete; + + // No check for the trivial constructor, make it clear in the interface + template + typename host::impl::make_host_unique_selector::non_array make_host_unique_uninitialized(cudaStream_t stream) { + void *mem = allocate_host(sizeof(T), stream); + return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; + } + + template + typename host::impl::make_host_unique_selector::unbounded_array make_host_unique_uninitialized( + size_t n, cudaStream_t stream) { + using element_type = typename std::remove_extent::type; + void *mem = allocate_host(n * sizeof(element_type), stream); + return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; + } + + template + typename host::impl::make_host_unique_selector::bounded_array make_host_unique_uninitialized(Args &&...) = delete; + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/launch.h b/src/cudacompat/CUDACore/launch.h new file mode 100644 index 000000000..9db7ee352 --- /dev/null +++ b/src/cudacompat/CUDACore/launch.h @@ -0,0 +1,147 @@ +#ifndef HeterogeneousCore_CUDAUtilities_launch_h +#define HeterogeneousCore_CUDAUtilities_launch_h + +#include + +#include + +#include "CUDACore/cudaCheck.h" + +/* + * `cms::cuda::launch` and `cms::cuda::launch_cooperative` are wrappers around + * the CUDA Runtime API calls to setup and call a CUDA kernel from the host. + * + * `kernel` should be a pointer to a __global__ void(...) function. + * `config` describe the launch configuration: the grid size and block size, the + * dynamic shared memory size (default to 0) and the CUDA stream to use + * (default to 0, the default stream). + * `args` are the arguments passed (by value) to the kernel. + * + * Currently this is requires an extra copy to perform the necessary implicit + * conversions and ensure that the arguments match the kernel function signature; + * the extra copy could eventually be avoided for arguments that are already of + * the exact type. + * + * Unlike the `kernel<<<...>>>(...)` syntax and the `cuda::launch(...)` + * implementation from the CUDA API Wrappers, `cms::cuda::launch(...)` and + * `cms::cuda::launch_cooperative` can be called from standard C++ host code. + * + * Possible optimisations + * + * - once C++17 is available in CUDA, replace the `pointer_setter` functor + * with a simpler function using fold expressions: + * + * template + * void pointer_setter(void* ptrs[N], Tuple const& t, std::index_sequence) + * { + * ((ptrs[Is] = & std::get(t)), ...); + * } + * + * - add a template specialisation to `launch` and `launch_cooperative` to + * avoid making a temporary copy of the parameters when they match the + * kernel signature. 
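A sketch of the intended call pattern from host-compiled C++ (vectorAdd and its arguments are placeholders; the kernel itself would be defined in a .cu file compiled by nvcc):

__global__ void vectorAdd(float const* a, float const* b, float* c, int n);

void enqueueVectorAdd(float const* a, float const* b, float* c, int n, cudaStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  // equivalent to vectorAdd<<<blocks, threads, 0, stream>>>(a, b, c, n),
  // but usable from a .cc file handled by the host compiler
  cms::cuda::launch(vectorAdd, {blocks, threads, 0, stream}, a, b, c, n);
}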
+ */ + +namespace cms { + namespace cuda { + + struct LaunchParameters { + dim3 gridDim; + dim3 blockDim; + size_t sharedMem; + cudaStream_t stream; + + LaunchParameters(dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr) + : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {} + + LaunchParameters(int gridDim, int blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr) + : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {} + }; + + namespace detail { + + template + struct kernel_traits; + + template + struct kernel_traits { + static constexpr size_t arguments_size = sizeof...(Args); + + using argument_type_tuple = std::tuple; + + template + using argument_type = typename std::tuple_element::type; + }; + + // fill an array with the pointers to the elements of a tuple + template + struct pointer_setter { + template + void operator()(void const* ptrs[], Tuple const& t) { + pointer_setter()(ptrs, t); + ptrs[I - 1] = &std::get(t); + } + }; + + template <> + struct pointer_setter<0> { + template + void operator()(void const* ptrs[], Tuple const& t) {} + }; + + } // namespace detail + + // wrappers for cudaLaunchKernel + + inline void launch(void (*kernel)(), LaunchParameters config) { + cudaCheck(cudaLaunchKernel( + (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); + } + + template +#if __cplusplus >= 201703L + std::enable_if_t::value> +#else + std::enable_if_t >::value> +#endif + launch(F* kernel, LaunchParameters config, Args&&... args) { + using function_type = detail::kernel_traits; + typename function_type::argument_type_tuple args_copy(args...); + + constexpr auto size = function_type::arguments_size; + void const* pointers[size]; + + detail::pointer_setter()(pointers, args_copy); + cudaCheck(cudaLaunchKernel( + (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); + } + + // wrappers for cudaLaunchCooperativeKernel + + inline void launch_cooperative(void (*kernel)(), LaunchParameters config) { + cudaCheck(cudaLaunchCooperativeKernel( + (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); + } + + template +#if __cplusplus >= 201703L + std::enable_if_t::value> +#else + std::enable_if_t >::value> +#endif + launch_cooperative(F* kernel, LaunchParameters config, Args&&... 
args) { + using function_type = detail::kernel_traits; + typename function_type::argument_type_tuple args_copy(args...); + + constexpr auto size = function_type::arguments_size; + void const* pointers[size]; + + detail::pointer_setter()(pointers, args_copy); + cudaCheck(cudaLaunchCooperativeKernel( + (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); + } + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_launch_h diff --git a/src/cudacompat/CUDACore/memsetAsync.h b/src/cudacompat/CUDACore/memsetAsync.h new file mode 100644 index 000000000..0f7e86422 --- /dev/null +++ b/src/cudacompat/CUDACore/memsetAsync.h @@ -0,0 +1,33 @@ +#ifndef HeterogeneousCore_CUDAUtilities_memsetAsync_h +#define HeterogeneousCore_CUDAUtilities_memsetAsync_h + +#include "CUDACore/cudaCheck.h" +#include "CUDACore/device_unique_ptr.h" + +#include + +namespace cms { + namespace cuda { + template + inline void memsetAsync(device::unique_ptr& ptr, T value, cudaStream_t stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array::value == false, + "For array types, use the other overload with the size parameter"); + cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); + } + + /** + * The type of `value` is `int` because of `cudaMemsetAsync()` takes + * it as an `int`. Note that `cudaMemsetAsync()` sets the value of + * each **byte** to `value`. This may lead to unexpected results if + * `sizeof(T) > 1` and `value != 0`. + */ + template + inline void memsetAsync(device::unique_ptr& ptr, int value, size_t nelements, cudaStream_t stream) { + cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream)); + } + } // namespace cuda +} // namespace cms + +#endif diff --git a/src/cudacompat/CUDACore/prefixScan.h b/src/cudacompat/CUDACore/prefixScan.h new file mode 100644 index 000000000..5624af03f --- /dev/null +++ b/src/cudacompat/CUDACore/prefixScan.h @@ -0,0 +1,188 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h +#define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h + +#include + +#include "CUDACore/cudaCompat.h" +#include "CUDACore/cuda_assert.h" + +#ifdef __CUDA_ARCH__ + +template +__device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { + // ci and co may be the same + auto x = ci[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, offset); + if (laneId >= offset) + x += y; + } + co[i] = x; +} + +template +__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) { + auto x = c[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, offset); + if (laneId >= offset) + x += y; + } + c[i] = x; +} + +#endif + +namespace cms { + namespace cuda { + + // limited to 32*32 elements.... 
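Before the prefix-scan kernels below, a concrete illustration of the byte-wise caveat spelled out for memsetAsync above (d_buf, n and stream are placeholders): cudaMemsetAsync() writes the low byte of value into every byte, so a non-zero value on a multi-byte type rarely does what one might expect.

auto d_buf = cms::cuda::make_device_unique<int[]>(n, stream);
cms::cuda::memsetAsync(d_buf, 0, n, stream);  // fine: every int becomes 0
cms::cuda::memsetAsync(d_buf, 1, n, stream);  // every byte 0x01: each int becomes 0x01010101 = 16843009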
+ template + __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci, + VT* co, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif + ) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(ci, co, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = co[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + co[i] += ws[warpId - 1]; + } + __syncthreads(); +#else + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; +#endif + } + + // same as above, may remove + // limited to 32*32 elements.... + template + __host__ __device__ __forceinline__ void blockPrefixScan(T* c, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif + ) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(c, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + __syncthreads(); +#else + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; +#endif + } + +#ifdef __CUDA_ARCH__ + // see https://stackoverflow.com/questions/40021086/can-i-obtain-the-amount-of-allocated-dynamic-shared-memory-from-within-a-kernel/40021087#40021087 + __device__ __forceinline__ unsigned dynamic_smem_size() { + unsigned ret; + asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret)); + return ret; + } +#endif + + // in principle not limited.... + template + __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) { + volatile T const* ci = ici; + volatile T* co = ico; + __shared__ T ws[32]; +#ifdef __CUDA_ARCH__ + assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below +#endif + assert(blockDim.x * gridDim.x >= size); + // first each block does a scan + int off = blockDim.x * blockIdx.x; + if (size - off > 0) + blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws); + + // count blocks that finished + __shared__ bool isLastBlockDone; + if (0 == threadIdx.x) { + __threadfence(); + auto value = atomicAdd(pc, 1); // block counter + isLastBlockDone = (value == (int(gridDim.x) - 1)); + } + + __syncthreads(); + + if (!isLastBlockDone) + return; + + assert(int(gridDim.x) == *pc); + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block + extern __shared__ T psum[]; + for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { + auto j = blockDim.x * i + blockDim.x - 1; + psum[i] = (j < size) ? 
co[j] : T(0); + } + __syncthreads(); + blockPrefixScan(psum, psum, gridDim.x, ws); + + // now it would have been handy to have the other blocks around... + for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) { + co[i] += psum[k]; + } + } + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h diff --git a/src/cudacompat/CUDACore/radixSort.h b/src/cudacompat/CUDACore/radixSort.h new file mode 100644 index 000000000..ff1da2d46 --- /dev/null +++ b/src/cudacompat/CUDACore/radixSort.h @@ -0,0 +1,277 @@ +#ifndef HeterogeneousCoreCUDAUtilities_radixSort_H +#define HeterogeneousCoreCUDAUtilities_radixSort_H + +#ifdef __CUDACC__ + +#include +#include + +#include "CUDACore/cuda_assert.h" + +template +__device__ inline void dummyReorder(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} + +template +__device__ inline void reorderSigned(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + int32_t first = threadIdx.x; + __shared__ uint32_t firstNeg; + firstNeg = a[ind[0]] < 0 ? 0 : size; + __syncthreads(); + + // find first negative + for (auto i = first; i < size - 1; i += blockDim.x) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + } + + __syncthreads(); + + auto ii = first; + for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + ii = size - firstNeg + threadIdx.x; + assert(ii >= 0); + for (auto i = first; i < firstNeg; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; +} + +template +__device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + int32_t first = threadIdx.x; + __shared__ uint32_t firstNeg; + firstNeg = a[ind[0]] < 0 ? 0 : size; + __syncthreads(); + + // find first negative + for (auto i = first; i < size - 1; i += blockDim.x) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + } + + __syncthreads(); + + int ii = size - firstNeg - threadIdx.x - 1; + for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) { + ind2[ii] = ind[i]; + ii -= blockDim.x; + } + __syncthreads(); + ii = size - firstNeg + threadIdx.x; + assert(ii >= 0); + for (auto i = first; i < firstNeg; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; +} + +template +__device__ __forceinline__ void radixSortImpl( + T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { + constexpr int d = 8, w = 8 * sizeof(T); + constexpr int sb = 1 << d; + constexpr int ps = int(sizeof(T)) - NS; + + __shared__ int32_t c[sb], ct[sb], cu[sb]; + + __shared__ int ibs; + __shared__ int p; + + assert(size > 0); + assert(blockDim.x >= sb); + + // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; + + p = ps; + + auto j = ind; + auto k = ind2; + + int32_t first = threadIdx.x; + for (auto i = first; i < size; i += blockDim.x) + j[i] = i; + __syncthreads(); + + while (__syncthreads_and(p < w / d)) { + if (threadIdx.x < sb) + c[threadIdx.x] = 0; + __syncthreads(); + + // fill bins + for (auto i = first; i < size; i += blockDim.x) { + auto bin = (a[j[i]] >> d * p) & (sb - 1); + atomicAdd(&c[bin], 1); + } + __syncthreads(); + + // prefix scan "optimized"???... 
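+    // (the next two blocks compute an inclusive prefix sum of the sb bin counts:
+    // a 32-wide warp scan inside each warp first, after which every thread adds
+    // the totals of the warps that precede its own)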
+ if (threadIdx.x < sb) { + auto x = c[threadIdx.x]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(0xffffffff, x, offset); + if (laneId >= offset) + x += y; + } + ct[threadIdx.x] = x; + } + __syncthreads(); + if (threadIdx.x < sb) { + auto ss = (threadIdx.x / 32) * 32 - 1; + c[threadIdx.x] = ct[threadIdx.x]; + for (int i = ss; i > 0; i -= 32) + c[threadIdx.x] += ct[i]; + } + /* + //prefix scan for the nulls (for documentation) + if (threadIdx.x==0) + for (int i = 1; i < sb; ++i) c[i] += c[i-1]; + */ + + // broadcast + ibs = size - 1; + __syncthreads(); + while (__syncthreads_and(ibs > 0)) { + int i = ibs - threadIdx.x; + if (threadIdx.x < sb) { + cu[threadIdx.x] = -1; + ct[threadIdx.x] = -1; + } + __syncthreads(); + int32_t bin = -1; + if (threadIdx.x < sb) { + if (i >= 0) { + bin = (a[j[i]] >> d * p) & (sb - 1); + ct[threadIdx.x] = bin; + atomicMax(&cu[bin], int(i)); + } + } + __syncthreads(); + if (threadIdx.x < sb) { + if (i >= 0 && i == cu[bin]) // ensure to keep them in order + for (int ii = threadIdx.x; ii < sb; ++ii) + if (ct[ii] == bin) { + auto oi = ii - threadIdx.x; + // assert(i>=oi);if(i>=oi) + k[--c[bin]] = j[i - oi]; + } + } + __syncthreads(); + if (bin >= 0) + assert(c[bin] >= 0); + if (threadIdx.x == 0) + ibs -= sb; + __syncthreads(); + } + + /* + // broadcast for the nulls (for documentation) + if (threadIdx.x==0) + for (int i=size-first-1; i>=0; i--) { // =blockDim.x) { + auto bin = (a[j[i]] >> d*p)&(sb-1); + auto ik = atomicSub(&c[bin],1); + k[ik-1] = j[i]; + } + */ + + __syncthreads(); + assert(c[0] == 0); + + // swap (local, ok) + auto t = j; + j = k; + k = t; + + if (threadIdx.x == 0) + ++p; + __syncthreads(); + } + + if ((w != 8) && (0 == (NS & 1))) + assert(j == ind); // w/d is even so ind is correct + + if (j != ind) // odd... + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; + + __syncthreads(); + + // now move negative first... (if signed) + reorder(a, ind, ind2, size); +} + +template ::value, T>::type* = nullptr> +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(a, ind, ind2, size, dummyReorder); +} + +template ::value && std::is_signed::value, T>::type* = nullptr> +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(a, ind, ind2, size, reorderSigned); +} + +template ::value, T>::type* = nullptr> +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + using I = int; + radixSortImpl((I const*)(a), ind, ind2, size, reorderFloat); +} + +template +__device__ __forceinline__ void radixSortMulti(T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace) { + extern __shared__ uint16_t ws[]; + + auto a = v + offsets[blockIdx.x]; + auto ind = index + offsets[blockIdx.x]; + auto ind2 = nullptr == workspace ? 
ws : workspace + offsets[blockIdx.x]; + auto size = offsets[blockIdx.x + 1] - offsets[blockIdx.x]; + assert(offsets[blockIdx.x + 1] >= offsets[blockIdx.x]); + if (size > 0) + radixSort(a, ind, ind2, size); +} + +namespace cms { + namespace cuda { + + template + __global__ void __launch_bounds__(256, 4) + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + template + __global__ void radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + } // namespace cuda +} // namespace cms + +#endif // __CUDACC__ + +#endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/src/cudacompat/CUDACore/requireDevices.cc b/src/cudacompat/CUDACore/requireDevices.cc new file mode 100644 index 000000000..86eb408ff --- /dev/null +++ b/src/cudacompat/CUDACore/requireDevices.cc @@ -0,0 +1,30 @@ +#include +#include + +#include + +#include "CUDACore/requireDevices.h" + +namespace cms::cudatest { + bool testDevices() { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess) { + std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." + << "\n"; + return false; + } + if (devices == 0) { + std::cerr << "No CUDA devices available, the test will be skipped." + << "\n"; + return false; + } + return true; + } + + void requireDevices() { + if (not testDevices()) { + exit(EXIT_SUCCESS); + } + } +} // namespace cms::cudatest diff --git a/src/cudacompat/CUDACore/requireDevices.h b/src/cudacompat/CUDACore/requireDevices.h new file mode 100644 index 000000000..0795175b3 --- /dev/null +++ b/src/cudacompat/CUDACore/requireDevices.h @@ -0,0 +1,17 @@ +#ifndef HeterogeneousCore_CUDAUtilities_requireDevices_h +#define HeterogeneousCore_CUDAUtilities_requireDevices_h + +/** + * These functions are meant to be called only from unit tests. 
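+ *
+ * Typical usage (a sketch): call cms::cudatest::requireDevices() at the start of a
+ * test program's main(). If no CUDA device is available, the process exits with
+ * EXIT_SUCCESS, so the test is effectively skipped rather than failed.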
+ */ +namespace cms { + namespace cudatest { + /// In presence of CUDA devices, return true; otherwise print message and return false + bool testDevices(); + + /// Print message and exit if there are no CUDA devices + void requireDevices(); + } // namespace cudatest +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_requireDevices_h diff --git a/src/cudacompat/CUDADataFormats/BeamSpotCUDA.h b/src/cudacompat/CUDADataFormats/BeamSpotCUDA.h new file mode 100644 index 000000000..a090ef347 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/BeamSpotCUDA.h @@ -0,0 +1,33 @@ +#ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h +#define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h + +#include + +#include "DataFormats/BeamSpotPOD.h" +#include "CUDACore/device_unique_ptr.h" + +class BeamSpotCUDA { +public: + // default constructor, required by cms::cuda::Product + BeamSpotCUDA() = default; + + // constructor that allocates cached device memory on the given CUDA stream + BeamSpotCUDA(cudaStream_t stream) { data_d_ = cms::cuda::make_device_unique(stream); } + + // movable, non-copiable + BeamSpotCUDA(BeamSpotCUDA const&) = delete; + BeamSpotCUDA(BeamSpotCUDA&&) = default; + BeamSpotCUDA& operator=(BeamSpotCUDA const&) = delete; + BeamSpotCUDA& operator=(BeamSpotCUDA&&) = default; + + BeamSpotPOD* data() { return data_d_.get(); } + BeamSpotPOD const* data() const { return data_d_.get(); } + + cms::cuda::device::unique_ptr& ptr() { return data_d_; } + cms::cuda::device::unique_ptr const& ptr() const { return data_d_; } + +private: + cms::cuda::device::unique_ptr data_d_; +}; + +#endif // CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h diff --git a/src/cudacompat/CUDADataFormats/HeterogeneousSoA.h b/src/cudacompat/CUDADataFormats/HeterogeneousSoA.h new file mode 100644 index 000000000..cfaad449c --- /dev/null +++ b/src/cudacompat/CUDADataFormats/HeterogeneousSoA.h @@ -0,0 +1,189 @@ +#ifndef CUDADataFormatsCommonHeterogeneousSoA_H +#define CUDADataFormatsCommonHeterogeneousSoA_H + +#include + +#include "CUDACore/copyAsync.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" + +// a heterogeneous unique pointer... +template +class HeterogeneousSoA { +public: + using Product = T; + + HeterogeneousSoA() = default; // make root happy + ~HeterogeneousSoA() = default; + HeterogeneousSoA(HeterogeneousSoA &&) = default; + HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default; + + explicit HeterogeneousSoA(cms::cuda::device::unique_ptr &&p) : dm_ptr(std::move(p)) {} + explicit HeterogeneousSoA(cms::cuda::host::unique_ptr &&p) : hm_ptr(std::move(p)) {} + explicit HeterogeneousSoA(std::unique_ptr &&p) : std_ptr(std::move(p)) {} + + auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } + + auto const &operator*() const { return *get(); } + + auto const *operator->() const { return get(); } + + auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } + + auto &operator*() { return *get(); } + + auto *operator->() { return get(); } + + // in reality valid only for GPU version... + cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const { + assert(dm_ptr); + auto ret = cms::cuda::make_host_unique(stream); + cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream)); + return ret; + } + +private: + // a union wan't do it, a variant will not be more efficienct + cms::cuda::device::unique_ptr dm_ptr; //! 
+ cms::cuda::host::unique_ptr hm_ptr; //! + std::unique_ptr std_ptr; //! +}; + +namespace cms { + namespace cudacompat { + + struct GPUTraits { + template + using unique_ptr = cms::cuda::device::unique_ptr; + + template + static auto make_unique(cudaStream_t stream) { + return cms::cuda::make_device_unique(stream); + } + + template + static auto make_unique(size_t size, cudaStream_t stream) { + return cms::cuda::make_device_unique(size, stream); + } + + template + static auto make_host_unique(cudaStream_t stream) { + return cms::cuda::make_host_unique(stream); + } + + template + static auto make_device_unique(cudaStream_t stream) { + return cms::cuda::make_device_unique(stream); + } + + template + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cms::cuda::make_device_unique(size, stream); + } + }; + + struct HostTraits { + template + using unique_ptr = cms::cuda::host::unique_ptr; + + template + static auto make_unique(cudaStream_t stream) { + return cms::cuda::make_host_unique(stream); + } + + template + static auto make_host_unique(cudaStream_t stream) { + return cms::cuda::make_host_unique(stream); + } + + template + static auto make_device_unique(cudaStream_t stream) { + return cms::cuda::make_device_unique(stream); + } + + template + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cms::cuda::make_device_unique(size, stream); + } + }; + + struct CPUTraits { + template + using unique_ptr = std::unique_ptr; + + template + static auto make_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_unique(size_t size, cudaStream_t) { + return std::make_unique(size); + } + + template + static auto make_host_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_device_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_device_unique(size_t size, cudaStream_t) { + return std::make_unique(size); + } + }; + + } // namespace cudacompat +} // namespace cms + +// a heterogeneous unique pointer (of a different sort) ... +template +class HeterogeneousSoAImpl { +public: + template + using unique_ptr = typename Traits::template unique_ptr; + + HeterogeneousSoAImpl() = default; // make root happy + ~HeterogeneousSoAImpl() = default; + HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default; + HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default; + + explicit HeterogeneousSoAImpl(unique_ptr &&p) : m_ptr(std::move(p)) {} + explicit HeterogeneousSoAImpl(cudaStream_t stream); + + T const *get() const { return m_ptr.get(); } + + T *get() { return m_ptr.get(); } + + cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const; + +private: + unique_ptr m_ptr; //! +}; + +template +HeterogeneousSoAImpl::HeterogeneousSoAImpl(cudaStream_t stream) { + m_ptr = Traits::template make_unique(stream); +} + +// in reality valid only for GPU version... 
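+// (for the HostTraits/CPUTraits instantiations the payload already lives in host
+// memory, so the copy below is redundant; only the GPUTraits flavour genuinely
+// needs it, which is presumably what the remark above refers to)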
+template +cms::cuda::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(stream); + cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream)); + return ret; +} + +template +using HeterogeneousSoAGPU = HeterogeneousSoAImpl; +template +using HeterogeneousSoACPU = HeterogeneousSoAImpl; +template +using HeterogeneousSoAHost = HeterogeneousSoAImpl; + +#endif diff --git a/src/cudacompat/CUDADataFormats/PixelTrackHeterogeneous.h b/src/cudacompat/CUDADataFormats/PixelTrackHeterogeneous.h new file mode 100644 index 000000000..579c67092 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/PixelTrackHeterogeneous.h @@ -0,0 +1,74 @@ +#ifndef CUDADataFormatsTrackTrackHeterogeneous_H +#define CUDADataFormatsTrackTrackHeterogeneous_H + +#include "CUDADataFormats/TrajectoryStateSoA.h" +#include "CUDACore/HistoContainer.h" + +#include "CUDADataFormats/HeterogeneousSoA.h" + +namespace trackQuality { + enum Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; +} + +template +class TrackSoAT { +public: + static constexpr int32_t stride() { return S; } + + using Quality = trackQuality::Quality; + using hindex_type = uint16_t; + using HitContainer = cms::cuda::OneToManyAssoc; + + // Always check quality is at least loose! + // CUDA does not support enums in __lgc ... + eigenSoA::ScalarSoA m_quality; + constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i)); } + constexpr Quality &quality(int32_t i) { return (Quality &)(m_quality(i)); } + constexpr Quality const *qualityData() const { return (Quality const *)(m_quality.data()); } + constexpr Quality *qualityData() { return (Quality *)(m_quality.data()); } + + // this is chi2/ndof as not necessarely all hits are used in the fit + eigenSoA::ScalarSoA chi2; + + constexpr int nHits(int i) const { return detIndices.size(i); } + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + TrajectoryStateSoA stateAtBS; + eigenSoA::ScalarSoA eta; + eigenSoA::ScalarSoA pt; + constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } + constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } + constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } + constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } + + // state at the detector of the outermost hit + // representation to be decided... 
+ // not yet filled on GPU + // TrajectoryStateSoA stateAtOuterDet; + + HitContainer hitIndices; + HitContainer detIndices; + + // total number of tracks (including those not fitted) + uint32_t m_nTracks; +}; + +namespace pixelTrack { + +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + constexpr uint32_t maxNumber() { return 32 * 1024; } +#endif + + using TrackSoA = TrackSoAT; + using TrajectoryState = TrajectoryStateSoA; + using HitContainer = TrackSoA::HitContainer; + using Quality = trackQuality::Quality; + +} // namespace pixelTrack + +using PixelTrackHeterogeneous = HeterogeneousSoA; + +#endif // CUDADataFormatsTrackTrackSoA_H diff --git a/src/cudacompat/CUDADataFormats/SiPixelClustersCUDA.cc b/src/cudacompat/CUDADataFormats/SiPixelClustersCUDA.cc new file mode 100644 index 000000000..fd46a81bf --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelClustersCUDA.cc @@ -0,0 +1,21 @@ +#include "CUDADataFormats/SiPixelClustersCUDA.h" + +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" +#include "CUDACore/copyAsync.h" + +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream) { + moduleStart_d = cms::cuda::make_device_unique(maxClusters + 1, stream); + clusInModule_d = cms::cuda::make_device_unique(maxClusters, stream); + moduleId_d = cms::cuda::make_device_unique(maxClusters, stream); + clusModuleStart_d = cms::cuda::make_device_unique(maxClusters + 1, stream); + + auto view = cms::cuda::make_host_unique(stream); + view->moduleStart_ = moduleStart_d.get(); + view->clusInModule_ = clusInModule_d.get(); + view->moduleId_ = moduleId_d.get(); + view->clusModuleStart_ = clusModuleStart_d.get(); + + view_d = cms::cuda::make_device_unique(stream); + cms::cuda::copyAsync(view_d, view, stream); +} diff --git a/src/cudacompat/CUDADataFormats/SiPixelClustersCUDA.h b/src/cudacompat/CUDADataFormats/SiPixelClustersCUDA.h new file mode 100644 index 000000000..e41b8ea5c --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelClustersCUDA.h @@ -0,0 +1,73 @@ +#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h +#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h + +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" +#include "CUDACore/cudaCompat.h" + +#include + +class SiPixelClustersCUDA { +public: + SiPixelClustersCUDA() = default; + explicit SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream); + ~SiPixelClustersCUDA() = default; + + SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete; + SiPixelClustersCUDA &operator=(const SiPixelClustersCUDA &) = delete; + SiPixelClustersCUDA(SiPixelClustersCUDA &&) = default; + SiPixelClustersCUDA &operator=(SiPixelClustersCUDA &&) = default; + + void setNClusters(uint32_t nClusters) { nClusters_h = nClusters; } + + uint32_t nClusters() const { return nClusters_h; } + + uint32_t *moduleStart() { return moduleStart_d.get(); } + uint32_t *clusInModule() { return clusInModule_d.get(); } + uint32_t *moduleId() { return moduleId_d.get(); } + uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } + + uint32_t const *moduleStart() const { return moduleStart_d.get(); } + uint32_t const *clusInModule() const { return clusInModule_d.get(); } + uint32_t const *moduleId() const { return moduleId_d.get(); } + uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } + + uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } + uint32_t const *c_clusInModule() 
const { return clusInModule_d.get(); } + uint32_t const *c_moduleId() const { return moduleId_d.get(); } + uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } + + class DeviceConstView { + public: + // DeviceConstView() = default; + + __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_ + i); } + __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_ + i); } + __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_ + i); } + __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_ + i); } + + friend SiPixelClustersCUDA; + + // private: + uint32_t const *moduleStart_; + uint32_t const *clusInModule_; + uint32_t const *moduleId_; + uint32_t const *clusModuleStart_; + }; + + DeviceConstView *view() const { return view_d.get(); } + +private: + cms::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module + cms::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module + cms::cuda::device::unique_ptr moduleId_d; // module id of each module + + // originally from rechits + cms::cuda::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module + + cms::cuda::device::unique_ptr view_d; // "me" pointer + + uint32_t nClusters_h; +}; + +#endif diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsCUDA.cc b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsCUDA.cc new file mode 100644 index 000000000..b19664874 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsCUDA.cc @@ -0,0 +1,42 @@ +#include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" + +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" +#include "CUDACore/copyAsync.h" +#include "CUDACore/memsetAsync.h" + +#include + +SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream) + : formatterErrors_h(std::move(errors)) { + error_d = cms::cuda::make_device_unique>(stream); + data_d = cms::cuda::make_device_unique(maxFedWords, stream); + + cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream); + + error_h = cms::cuda::make_host_unique>(stream); + cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); + assert(error_h->empty()); + assert(error_h->capacity() == static_cast(maxFedWords)); + + cms::cuda::copyAsync(error_d, error_h, stream); +} + +void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) { + cms::cuda::copyAsync(error_h, error_d, stream); +} + +SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const { + // On one hand size() could be sufficient. On the other hand, if + // someone copies the SimpleVector<>, (s)he might expect the data + // buffer to actually have space for capacity() elements. 
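+  // Hence the host buffer below is sized with capacity() rather than size().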
+ auto data = cms::cuda::make_host_unique(error_h->capacity(), stream); + + // but transfer only the required amount + if (not error_h->empty()) { + cms::cuda::copyAsync(data, data_d, error_h->size(), stream); + } + auto err = *error_h; + err.set_data(data.get()); + return HostDataError(err, std::move(data)); +} diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsCUDA.h b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsCUDA.h new file mode 100644 index 000000000..9c7c874ee --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsCUDA.h @@ -0,0 +1,41 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h + +#include + +#include "CUDACore/SimpleVector.h" +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" +#include "DataFormats/PixelErrors.h" + +class SiPixelDigiErrorsCUDA { +public: + SiPixelDigiErrorsCUDA() = default; + explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream); + ~SiPixelDigiErrorsCUDA() = default; + + SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA& operator=(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA(SiPixelDigiErrorsCUDA&&) = default; + SiPixelDigiErrorsCUDA& operator=(SiPixelDigiErrorsCUDA&&) = default; + + const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } + + cms::cuda::SimpleVector* error() { return error_d.get(); } + cms::cuda::SimpleVector const* error() const { return error_d.get(); } + cms::cuda::SimpleVector const* c_error() const { return error_d.get(); } + + using HostDataError = + std::pair, cms::cuda::host::unique_ptr>; + HostDataError dataErrorToHostAsync(cudaStream_t stream) const; + + void copyErrorToHostAsync(cudaStream_t stream); + +private: + cms::cuda::device::unique_ptr data_d; + cms::cuda::device::unique_ptr> error_d; + cms::cuda::host::unique_ptr> error_h; + PixelFormatterErrors formatterErrors_h; +}; + +#endif diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigisCUDA.cc b/src/cudacompat/CUDADataFormats/SiPixelDigisCUDA.cc new file mode 100644 index 000000000..5f096ab18 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigisCUDA.cc @@ -0,0 +1,50 @@ +#include "CUDADataFormats/SiPixelDigisCUDA.h" + +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" +#include "CUDACore/copyAsync.h" + +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) { + xx_d = cms::cuda::make_device_unique(maxFedWords, stream); + yy_d = cms::cuda::make_device_unique(maxFedWords, stream); + adc_d = cms::cuda::make_device_unique(maxFedWords, stream); + moduleInd_d = cms::cuda::make_device_unique(maxFedWords, stream); + clus_d = cms::cuda::make_device_unique(maxFedWords, stream); + + pdigi_d = cms::cuda::make_device_unique(maxFedWords, stream); + rawIdArr_d = cms::cuda::make_device_unique(maxFedWords, stream); + + auto view = cms::cuda::make_host_unique(stream); + view->xx_ = xx_d.get(); + view->yy_ = yy_d.get(); + view->adc_ = adc_d.get(); + view->moduleInd_ = moduleInd_d.get(); + view->clus_ = clus_d.get(); + + view_d = cms::cuda::make_device_unique(stream); + cms::cuda::copyAsync(view_d, view, stream); +} + +cms::cuda::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(nDigis(), stream); + cms::cuda::copyAsync(ret, adc_d, nDigis(), stream); + return ret; +} + 
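+// Usage sketch (hypothetical caller code): each *ToHostAsync() helper only enqueues
+// an asynchronous copy on `stream`, so the returned buffer may be read only after
+// the stream has been synchronized (or an event recorded on it has completed), e.g.
+//   auto adc = digis.adcToHostAsync(stream);
+//   cudaCheck(cudaStreamSynchronize(stream));
+//   // adc.get()[0 .. digis.nDigis() - 1] is now valid on the host
+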
+cms::cuda::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(nDigis(), stream); + cms::cuda::copyAsync(ret, clus_d, nDigis(), stream); + return ret; +} + +cms::cuda::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(nDigis(), stream); + cms::cuda::copyAsync(ret, pdigi_d, nDigis(), stream); + return ret; +} + +cms::cuda::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(nDigis(), stream); + cms::cuda::copyAsync(ret, rawIdArr_d, nDigis(), stream); + return ret; +} diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigisCUDA.h b/src/cudacompat/CUDADataFormats/SiPixelDigisCUDA.h new file mode 100644 index 000000000..647f5b42e --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigisCUDA.h @@ -0,0 +1,98 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h + +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" +#include "CUDACore/cudaCompat.h" + +#include + +class SiPixelDigisCUDA { +public: + SiPixelDigisCUDA() = default; + explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream); + ~SiPixelDigisCUDA() = default; + + SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete; + SiPixelDigisCUDA &operator=(const SiPixelDigisCUDA &) = delete; + SiPixelDigisCUDA(SiPixelDigisCUDA &&) = default; + SiPixelDigisCUDA &operator=(SiPixelDigisCUDA &&) = default; + + void setNModulesDigis(uint32_t nModules, uint32_t nDigis) { + nModules_h = nModules; + nDigis_h = nDigis; + } + + uint32_t nModules() const { return nModules_h; } + uint32_t nDigis() const { return nDigis_h; } + + uint16_t *xx() { return xx_d.get(); } + uint16_t *yy() { return yy_d.get(); } + uint16_t *adc() { return adc_d.get(); } + uint16_t *moduleInd() { return moduleInd_d.get(); } + int32_t *clus() { return clus_d.get(); } + uint32_t *pdigi() { return pdigi_d.get(); } + uint32_t *rawIdArr() { return rawIdArr_d.get(); } + + uint16_t const *xx() const { return xx_d.get(); } + uint16_t const *yy() const { return yy_d.get(); } + uint16_t const *adc() const { return adc_d.get(); } + uint16_t const *moduleInd() const { return moduleInd_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *pdigi() const { return pdigi_d.get(); } + uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } + + uint16_t const *c_xx() const { return xx_d.get(); } + uint16_t const *c_yy() const { return yy_d.get(); } + uint16_t const *c_adc() const { return adc_d.get(); } + uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_pdigi() const { return pdigi_d.get(); } + uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } + + cms::cuda::host::unique_ptr adcToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr clusToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr pdigiToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr rawIdArrToHostAsync(cudaStream_t stream) const; + + class DeviceConstView { + public: + // DeviceConstView() = default; + + __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_ + i); } + __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_ + i); } + __device__ __forceinline__ uint16_t adc(int i) const { 
return __ldg(adc_ + i); } + __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_ + i); } + __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_ + i); } + + friend class SiPixelDigisCUDA; + + // private: + uint16_t const *xx_; + uint16_t const *yy_; + uint16_t const *adc_; + uint16_t const *moduleInd_; + int32_t const *clus_; + }; + + const DeviceConstView *view() const { return view_d.get(); } + +private: + // These are consumed by downstream device code + cms::cuda::device::unique_ptr xx_d; // local coordinates of each pixel + cms::cuda::device::unique_ptr yy_d; // + cms::cuda::device::unique_ptr adc_d; // ADC of each pixel + cms::cuda::device::unique_ptr moduleInd_d; // module id of each pixel + cms::cuda::device::unique_ptr clus_d; // cluster id of each pixel + cms::cuda::device::unique_ptr view_d; // "me" pointer + + // These are for CPU output; should we (eventually) place them to a + // separate product? + cms::cuda::device::unique_ptr pdigi_d; + cms::cuda::device::unique_ptr rawIdArr_d; + + uint32_t nModules_h = 0; + uint32_t nDigis_h = 0; +}; + +#endif diff --git a/src/cudacompat/CUDADataFormats/TrackingRecHit2DCUDA.cc b/src/cudacompat/CUDADataFormats/TrackingRecHit2DCUDA.cc new file mode 100644 index 000000000..81b5e5571 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/TrackingRecHit2DCUDA.cc @@ -0,0 +1,43 @@ +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDACore/copyAsync.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" + +template <> +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); + cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream); + return ret; +} + +template <> +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(2001, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream)); + return ret; +} + +template <> +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::globalCoordToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); + cudaCheck(cudaMemcpyAsync( + ret.get(), m_store32.get() + 4 * nHits(), 4 * nHits() * sizeof(float), cudaMemcpyDefault, stream)); + return ret; +} + +template <> +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::chargeToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(nHits(), stream); + cudaCheck( + cudaMemcpyAsync(ret.get(), m_store32.get() + 8 * nHits(), nHits() * sizeof(int32_t), cudaMemcpyDefault, stream)); + return ret; +} + +template <> +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::sizeToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(2 * nHits(), stream); + cudaCheck(cudaMemcpyAsync( + ret.get(), m_store16.get() + 2 * nHits(), 2 * nHits() * sizeof(int16_t), cudaMemcpyDefault, stream)); + return ret; +} diff --git a/src/cudacompat/CUDADataFormats/TrackingRecHit2DCUDA.h b/src/cudacompat/CUDADataFormats/TrackingRecHit2DCUDA.h new file mode 100644 index 000000000..54b74d97b --- /dev/null +++ b/src/cudacompat/CUDADataFormats/TrackingRecHit2DCUDA.h @@ -0,0 +1 @@ +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" diff --git a/src/cudacompat/CUDADataFormats/TrackingRecHit2DHeterogeneous.h 
b/src/cudacompat/CUDADataFormats/TrackingRecHit2DHeterogeneous.h new file mode 100644 index 000000000..2320fa6d6 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/TrackingRecHit2DHeterogeneous.h @@ -0,0 +1,155 @@ +#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h +#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h + +#include "CUDADataFormats/TrackingRecHit2DSOAView.h" +#include "CUDADataFormats/HeterogeneousSoA.h" + +template +class TrackingRecHit2DHeterogeneous { +public: + template + using unique_ptr = typename Traits::template unique_ptr; + + using Hist = TrackingRecHit2DSOAView::Hist; + + TrackingRecHit2DHeterogeneous() = default; + + explicit TrackingRecHit2DHeterogeneous(uint32_t nHits, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream); + + ~TrackingRecHit2DHeterogeneous() = default; + + TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete; + TrackingRecHit2DHeterogeneous& operator=(const TrackingRecHit2DHeterogeneous&) = delete; + TrackingRecHit2DHeterogeneous(TrackingRecHit2DHeterogeneous&&) = default; + TrackingRecHit2DHeterogeneous& operator=(TrackingRecHit2DHeterogeneous&&) = default; + + TrackingRecHit2DSOAView* view() { return m_view.get(); } + TrackingRecHit2DSOAView const* view() const { return m_view.get(); } + + auto nHits() const { return m_nHits; } + + auto hitsModuleStart() const { return m_hitsModuleStart; } + auto hitsLayerStart() { return m_hitsLayerStart; } + auto phiBinner() { return m_hist; } + auto iphi() { return m_iphi; } + + // only the local coord and detector index + cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr detIndexToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; + + // for validation + cms::cuda::host::unique_ptr globalCoordToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr chargeToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr sizeToHostAsync(cudaStream_t stream) const; + +private: + static constexpr uint32_t n16 = 4; + static constexpr uint32_t n32 = 9; + static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious + + unique_ptr m_store16; //! + unique_ptr m_store32; //! + + unique_ptr m_HistStore; //! + unique_ptr m_AverageGeometryStore; //! + + unique_ptr m_view; //! + + uint32_t m_nHits; + + uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! + + // needed as kernel params... 
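+  // (copies of pointers that also live inside the device-side view, kept in the
+  // host-side class so they can be passed by value as kernel arguments)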
+ Hist* m_hist; + uint32_t* m_hitsLayerStart; + int16_t* m_iphi; +}; + +#include "CUDACore/copyAsync.h" +#include "CUDACore/cudaCheck.h" + +template +TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nHits, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) { + auto view = Traits::template make_host_unique(stream); + + view->m_nHits = nHits; + m_view = Traits::template make_device_unique(stream); + m_AverageGeometryStore = Traits::template make_device_unique(stream); + view->m_averageGeometry = m_AverageGeometryStore.get(); + view->m_cpeParams = cpeParams; + view->m_hitsModuleStart = hitsModuleStart; + + // if empy do not bother + if (0 == nHits) { + if +#ifndef __CUDACC__ + constexpr +#endif + (std::is_same::value) { + cms::cuda::copyAsync(m_view, view, stream); + } else { + m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version + } + return; + } + + // the single arrays are not 128 bit alligned... + // the hits are actually accessed in order only in building + // if ordering is relevant they may have to be stored phi-ordered by layer or so + // this will break 1to1 correspondence with cluster and module locality + // so unless proven VERY inefficient we keep it ordered as generated + m_store16 = Traits::template make_device_unique(nHits * n16, stream); + m_store32 = Traits::template make_device_unique(nHits * n32 + 11, stream); + m_HistStore = Traits::template make_device_unique(stream); + + auto get16 = [&](int i) { return m_store16.get() + i * nHits; }; + auto get32 = [&](int i) { return m_store32.get() + i * nHits; }; + + // copy all the pointers + m_hist = view->m_hist = m_HistStore.get(); + + view->m_xl = get32(0); + view->m_yl = get32(1); + view->m_xerr = get32(2); + view->m_yerr = get32(3); + + view->m_xg = get32(4); + view->m_yg = get32(5); + view->m_zg = get32(6); + view->m_rg = get32(7); + + m_iphi = view->m_iphi = reinterpret_cast(get16(0)); + + view->m_charge = reinterpret_cast(get32(8)); + view->m_xsize = reinterpret_cast(get16(2)); + view->m_ysize = reinterpret_cast(get16(3)); + view->m_detInd = get16(1); + + m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(n32)); + + // transfer view + if +#ifndef __CUDACC__ + constexpr +#endif + (std::is_same::value) { + cms::cuda::copyAsync(m_view, view, stream); + } else { + m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version + } +} + +using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; + +#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h diff --git a/src/cudacompat/CUDADataFormats/TrackingRecHit2DSOAView.h b/src/cudacompat/CUDADataFormats/TrackingRecHit2DSOAView.h new file mode 100644 index 000000000..faaa4378c --- /dev/null +++ b/src/cudacompat/CUDADataFormats/TrackingRecHit2DSOAView.h @@ -0,0 +1,101 @@ +#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h +#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h + +#include + +#include "CUDADataFormats/gpuClusteringConstants.h" +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cudaCompat.h" +#include "Geometry/phase1PixelTopology.h" + +namespace pixelCPEforGPU { + struct ParamsOnGPU; +} + +class TrackingRecHit2DSOAView { +public: + 
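+  // Struct-of-arrays view over the hit store: one device pointer per column, filled
+  // in by TrackingRecHit2DHeterogeneous. The const accessors below go through
+  // __ldg(), presumably to route these read-only loads through the read-only data
+  // cache.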
static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; } + using hindex_type = uint16_t; // if above is <=2^16 + + using Hist = + cms::cuda::HistoContainer; + + using AverageGeometry = phase1PixelTopology::AverageGeometry; + + template + friend class TrackingRecHit2DHeterogeneous; + + __device__ __forceinline__ uint32_t nHits() const { return m_nHits; } + + __device__ __forceinline__ float& xLocal(int i) { return m_xl[i]; } + __device__ __forceinline__ float xLocal(int i) const { return __ldg(m_xl + i); } + __device__ __forceinline__ float& yLocal(int i) { return m_yl[i]; } + __device__ __forceinline__ float yLocal(int i) const { return __ldg(m_yl + i); } + + __device__ __forceinline__ float& xerrLocal(int i) { return m_xerr[i]; } + __device__ __forceinline__ float xerrLocal(int i) const { return __ldg(m_xerr + i); } + __device__ __forceinline__ float& yerrLocal(int i) { return m_yerr[i]; } + __device__ __forceinline__ float yerrLocal(int i) const { return __ldg(m_yerr + i); } + + __device__ __forceinline__ float& xGlobal(int i) { return m_xg[i]; } + __device__ __forceinline__ float xGlobal(int i) const { return __ldg(m_xg + i); } + __device__ __forceinline__ float& yGlobal(int i) { return m_yg[i]; } + __device__ __forceinline__ float yGlobal(int i) const { return __ldg(m_yg + i); } + __device__ __forceinline__ float& zGlobal(int i) { return m_zg[i]; } + __device__ __forceinline__ float zGlobal(int i) const { return __ldg(m_zg + i); } + __device__ __forceinline__ float& rGlobal(int i) { return m_rg[i]; } + __device__ __forceinline__ float rGlobal(int i) const { return __ldg(m_rg + i); } + + __device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; } + __device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); } + + __device__ __forceinline__ int32_t& charge(int i) { return m_charge[i]; } + __device__ __forceinline__ int32_t charge(int i) const { return __ldg(m_charge + i); } + __device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; } + __device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); } + __device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; } + __device__ __forceinline__ int16_t clusterSizeY(int i) const { return __ldg(m_ysize + i); } + __device__ __forceinline__ uint16_t& detectorIndex(int i) { return m_detInd[i]; } + __device__ __forceinline__ uint16_t detectorIndex(int i) const { return __ldg(m_detInd + i); } + + __device__ __forceinline__ pixelCPEforGPU::ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } + + __device__ __forceinline__ uint32_t hitsModuleStart(int i) const { return __ldg(m_hitsModuleStart + i); } + + __device__ __forceinline__ uint32_t* hitsLayerStart() { return m_hitsLayerStart; } + __device__ __forceinline__ uint32_t const* hitsLayerStart() const { return m_hitsLayerStart; } + + __device__ __forceinline__ Hist& phiBinner() { return *m_hist; } + __device__ __forceinline__ Hist const& phiBinner() const { return *m_hist; } + + __device__ __forceinline__ AverageGeometry& averageGeometry() { return *m_averageGeometry; } + __device__ __forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } + +private: + // local coord + float *m_xl, *m_yl; + float *m_xerr, *m_yerr; + + // global coord + float *m_xg, *m_yg, *m_zg, *m_rg; + int16_t* m_iphi; + + // cluster properties + int32_t* m_charge; + int16_t* m_xsize; + int16_t* m_ysize; + uint16_t* m_detInd; + + // supporting objects + AverageGeometry* 
m_averageGeometry; // owned (corrected for beam spot: not sure where to host it otherwise) + pixelCPEforGPU::ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned + uint32_t const* m_hitsModuleStart; // forwarded from clusters + + uint32_t* m_hitsLayerStart; + + Hist* m_hist; + + uint32_t m_nHits; +}; + +#endif diff --git a/src/cudacompat/CUDADataFormats/TrajectoryStateSoA.h b/src/cudacompat/CUDADataFormats/TrajectoryStateSoA.h new file mode 100644 index 000000000..49ca2b525 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/TrajectoryStateSoA.h @@ -0,0 +1,59 @@ +#ifndef CUDADataFormatsTrackTrajectoryStateSOA_H +#define CUDADataFormatsTrackTrajectoryStateSOA_H + +#include +#include "CUDACore/eigenSoA.h" + +template +struct TrajectoryStateSoA { + using Vector5f = Eigen::Matrix; + using Vector15f = Eigen::Matrix; + + using Vector5d = Eigen::Matrix; + using Matrix5d = Eigen::Matrix; + + static constexpr int32_t stride() { return S; } + + eigenSoA::MatrixSoA state; + eigenSoA::MatrixSoA covariance; + + template + __host__ __device__ inline void copyFromCircle( + V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { + state(i) << cp.template cast(), lp.template cast(); + state(i)(2) *= b; + auto cov = covariance(i); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { + state(i) = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + covariance(i)(ind++) = cov(j, k); + } + + template + __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { + v = state(i).template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = covariance(i)(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = covariance(i)(ind++); + } + } +}; + +#endif // CUDADataFormatsTrackTrajectoryStateSOA_H diff --git a/src/cudacompat/CUDADataFormats/ZVertexHeterogeneous.h b/src/cudacompat/CUDADataFormats/ZVertexHeterogeneous.h new file mode 100644 index 000000000..67fe6e398 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/ZVertexHeterogeneous.h @@ -0,0 +1,14 @@ +#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H +#define CUDADataFormatsVertexZVertexHeterogeneous_H + +#include "CUDADataFormats/ZVertexSoA.h" +#include "CUDADataFormats/HeterogeneousSoA.h" +#include "CUDADataFormats/PixelTrackHeterogeneous.h" + +using ZVertexHeterogeneous = HeterogeneousSoA; +#ifndef __CUDACC__ +#include "CUDACore/Product.h" +using ZVertexCUDAProduct = cms::cuda::Product; +#endif + +#endif diff --git a/src/cudacompat/CUDADataFormats/ZVertexSoA.h b/src/cudacompat/CUDADataFormats/ZVertexSoA.h new file mode 100644 index 000000000..ecdf76d8e --- /dev/null +++ b/src/cudacompat/CUDADataFormats/ZVertexSoA.h @@ -0,0 +1,26 @@ +#ifndef CUDADataFormatsVertexZVertexSoA_H +#define CUDADataFormatsVertexZVertexSoA_H + +#include +#include "CUDACore/cudaCompat.h" + +// SOA for vertices +// These vertices are clusterized and fitted only along the beam line (z) +// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well) +struct ZVertexSoA { + static constexpr uint32_t MAXTRACKS = 32 * 1024; + static constexpr 
uint32_t MAXVTX = 1024; + + int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) + float zv[MAXVTX]; // output z-posistion of found vertices + float wv[MAXVTX]; // output weight (1/error^2) on the above + float chi2[MAXVTX]; // vertices chi2 + float ptv2[MAXVTX]; // vertices pt^2 + int32_t ndof[MAXTRACKS]; // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME) + uint16_t sortInd[MAXVTX]; // sorted index (by pt2) ascending + uint32_t nvFinal; // the number of vertices + + __host__ __device__ void init() { nvFinal = 0; } +}; + +#endif // CUDADataFormatsVertexZVertexSoA.H diff --git a/src/cudacompat/CUDADataFormats/gpuClusteringConstants.h b/src/cudacompat/CUDADataFormats/gpuClusteringConstants.h new file mode 100644 index 000000000..1430606ab --- /dev/null +++ b/src/cudacompat/CUDADataFormats/gpuClusteringConstants.h @@ -0,0 +1,32 @@ +#ifndef CUDADataFormats_SiPixelCluster_interface_gpuClusteringConstants_h +#define CUDADataFormats_SiPixelCluster_interface_gpuClusteringConstants_h + +#include + +namespace pixelGPUConstants { +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumberOfHits = 24 * 1024; +#else + constexpr uint32_t maxNumberOfHits = + 48 * 1024; // data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away +#endif +} // namespace pixelGPUConstants + +namespace gpuClustering { +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxHitsInIter() { return 64; } +#else + // optimized for real data PU 50 + constexpr uint32_t maxHitsInIter() { return 160; } +#endif + constexpr uint32_t maxHitsInModule() { return 1024; } + + constexpr uint32_t MaxNumModules = 2000; + constexpr int32_t MaxNumClustersPerModules = maxHitsInModule(); + constexpr uint32_t MaxHitsInModule = maxHitsInModule(); // as above + constexpr uint32_t MaxNumClusters = pixelGPUConstants::maxNumberOfHits; + constexpr uint16_t InvId = 9999; // must be > MaxNumModules + +} // namespace gpuClustering + +#endif // CUDADataFormats_SiPixelCluster_interface_gpuClusteringConstants_h diff --git a/src/cudacompat/CondFormats/PixelCPEFast.cc b/src/cudacompat/CondFormats/PixelCPEFast.cc new file mode 100644 index 000000000..dd79bd389 --- /dev/null +++ b/src/cudacompat/CondFormats/PixelCPEFast.cc @@ -0,0 +1,85 @@ +#include +#include + +#include +#include + +#include "Geometry/phase1PixelTopology.h" +#include "CUDACore/cudaCheck.h" +#include "CondFormats/PixelCPEFast.h" + +// Services +// this is needed to get errors from templates + +namespace { + constexpr float micronsToCm = 1.0e-4; +} + +//----------------------------------------------------------------------------- +//! The constructor. 
+//----------------------------------------------------------------------------- +PixelCPEFast::PixelCPEFast(std::string const &path) { + { + std::ifstream in(path, std::ios::binary); + in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + in.read(reinterpret_cast(&m_commonParamsGPU), sizeof(pixelCPEforGPU::CommonParams)); + unsigned int ndetParams; + in.read(reinterpret_cast(&ndetParams), sizeof(unsigned int)); + m_detParamsGPU.resize(ndetParams); + in.read(reinterpret_cast(m_detParamsGPU.data()), ndetParams * sizeof(pixelCPEforGPU::DetParams)); + in.read(reinterpret_cast(&m_averageGeometry), sizeof(pixelCPEforGPU::AverageGeometry)); + in.read(reinterpret_cast(&m_layerGeometry), sizeof(pixelCPEforGPU::LayerGeometry)); + } + + cpuData_ = { + &m_commonParamsGPU, + m_detParamsGPU.data(), + &m_layerGeometry, + &m_averageGeometry, + }; +} + +const pixelCPEforGPU::ParamsOnGPU *PixelCPEFast::getGPUProductAsync(cudaStream_t cudaStream) const { + const auto &data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData &data, cudaStream_t stream) { + // and now copy to device... + cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_commonParams, sizeof(pixelCPEforGPU::CommonParams))); + cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_detParams, + this->m_detParamsGPU.size() * sizeof(pixelCPEforGPU::DetParams))); + cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry))); + cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry))); + cudaCheck(cudaMalloc((void **)&data.d_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU))); + + cudaCheck(cudaMemcpyAsync( + data.d_paramsOnGPU, &data.h_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_commonParams, + &this->m_commonParamsGPU, + sizeof(pixelCPEforGPU::CommonParams), + cudaMemcpyDefault, + stream)); + cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_averageGeometry, + &this->m_averageGeometry, + sizeof(pixelCPEforGPU::AverageGeometry), + cudaMemcpyDefault, + stream)); + cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_layerGeometry, + &this->m_layerGeometry, + sizeof(pixelCPEforGPU::LayerGeometry), + cudaMemcpyDefault, + stream)); + cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_detParams, + this->m_detParamsGPU.data(), + this->m_detParamsGPU.size() * sizeof(pixelCPEforGPU::DetParams), + cudaMemcpyDefault, + stream)); + }); + return data.d_paramsOnGPU; +} + +PixelCPEFast::GPUData::~GPUData() { + if (d_paramsOnGPU != nullptr) { + cudaFree((void *)h_paramsOnGPU.m_commonParams); + cudaFree((void *)h_paramsOnGPU.m_detParams); + cudaFree((void *)h_paramsOnGPU.m_averageGeometry); + cudaFree(d_paramsOnGPU); + } +} diff --git a/src/cudacompat/CondFormats/PixelCPEFast.h b/src/cudacompat/CondFormats/PixelCPEFast.h new file mode 100644 index 000000000..eb0f21c28 --- /dev/null +++ b/src/cudacompat/CondFormats/PixelCPEFast.h @@ -0,0 +1,43 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_PixelCPEFast_h +#define RecoLocalTracker_SiPixelRecHits_PixelCPEFast_h + +#include + +#include "CUDACore/ESProduct.h" +#include "CUDACore/HostAllocator.h" +#include "CondFormats/pixelCPEforGPU.h" + +class PixelCPEFast { +public: + PixelCPEFast(std::string const &path); + + ~PixelCPEFast() = default; + + // The return value can only be used safely in kernels launched on + // the same cudaStream, or after cudaStreamSynchronize. 
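+  // A sketch of the intended call pattern (hypothetical names and launch config):
+  //   auto const *params = cpeFast.getGPUProductAsync(stream);
+  //   someKernel<<<blocks, threads, 0, stream>>>(params, ...);
+  // Launching on the same stream guarantees the asynchronous copies complete first.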
+ const pixelCPEforGPU::ParamsOnGPU *getGPUProductAsync(cudaStream_t cudaStream) const; + + pixelCPEforGPU::ParamsOnGPU const &getCPUProduct() const { return cpuData_; } + +private: + // allocate it with posix malloc to be ocmpatible with cpu wf + std::vector m_detParamsGPU; + // std::vector> m_detParamsGPU; + pixelCPEforGPU::CommonParams m_commonParamsGPU; + pixelCPEforGPU::LayerGeometry m_layerGeometry; + pixelCPEforGPU::AverageGeometry m_averageGeometry; + + pixelCPEforGPU::ParamsOnGPU cpuData_; + + struct GPUData { + ~GPUData(); + // not needed if not used on CPU... + pixelCPEforGPU::ParamsOnGPU h_paramsOnGPU; + pixelCPEforGPU::ParamsOnGPU *d_paramsOnGPU = nullptr; // copy of the above on the Device + }; + cms::cuda::ESProduct gpuData_; + + void fillParamsForGpu(); +}; + +#endif // RecoLocalTracker_SiPixelRecHits_PixelCPEFast_h diff --git a/src/cudacompat/CondFormats/SiPixelFedCablingMapGPU.h b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPU.h new file mode 100644 index 000000000..900307ae0 --- /dev/null +++ b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPU.h @@ -0,0 +1,26 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPU_h +#define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPU_h + +namespace pixelgpudetails { + // Maximum fed for phase1 is 150 but not all of them are filled + // Update the number FED based on maximum fed found in the cabling map + constexpr unsigned int MAX_FED = 150; + constexpr unsigned int MAX_LINK = 48; // maximum links/channels for Phase 1 + constexpr unsigned int MAX_ROC = 8; + constexpr unsigned int MAX_SIZE = MAX_FED * MAX_LINK * MAX_ROC; + constexpr unsigned int MAX_SIZE_BYTE_BOOL = MAX_SIZE * sizeof(unsigned char); +} // namespace pixelgpudetails + +// TODO: since this has more information than just cabling map, maybe we should invent a better name? 
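+// Each array below starts on a 128-byte boundary (alignas(128)), presumably so that
+// every column is cache-line aligned for coalesced GPU reads.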
+struct SiPixelFedCablingMapGPU { + alignas(128) unsigned int fed[pixelgpudetails::MAX_SIZE]; + alignas(128) unsigned int link[pixelgpudetails::MAX_SIZE]; + alignas(128) unsigned int roc[pixelgpudetails::MAX_SIZE]; + alignas(128) unsigned int RawId[pixelgpudetails::MAX_SIZE]; + alignas(128) unsigned int rocInDet[pixelgpudetails::MAX_SIZE]; + alignas(128) unsigned int moduleId[pixelgpudetails::MAX_SIZE]; + alignas(128) unsigned char badRocs[pixelgpudetails::MAX_SIZE]; + alignas(128) unsigned int size = 0; +}; + +#endif diff --git a/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.cc b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.cc new file mode 100644 index 000000000..56a3dc7ea --- /dev/null +++ b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.cc @@ -0,0 +1,54 @@ +// C++ includes +#include +#include +#include +#include + +// CUDA includes +#include + +// CMSSW includes +#include "CUDACore/cudaCheck.h" +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" +#include "CondFormats/SiPixelFedCablingMapGPUWrapper.h" + +SiPixelFedCablingMapGPUWrapper::SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const& cablingMap, + std::vector modToUnp) + : modToUnpDefault(modToUnp.size()), hasQuality_(true) { + cudaCheck(cudaMallocHost(&cablingMapHost, sizeof(SiPixelFedCablingMapGPU))); + std::memcpy(cablingMapHost, &cablingMap, sizeof(SiPixelFedCablingMapGPU)); + + std::copy(modToUnp.begin(), modToUnp.end(), modToUnpDefault.begin()); +} + +SiPixelFedCablingMapGPUWrapper::~SiPixelFedCablingMapGPUWrapper() { cudaCheck(cudaFreeHost(cablingMapHost)); } + +const SiPixelFedCablingMapGPU* SiPixelFedCablingMapGPUWrapper::getGPUProductAsync(cudaStream_t cudaStream) const { + const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) { + // allocate + cudaCheck(cudaMalloc(&data.cablingMapDevice, sizeof(SiPixelFedCablingMapGPU))); + + // transfer + cudaCheck(cudaMemcpyAsync( + data.cablingMapDevice, this->cablingMapHost, sizeof(SiPixelFedCablingMapGPU), cudaMemcpyDefault, stream)); + }); + return data.cablingMapDevice; +} + +const unsigned char* SiPixelFedCablingMapGPUWrapper::getModToUnpAllAsync(cudaStream_t cudaStream) const { + const auto& data = + modToUnp_.dataForCurrentDeviceAsync(cudaStream, [this](ModulesToUnpack& data, cudaStream_t stream) { + cudaCheck(cudaMalloc((void**)&data.modToUnpDefault, pixelgpudetails::MAX_SIZE_BYTE_BOOL)); + cudaCheck(cudaMemcpyAsync(data.modToUnpDefault, + this->modToUnpDefault.data(), + this->modToUnpDefault.size() * sizeof(unsigned char), + cudaMemcpyDefault, + stream)); + }); + return data.modToUnpDefault; +} + +SiPixelFedCablingMapGPUWrapper::GPUData::~GPUData() { cudaCheck(cudaFree(cablingMapDevice)); } + +SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::~ModulesToUnpack() { cudaCheck(cudaFree(modToUnpDefault)); } diff --git a/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h new file mode 100644 index 000000000..027e7d25c --- /dev/null +++ b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h @@ -0,0 +1,46 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h +#define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h + +#include "CUDACore/ESProduct.h" +#include "CUDACore/HostAllocator.h" +#include "CUDACore/device_unique_ptr.h" +#include "CondFormats/SiPixelFedCablingMapGPU.h" + +#include + +#include + +class 
SiPixelFedCablingMapGPUWrapper {
+public:
+  explicit SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const &cablingMap,
+                                          std::vector<unsigned char> modToUnp);
+  ~SiPixelFedCablingMapGPUWrapper();
+
+  bool hasQuality() const { return hasQuality_; }
+
+  // returns pointer to GPU memory
+  const SiPixelFedCablingMapGPU *getGPUProductAsync(cudaStream_t cudaStream) const;
+
+  // returns pointer to GPU memory
+  const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const;
+
+private:
+  std::vector<unsigned char, cms::cuda::HostAllocator<unsigned char>> modToUnpDefault;
+  bool hasQuality_;
+
+  SiPixelFedCablingMapGPU *cablingMapHost = nullptr;  // pointer to struct in CPU
+
+  struct GPUData {
+    ~GPUData();
+    SiPixelFedCablingMapGPU *cablingMapDevice = nullptr;  // pointer to struct in GPU
+  };
+  cms::cuda::ESProduct<GPUData> gpuData_;
+
+  struct ModulesToUnpack {
+    ~ModulesToUnpack();
+    unsigned char *modToUnpDefault = nullptr;  // pointer to GPU
+  };
+  cms::cuda::ESProduct<ModulesToUnpack> modToUnp_;
+};
+
+#endif
diff --git a/src/cudacompat/CondFormats/SiPixelFedIds.h b/src/cudacompat/CondFormats/SiPixelFedIds.h
new file mode 100644
index 000000000..ffbd44491
--- /dev/null
+++ b/src/cudacompat/CondFormats/SiPixelFedIds.h
@@ -0,0 +1,17 @@
+#ifndef CondFormats_SiPixelFedIds_h
+#define CondFormats_SiPixelFedIds_h
+
+#include <vector>
+
+// Stripped-down version of SiPixelFedCablingMap
+class SiPixelFedIds {
+public:
+  explicit SiPixelFedIds(std::vector<unsigned int> fedIds) : fedIds_(std::move(fedIds)) {}
+
+  std::vector<unsigned int> const& fedIds() const { return fedIds_; }
+
+private:
+  std::vector<unsigned int> fedIds_;
+};
+
+#endif
diff --git a/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc b/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc
new file mode 100644
index 000000000..76e64e8f3
--- /dev/null
+++ b/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc
@@ -0,0 +1,38 @@
+#include
+
+#include "CondFormats/SiPixelGainCalibrationForHLTGPU.h"
+#include "CondFormats/SiPixelGainForHLTonGPU.h"
+#include "CUDACore/cudaCheck.h"
+
+SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const& gain,
+                                                                 std::vector<char> gainData)
+    : gainData_(std::move(gainData)) {
+  cudaCheck(cudaMallocHost(&gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU)));
+  *gainForHLTonHost_ = gain;
+}
+
+SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { cudaCheck(cudaFreeHost(gainForHLTonHost_)); }
+
+SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() {
+  cudaCheck(cudaFree(gainForHLTonGPU));
+  cudaCheck(cudaFree(gainDataOnGPU));
+}
+
+const SiPixelGainForHLTonGPU* SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cudaStream_t cudaStream) const {
+  const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) {
+    cudaCheck(cudaMalloc((void**)&data.gainForHLTonGPU, sizeof(SiPixelGainForHLTonGPU)));
+    cudaCheck(cudaMalloc((void**)&data.gainDataOnGPU, this->gainData_.size()));
+    // gains.data().data() is used also for non-GPU code, we cannot allocate it on aligned and write-combined memory
+    cudaCheck(
+        cudaMemcpyAsync(data.gainDataOnGPU, this->gainData_.data(), this->gainData_.size(), cudaMemcpyDefault, stream));
+
+    cudaCheck(cudaMemcpyAsync(
+        data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream));
+    cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals),
+                              &(data.gainDataOnGPU),
+                              sizeof(SiPixelGainForHLTonGPU_DecodingStructure*),
+                              cudaMemcpyDefault,
+                              stream));
+  });
+  return data.gainForHLTonGPU;
+}
diff --git a/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.h b/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.h
new file mode 100644
index 000000000..e5920a08c
--- /dev/null
+++ b/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.h
@@ -0,0 +1,28 @@
+#ifndef CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h
+#define CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h
+
+#include "CUDACore/ESProduct.h"
+
+class SiPixelGainForHLTonGPU;
+struct SiPixelGainForHLTonGPU_DecodingStructure;
+
+class SiPixelGainCalibrationForHLTGPU {
+public:
+  explicit SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const &gain, std::vector<char> gainData);
+  ~SiPixelGainCalibrationForHLTGPU();
+
+  const SiPixelGainForHLTonGPU *getGPUProductAsync(cudaStream_t cudaStream) const;
+  const SiPixelGainForHLTonGPU *getCPUProduct() const { return gainForHLTonHost_; }
+
+private:
+  SiPixelGainForHLTonGPU *gainForHLTonHost_ = nullptr;
+  std::vector<char> gainData_;
+  struct GPUData {
+    ~GPUData();
+    SiPixelGainForHLTonGPU *gainForHLTonGPU = nullptr;
+    SiPixelGainForHLTonGPU_DecodingStructure *gainDataOnGPU = nullptr;
+  };
+  cms::cuda::ESProduct<GPUData> gpuData_;
+};
+
+#endif  // CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h
diff --git a/src/cudacompat/CondFormats/SiPixelGainForHLTonGPU.h b/src/cudacompat/CondFormats/SiPixelGainForHLTonGPU.h
new file mode 100644
index 000000000..5bcdc7a66
--- /dev/null
+++ b/src/cudacompat/CondFormats/SiPixelGainForHLTonGPU.h
@@ -0,0 +1,74 @@
+#ifndef CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h
+#define CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h
+
+#include
+#include
+#include
+
+// including <cuda_runtime.h> would pull in the dependency on all of CUDA;
+// instead, just define away the CUDA specific attributes to keep GCC happy.
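+// (With the no-op definitions below, members such as getPedAndGain(), which are marked
+//  __host__ __device__, compile as plain host functions when this header is used in host-only code.)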
+#ifndef __CUDACC__ +#ifndef __host__ +#define __host__ +#endif // __host__ +#ifndef __device__ +#define __device__ +#endif // __device__ +#endif // __CUDACC__ + +#include "CUDACore/cuda_assert.h" + +struct SiPixelGainForHLTonGPU_DecodingStructure { + uint8_t gain; + uint8_t ped; +}; + +// copy of SiPixelGainCalibrationForHLT +class SiPixelGainForHLTonGPU { +public: + using DecodingStructure = SiPixelGainForHLTonGPU_DecodingStructure; + + using Range = std::pair; + + inline __host__ __device__ std::pair getPedAndGain( + uint32_t moduleInd, int col, int row, bool& isDeadColumn, bool& isNoisyColumn) const { + auto range = rangeAndCols[moduleInd].first; + auto nCols = rangeAndCols[moduleInd].second; + + // determine what averaged data block we are in (there should be 1 or 2 of these depending on if plaquette is 1 by X or 2 by X + unsigned int lengthOfColumnData = (range.second - range.first) / nCols; + unsigned int lengthOfAveragedDataInEachColumn = 2; // we always only have two values per column averaged block + unsigned int numberOfDataBlocksToSkip = row / numberOfRowsAveragedOver_; + + auto offset = range.first + col * lengthOfColumnData + lengthOfAveragedDataInEachColumn * numberOfDataBlocksToSkip; + + assert(offset < range.second); + assert(offset < 3088384); + assert(0 == offset % 2); + + DecodingStructure const* __restrict__ lp = v_pedestals; + auto s = lp[offset / 2]; + + isDeadColumn = (s.ped & 0xFF) == deadFlag_; + isNoisyColumn = (s.ped & 0xFF) == noisyFlag_; + + return std::make_pair(decodePed(s.ped & 0xFF), decodeGain(s.gain & 0xFF)); + } + + constexpr float decodeGain(unsigned int gain) const { return gain * gainPrecision + minGain_; } + constexpr float decodePed(unsigned int ped) const { return ped * pedPrecision + minPed_; } + + DecodingStructure* v_pedestals; + std::pair rangeAndCols[2000]; + + float minPed_, maxPed_, minGain_, maxGain_; + + float pedPrecision, gainPrecision; + + unsigned int numberOfRowsAveragedOver_; // this is 80!!!! + unsigned int nBinsToUseForEncoding_; + unsigned int deadFlag_; + unsigned int noisyFlag_; +}; + +#endif // CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h diff --git a/src/cudacompat/CondFormats/pixelCPEforGPU.h b/src/cudacompat/CondFormats/pixelCPEforGPU.h new file mode 100644 index 000000000..f1eca60fd --- /dev/null +++ b/src/cudacompat/CondFormats/pixelCPEforGPU.h @@ -0,0 +1,344 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h +#define RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h + +#include +#include +#include +#include + +#include "CUDADataFormats/gpuClusteringConstants.h" +#include "DataFormats/SOARotation.h" +#include "Geometry/phase1PixelTopology.h" +#include "CUDACore/cudaCompat.h" + +namespace pixelCPEforGPU { + + using Frame = SOAFrame; + using Rotation = SOARotation; + + // all modules are identical! + struct CommonParams { + float theThicknessB; + float theThicknessE; + float thePitchX; + float thePitchY; + }; + + struct DetParams { + bool isBarrel; + bool isPosZ; + uint16_t layer; + uint16_t index; + uint32_t rawId; + + float shiftX; + float shiftY; + float chargeWidthX; + float chargeWidthY; + // CMSSW 11.2.x adds + //uint16_t pixmx; // max pix charge + // which would break reading the binary dumps + + float x0, y0, z0; // the vertex in the local coord of the detector + + float sx[3], sy[3]; // the errors... 
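+    // (errorFromDB() below picks an entry by cluster-size class:
+    //  [0] cluster spanning more than one pixel, [1] single normal pixel, [2] single big pixel)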
+ + Frame frame; + }; + + using phase1PixelTopology::AverageGeometry; + + struct LayerGeometry { + uint32_t layerStart[phase1PixelTopology::numberOfLayers + 1]; + uint8_t layer[phase1PixelTopology::layerIndexSize]; + }; + + struct ParamsOnGPU { + CommonParams const* m_commonParams; + DetParams const* m_detParams; + LayerGeometry const* m_layerGeometry; + AverageGeometry const* m_averageGeometry; + + constexpr CommonParams const& __restrict__ commonParams() const { + CommonParams const* __restrict__ l = m_commonParams; + return *l; + } + constexpr DetParams const& __restrict__ detParams(int i) const { + DetParams const* __restrict__ l = m_detParams; + return l[i]; + } + constexpr LayerGeometry const& __restrict__ layerGeometry() const { return *m_layerGeometry; } + constexpr AverageGeometry const& __restrict__ averageGeometry() const { return *m_averageGeometry; } + + __device__ uint8_t layer(uint16_t id) const { + return __ldg(m_layerGeometry->layer + id / phase1PixelTopology::maxModuleStride); + }; + }; + + // SOA (on device) + template + struct ClusParamsT { + uint32_t minRow[N]; + uint32_t maxRow[N]; + uint32_t minCol[N]; + uint32_t maxCol[N]; + + int32_t Q_f_X[N]; + int32_t Q_l_X[N]; + int32_t Q_f_Y[N]; + int32_t Q_l_Y[N]; + + int32_t charge[N]; + + float xpos[N]; + float ypos[N]; + + float xerr[N]; + float yerr[N]; + + int16_t xsize[N]; // clipped at 127 if negative is edge.... + int16_t ysize[N]; + }; + + constexpr int32_t MaxHitsInIter = gpuClustering::maxHitsInIter(); + using ClusParams = ClusParamsT; + + constexpr inline void computeAnglesFromDet( + DetParams const& __restrict__ detParams, float const x, float const y, float& cotalpha, float& cotbeta) { + // x,y local position on det + auto gvx = x - detParams.x0; + auto gvy = y - detParams.y0; + auto gvz = -1.f / detParams.z0; + // normalization not required as only ratio used... + // calculate angles + cotalpha = gvx * gvz; + cotbeta = gvy * gvz; + } + + constexpr inline float correction(int sizeM1, + int Q_f, //!< Charge in the first pixel. + int Q_l, //!< Charge in the last pixel. + uint16_t upper_edge_first_pix, //!< As the name says. + uint16_t lower_edge_last_pix, //!< As the name says. + float lorentz_shift, //!< L-shift at half thickness + float theThickness, //detector thickness + float cot_angle, //!< cot of alpha_ or beta_ + float pitch, //!< thePitchX or thePitchY + bool first_is_big, //!< true if the first is big + bool last_is_big) //!< true if the last is big + { + if (0 == sizeM1) // size 1 + return 0; + + float W_eff = 0; + bool simple = true; + if (1 == sizeM1) { // size 2 + //--- Width of the clusters minus the edge (first and last) pixels. + //--- In the note, they are denoted x_F and x_L (and y_F and y_L) + // assert(lower_edge_last_pix >= upper_edge_first_pix); + auto W_inner = pitch * float(lower_edge_last_pix - upper_edge_first_pix); // in cm + + //--- Predicted charge width from geometry + auto W_pred = theThickness * cot_angle // geometric correction (in cm) + - lorentz_shift; // (in cm) &&& check fpix! + + W_eff = std::abs(W_pred) - W_inner; + + //--- If the observed charge width is inconsistent with the expectations + //--- based on the track, do *not* use W_pred-W_inner. Instead, replace + //--- it with an *average* effective charge width, which is the average + //--- length of the edge pixels. + simple = + (W_eff < 0.0f) | (W_eff > pitch); // this produces "large" regressions for very small numeric differences... 
+ } + + if (simple) { + //--- Total length of the two edge pixels (first+last) + float sum_of_edge = 2.0f; + if (first_is_big) + sum_of_edge += 1.0f; + if (last_is_big) + sum_of_edge += 1.0f; + W_eff = pitch * 0.5f * sum_of_edge; // ave. length of edge pixels (first+last) (cm) + } + + //--- Finally, compute the position in this projection + float Qdiff = Q_l - Q_f; + float Qsum = Q_l + Q_f; + + //--- Temporary fix for clusters with both first and last pixel with charge = 0 + if (Qsum == 0) + Qsum = 1.0f; + + return 0.5f * (Qdiff / Qsum) * W_eff; + } + + constexpr inline void position(CommonParams const& __restrict__ comParams, + DetParams const& __restrict__ detParams, + ClusParams& cp, + uint32_t ic) { + //--- Upper Right corner of Lower Left pixel -- in measurement frame + uint16_t llx = cp.minRow[ic] + 1; + uint16_t lly = cp.minCol[ic] + 1; + + //--- Lower Left corner of Upper Right pixel -- in measurement frame + uint16_t urx = cp.maxRow[ic]; + uint16_t ury = cp.maxCol[ic]; + + auto llxl = phase1PixelTopology::localX(llx); + auto llyl = phase1PixelTopology::localY(lly); + auto urxl = phase1PixelTopology::localX(urx); + auto uryl = phase1PixelTopology::localY(ury); + + auto mx = llxl + urxl; + auto my = llyl + uryl; + + auto xsize = int(urxl) + 2 - int(llxl); + auto ysize = int(uryl) + 2 - int(llyl); + assert(xsize >= 0); // 0 if bixpix... + assert(ysize >= 0); + + if (phase1PixelTopology::isBigPixX(cp.minRow[ic])) + ++xsize; + if (phase1PixelTopology::isBigPixX(cp.maxRow[ic])) + ++xsize; + if (phase1PixelTopology::isBigPixY(cp.minCol[ic])) + ++ysize; + if (phase1PixelTopology::isBigPixY(cp.maxCol[ic])) + ++ysize; + + int unbalanceX = 8. * std::abs(float(cp.Q_f_X[ic] - cp.Q_l_X[ic])) / float(cp.Q_f_X[ic] + cp.Q_l_X[ic]); + int unbalanceY = 8. * std::abs(float(cp.Q_f_Y[ic] - cp.Q_l_Y[ic])) / float(cp.Q_f_Y[ic] + cp.Q_l_Y[ic]); + xsize = 8 * xsize - unbalanceX; + ysize = 8 * ysize - unbalanceY; + + cp.xsize[ic] = std::min(xsize, 1023); + cp.ysize[ic] = std::min(ysize, 1023); + + if (cp.minRow[ic] == 0 || cp.maxRow[ic] == phase1PixelTopology::lastRowInModule) + cp.xsize[ic] = -cp.xsize[ic]; + if (cp.minCol[ic] == 0 || cp.maxCol[ic] == phase1PixelTopology::lastColInModule) + cp.ysize[ic] = -cp.ysize[ic]; + + // apply the lorentz offset correction + auto xPos = detParams.shiftX + comParams.thePitchX * (0.5f * float(mx) + float(phase1PixelTopology::xOffset)); + auto yPos = detParams.shiftY + comParams.thePitchY * (0.5f * float(my) + float(phase1PixelTopology::yOffset)); + + float cotalpha = 0, cotbeta = 0; + + computeAnglesFromDet(detParams, xPos, yPos, cotalpha, cotbeta); + + auto thickness = detParams.isBarrel ? 
comParams.theThicknessB : comParams.theThicknessE; + + auto xcorr = correction(cp.maxRow[ic] - cp.minRow[ic], + cp.Q_f_X[ic], + cp.Q_l_X[ic], + llxl, + urxl, + detParams.chargeWidthX, // lorentz shift in cm + thickness, + cotalpha, + comParams.thePitchX, + phase1PixelTopology::isBigPixX(cp.minRow[ic]), + phase1PixelTopology::isBigPixX(cp.maxRow[ic])); + + auto ycorr = correction(cp.maxCol[ic] - cp.minCol[ic], + cp.Q_f_Y[ic], + cp.Q_l_Y[ic], + llyl, + uryl, + detParams.chargeWidthY, // lorentz shift in cm + thickness, + cotbeta, + comParams.thePitchY, + phase1PixelTopology::isBigPixY(cp.minCol[ic]), + phase1PixelTopology::isBigPixY(cp.maxCol[ic])); + + cp.xpos[ic] = xPos + xcorr; + cp.ypos[ic] = yPos + ycorr; + } + + constexpr inline void errorFromSize(CommonParams const& __restrict__ comParams, + DetParams const& __restrict__ detParams, + ClusParams& cp, + uint32_t ic) { + // Edge cluster errors + cp.xerr[ic] = 0.0050; + cp.yerr[ic] = 0.0085; + + // FIXME these are errors form Run1 + constexpr float xerr_barrel_l1[] = {0.00115, 0.00120, 0.00088}; + constexpr float xerr_barrel_l1_def = 0.00200; // 0.01030; + constexpr float yerr_barrel_l1[] = { + 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240}; + constexpr float yerr_barrel_l1_def = 0.00210; + constexpr float xerr_barrel_ln[] = {0.00115, 0.00120, 0.00088}; + constexpr float xerr_barrel_ln_def = 0.00200; // 0.01030; + constexpr float yerr_barrel_ln[] = { + 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240}; + constexpr float yerr_barrel_ln_def = 0.00210; + constexpr float xerr_endcap[] = {0.0020, 0.0020}; + constexpr float xerr_endcap_def = 0.0020; + constexpr float yerr_endcap[] = {0.00210}; + constexpr float yerr_endcap_def = 0.00210; + + auto sx = cp.maxRow[ic] - cp.minRow[ic]; + auto sy = cp.maxCol[ic] - cp.minCol[ic]; + + // is edgy ? + bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule; + bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule; + // is one and big? + bool isBig1X = (0 == sx) && phase1PixelTopology::isBigPixX(cp.minRow[ic]); + bool isBig1Y = (0 == sy) && phase1PixelTopology::isBigPixY(cp.minCol[ic]); + + if (!isEdgeX && !isBig1X) { + if (not detParams.isBarrel) { + cp.xerr[ic] = sx < std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def; + } else if (detParams.layer == 1) { + cp.xerr[ic] = sx < std::size(xerr_barrel_l1) ? xerr_barrel_l1[sx] : xerr_barrel_l1_def; + } else { + cp.xerr[ic] = sx < std::size(xerr_barrel_ln) ? xerr_barrel_ln[sx] : xerr_barrel_ln_def; + } + } + + if (!isEdgeY && !isBig1Y) { + if (not detParams.isBarrel) { + cp.yerr[ic] = sy < std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def; + } else if (detParams.layer == 1) { + cp.yerr[ic] = sy < std::size(yerr_barrel_l1) ? yerr_barrel_l1[sy] : yerr_barrel_l1_def; + } else { + cp.yerr[ic] = sy < std::size(yerr_barrel_ln) ? yerr_barrel_ln[sy] : yerr_barrel_ln_def; + } + } + } + + constexpr inline void errorFromDB(CommonParams const& __restrict__ comParams, + DetParams const& __restrict__ detParams, + ClusParams& cp, + uint32_t ic) { + // Edge cluster errors + cp.xerr[ic] = 0.0050f; + cp.yerr[ic] = 0.0085f; + + auto sx = cp.maxRow[ic] - cp.minRow[ic]; + auto sy = cp.maxCol[ic] - cp.minCol[ic]; + + // is edgy ? 
+ bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule; + bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule; + // is one and big? + uint32_t ix = (0 == sx); + uint32_t iy = (0 == sy); + ix += (0 == sx) && phase1PixelTopology::isBigPixX(cp.minRow[ic]); + iy += (0 == sy) && phase1PixelTopology::isBigPixY(cp.minCol[ic]); + + if (not isEdgeX) + cp.xerr[ic] = detParams.sx[ix]; + if (not isEdgeY) + cp.yerr[ic] = detParams.sy[iy]; + } + +} // namespace pixelCPEforGPU + +#endif // RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h diff --git a/src/cudacompat/DataFormats/BeamSpotPOD.h b/src/cudacompat/DataFormats/BeamSpotPOD.h new file mode 100644 index 000000000..7b4c7c241 --- /dev/null +++ b/src/cudacompat/DataFormats/BeamSpotPOD.h @@ -0,0 +1,21 @@ +#ifndef DataFormats_BeamSpot_interface_BeamSpotPOD_h +#define DataFormats_BeamSpot_interface_BeamSpotPOD_h + +// This struct is a transient-only, simplified representation of the beamspot +// data used as the underlying type for data transfers and operations in +// heterogeneous code (e.g. in CUDA code). + +// The covariance matrix is not used in that code, so is left out here. + +// CMSSW 11.2.x uses alignas(128) to align to the CUDA L1 cache line size +// here it would break reading the beamspot data from the binary dumps +struct BeamSpotPOD { + float x, y, z; // position + float sigmaZ; + float beamWidthX, beamWidthY; + float dxdz, dydz; + float emittanceX, emittanceY; + float betaStar; +}; + +#endif // DataFormats_BeamSpot_interface_BeamSpotPOD_h diff --git a/src/cudacompat/DataFormats/DigiClusterCount.h b/src/cudacompat/DataFormats/DigiClusterCount.h new file mode 100644 index 000000000..2abfda375 --- /dev/null +++ b/src/cudacompat/DataFormats/DigiClusterCount.h @@ -0,0 +1,17 @@ +#ifndef DataFormats_DigiClusterCount_h +#define DataFormats_DigiClusterCount_h + +class DigiClusterCount { +public: + explicit DigiClusterCount(unsigned int nm, unsigned int nd, unsigned int nc) + : modules_(nm), digis_(nd), clusters_(nc) {} + + unsigned int nModules() const { return modules_; } + unsigned int nDigis() const { return digis_; } + unsigned int nClusters() const { return clusters_; } + +private: + unsigned int modules_, digis_, clusters_; +}; + +#endif diff --git a/src/cudacompat/DataFormats/FEDHeader.cc b/src/cudacompat/DataFormats/FEDHeader.cc new file mode 100644 index 000000000..c1aae97e9 --- /dev/null +++ b/src/cudacompat/DataFormats/FEDHeader.cc @@ -0,0 +1,46 @@ +/** \file + * + * \author N. Amapane - CERN, R. 
Mommsen - FNAL + */ + +#include "DataFormats/FEDHeader.h" +#include "DataFormats/fed_header.h" + +FEDHeader::FEDHeader(const unsigned char* header) : theHeader(reinterpret_cast(header)) {} + +FEDHeader::~FEDHeader() {} + +uint8_t FEDHeader::triggerType() const { return FED_EVTY_EXTRACT(theHeader->eventid); } + +uint32_t FEDHeader::lvl1ID() const { return FED_LVL1_EXTRACT(theHeader->eventid); } + +uint16_t FEDHeader::bxID() const { return FED_BXID_EXTRACT(theHeader->sourceid); } + +uint16_t FEDHeader::sourceID() const { return FED_SOID_EXTRACT(theHeader->sourceid); } + +uint8_t FEDHeader::version() const { return FED_VERSION_EXTRACT(theHeader->sourceid); } + +bool FEDHeader::moreHeaders() const { return (FED_MORE_HEADERS_EXTRACT(theHeader->sourceid) != 0); } + +void FEDHeader::set(unsigned char* header, + uint8_t triggerType, + uint32_t lvl1ID, + uint16_t bxID, + uint16_t sourceID, + uint8_t version, + bool moreHeaders) { + // FIXME: should check that input ranges are OK!!! + fedh_t* h = reinterpret_cast(header); + h->eventid = (FED_SLINK_START_MARKER << FED_HCTRLID_SHIFT) | ((triggerType << FED_EVTY_SHIFT) & FED_EVTY_MASK) | + ((lvl1ID << FED_LVL1_SHIFT) & FED_LVL1_MASK); + + h->sourceid = ((bxID << FED_BXID_SHIFT) & FED_BXID_MASK) | ((sourceID << FED_SOID_SHIFT) & FED_SOID_MASK) | + ((version << FED_VERSION_SHIFT) & FED_VERSION_MASK); + + if (moreHeaders) + h->sourceid |= (FED_MORE_HEADERS_WIDTH << FED_MORE_HEADERS_SHIFT); +} + +bool FEDHeader::check() const { return (FED_HCTRLID_EXTRACT(theHeader->eventid) == FED_SLINK_START_MARKER); } + +const uint32_t FEDHeader::length = sizeof(fedh_t); diff --git a/src/cudacompat/DataFormats/FEDHeader.h b/src/cudacompat/DataFormats/FEDHeader.h new file mode 100644 index 000000000..5550877d6 --- /dev/null +++ b/src/cudacompat/DataFormats/FEDHeader.h @@ -0,0 +1,59 @@ +#ifndef DataFormats_FEDRawData_FEDHeader_h +#define DataFormats_FEDRawData_FEDHeader_h + +/** \class FEDHeader + * Helper class to interpret/create FED header words. + * + * \author N. Amapane - CERN, R. Mommsen - FNAL + */ + +#include + +struct fedh_struct; + +class FEDHeader { +public: + /// Constructor + FEDHeader(const unsigned char* header); + + /// Destructor + ~FEDHeader(); + + /// Event Trigger type identifier + uint8_t triggerType() const; + + /// Level-1 event number generated by the TTC system + uint32_t lvl1ID() const; + + /// The bunch crossing number + uint16_t bxID() const; + + /// Identifier of the FED + uint16_t sourceID() const; + + /// Version identifier of the FED data format + uint8_t version() const; + + /// 0 -> the current header word is the last one. + /// 1-> other header words can follow + /// (always 1 for ECAL) + bool moreHeaders() const; + + /// Check that the header is OK + bool check() const; + + /// Set all fields in the header + static void set(unsigned char* header, + uint8_t triggerType, + uint32_t lvl1ID, + uint16_t bxID, + uint16_t sourceID, + uint8_t version = 0, + bool moreHeaders = false); + + static const uint32_t length; + +private: + const fedh_struct* theHeader; +}; +#endif // DataFormats_FEDRawData_FEDHeader_h diff --git a/src/cudacompat/DataFormats/FEDNumbering.cc b/src/cudacompat/DataFormats/FEDNumbering.cc new file mode 100644 index 000000000..30367d9e8 --- /dev/null +++ b/src/cudacompat/DataFormats/FEDNumbering.cc @@ -0,0 +1,111 @@ +/** \file + * + * \author G. 
Bruno - CERN, EP Division + */ +#include "DataFormats/FEDNumbering.h" +#include + +using namespace std; + +namespace { + + constexpr std::array initIn() { + std::array in = {{false}}; + + int i = 0; + for (i = 0; i < FEDNumbering::lastFEDId(); i++) + in[i] = false; + for (i = FEDNumbering::MINSiPixelFEDID; i <= FEDNumbering::MAXSiPixelFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINSiStripFEDID; i <= FEDNumbering::MAXSiStripFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINPreShowerFEDID; i <= FEDNumbering::MAXPreShowerFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINECALFEDID; i <= FEDNumbering::MAXECALFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINCASTORFEDID; i <= FEDNumbering::MAXCASTORFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINHCALFEDID; i <= FEDNumbering::MAXHCALFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINLUMISCALERSFEDID; i <= FEDNumbering::MAXLUMISCALERSFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINCSCFEDID; i <= FEDNumbering::MAXCSCFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINCSCTFFEDID; i <= FEDNumbering::MAXCSCTFFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINDTFEDID; i <= FEDNumbering::MAXDTFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINDTTFFEDID; i <= FEDNumbering::MAXDTTFFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINRPCFEDID; i <= FEDNumbering::MAXRPCFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINTriggerGTPFEDID; i <= FEDNumbering::MAXTriggerGTPFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINTriggerEGTPFEDID; i <= FEDNumbering::MAXTriggerEGTPFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINTriggerGCTFEDID; i <= FEDNumbering::MAXTriggerGCTFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINTriggerLTCFEDID; i <= FEDNumbering::MAXTriggerLTCFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINTriggerLTCmtccFEDID; i <= FEDNumbering::MAXTriggerLTCmtccFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINCSCDDUFEDID; i <= FEDNumbering::MAXCSCDDUFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINCSCContingencyFEDID; i <= FEDNumbering::MAXCSCContingencyFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINCSCTFSPFEDID; i <= FEDNumbering::MAXCSCTFSPFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINDAQeFEDFEDID; i <= FEDNumbering::MAXDAQeFEDFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINDAQmFEDFEDID; i <= FEDNumbering::MAXDAQmFEDFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINTCDSuTCAFEDID; i <= FEDNumbering::MAXTCDSuTCAFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINHCALuTCAFEDID; i <= FEDNumbering::MAXHCALuTCAFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINSiPixeluTCAFEDID; i <= FEDNumbering::MAXSiPixeluTCAFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINDTUROSFEDID; i <= FEDNumbering::MAXDTUROSFEDID; i++) { + in[i] = true; + } + for (i = FEDNumbering::MINTriggerUpgradeFEDID; i <= FEDNumbering::MAXTriggerUpgradeFEDID; i++) { + in[i] = true; + } + return in; + } + + constexpr std::array in_ = initIn(); + +} // namespace + +bool FEDNumbering::inRange(int i) { return in_[i]; } +bool FEDNumbering::inRangeNoGT(int i) { + if ((i >= MINTriggerGTPFEDID && i <= MAXTriggerGTPFEDID) || (i >= MINTriggerEGTPFEDID && i <= MAXTriggerEGTPFEDID)) + return false; + return in_[i]; +} diff --git a/src/cudacompat/DataFormats/FEDNumbering.h 
b/src/cudacompat/DataFormats/FEDNumbering.h new file mode 100644 index 000000000..d819b780f --- /dev/null +++ b/src/cudacompat/DataFormats/FEDNumbering.h @@ -0,0 +1,132 @@ +#ifndef FEDRawData_FEDNumbering_h +#define FEDRawData_FEDNumbering_h + +/** \class FEDNumbering + * + * This class holds the fed numbering scheme for the CMS geometry. + * No two feds should have the same id. Each subdetector has a reserved range. + * Gaps between ranges give flexibility to the numbering. + * + * $Log + * + * \author G. Bruno - CERN, EP Division + */ + +#include + +class FEDNumbering { +public: + static constexpr int lastFEDId() { return MAXFEDID; } + + static bool inRange(int); + static bool inRangeNoGT(int); + + enum { + NOT_A_FEDID = -1, + MAXFEDID = 4096, // must be larger than largest used FED id + MINSiPixelFEDID = 0, + MAXSiPixelFEDID = 40, // increase from 39 for the pilot blade fed + MINSiStripFEDID = 50, + MAXSiStripFEDID = 489, + MINPreShowerFEDID = 520, + MAXPreShowerFEDID = 575, + MINTotemTriggerFEDID = 577, + MAXTotemTriggerFEDID = 577, + MINTotemRPHorizontalFEDID = 578, + MAXTotemRPHorizontalFEDID = 581, + MINCTPPSDiamondFEDID = 582, + MAXCTPPSDiamondFEDID = 583, + MINTotemRPVerticalFEDID = 584, + MAXTotemRPVerticalFEDID = 585, + MINTotemRPTimingVerticalFEDID = 586, + MAXTotemRPTimingVerticalFEDID = 587, + MINECALFEDID = 600, + MAXECALFEDID = 670, + MINCASTORFEDID = 690, + MAXCASTORFEDID = 693, + MINHCALFEDID = 700, + MAXHCALFEDID = 731, + MINLUMISCALERSFEDID = 735, + MAXLUMISCALERSFEDID = 735, + MINCSCFEDID = 750, + MAXCSCFEDID = 757, + MINCSCTFFEDID = 760, + MAXCSCTFFEDID = 760, + MINDTFEDID = 770, + MAXDTFEDID = 779, + MINDTTFFEDID = 780, + MAXDTTFFEDID = 780, + MINRPCFEDID = 790, + MAXRPCFEDID = 795, + MINTriggerGTPFEDID = 812, + MAXTriggerGTPFEDID = 813, + MINTriggerEGTPFEDID = 814, + MAXTriggerEGTPFEDID = 814, + MINTriggerGCTFEDID = 745, + MAXTriggerGCTFEDID = 749, + MINTriggerLTCFEDID = 816, + MAXTriggerLTCFEDID = 824, + MINTriggerLTCmtccFEDID = 815, + MAXTriggerLTCmtccFEDID = 815, + MINTriggerLTCTriggerFEDID = 816, + MAXTriggerLTCTriggerFEDID = 816, + MINTriggerLTCHCALFEDID = 817, + MAXTriggerLTCHCALFEDID = 817, + MINTriggerLTCSiStripFEDID = 818, + MAXTriggerLTCSiStripFEDID = 818, + MINTriggerLTCECALFEDID = 819, + MAXTriggerLTCECALFEDID = 819, + MINTriggerLTCTotemCastorFEDID = 820, + MAXTriggerLTCTotemCastorFEDID = 820, + MINTriggerLTCRPCFEDID = 821, + MAXTriggerLTCRPCFEDID = 821, + MINTriggerLTCCSCFEDID = 822, + MAXTriggerLTCCSCFEDID = 822, + MINTriggerLTCDTFEDID = 823, + MAXTriggerLTCDTFEDID = 823, + MINTriggerLTCSiPixelFEDID = 824, + MAXTriggerLTCSiPixelFEDID = 824, + MINCSCDDUFEDID = 830, + MAXCSCDDUFEDID = 869, + MINCSCContingencyFEDID = 880, + MAXCSCContingencyFEDID = 887, + MINCSCTFSPFEDID = 890, + MAXCSCTFSPFEDID = 901, + MINDAQeFEDFEDID = 902, + MAXDAQeFEDFEDID = 931, + MINMetaDataSoftFEDID = 1022, + MAXMetaDataSoftFEDID = 1022, + MINDAQmFEDFEDID = 1023, + MAXDAQmFEDFEDID = 1023, + MINTCDSuTCAFEDID = 1024, + MAXTCDSuTCAFEDID = 1099, + MINHCALuTCAFEDID = 1100, + MAXHCALuTCAFEDID = 1199, + MINSiPixeluTCAFEDID = 1200, + MAXSiPixeluTCAFEDID = 1349, + MINRCTFEDID = 1350, + MAXRCTFEDID = 1359, + MINCalTrigUp = 1360, + MAXCalTrigUp = 1367, + MINDTUROSFEDID = 1369, + MAXDTUROSFEDID = 1371, + MINTriggerUpgradeFEDID = 1372, + MAXTriggerUpgradeFEDID = 1409, + MINSiPixel2nduTCAFEDID = 1500, + MAXSiPixel2nduTCAFEDID = 1649, + MINSiPixelTestFEDID = 1450, + MAXSiPixelTestFEDID = 1461, + MINSiPixelAMC13FEDID = 1410, + MAXSiPixelAMC13FEDID = 1449, + MINCTPPSPixelsFEDID = 1462, + 
MAXCTPPSPixelsFEDID = 1466, + MINGEMFEDID = 1467, + MAXGEMFEDID = 1472, + MINME0FEDID = 1473, + MAXME0FEDID = 1478, + MINDAQvFEDFEDID = 2815, + MAXDAQvFEDFEDID = 4095 + }; +}; + +#endif // FEDNumbering_H diff --git a/src/cudacompat/DataFormats/FEDRawData.cc b/src/cudacompat/DataFormats/FEDRawData.cc new file mode 100644 index 000000000..026281d3e --- /dev/null +++ b/src/cudacompat/DataFormats/FEDRawData.cc @@ -0,0 +1,34 @@ +/** \file + implementation of class FedRawData + + \author Stefano ARGIRO + \date 28 Jun 2005 +*/ + +#include "DataFormats/FEDRawData.h" +#include + +using namespace std; + +FEDRawData::FEDRawData() {} + +FEDRawData::FEDRawData(size_t newsize) : data_(newsize) { + if (newsize % 8 != 0) + throw std::runtime_error("FEDRawData::resize: " + std::to_string(newsize) + " is not a multiple of 8 bytes."); +} + +FEDRawData::FEDRawData(const FEDRawData &in) : data_(in.data_) {} +FEDRawData::~FEDRawData() {} +const unsigned char *FEDRawData::data() const { return data_.data(); } + +unsigned char *FEDRawData::data() { return data_.data(); } + +void FEDRawData::resize(size_t newsize) { + if (size() == newsize) + return; + + data_.resize(newsize); + + if (newsize % 8 != 0) + throw std::runtime_error("FEDRawData::resize: " + std::to_string(newsize) + " is not a multiple of 8 bytes."); +} diff --git a/src/cudacompat/DataFormats/FEDRawData.h b/src/cudacompat/DataFormats/FEDRawData.h new file mode 100644 index 000000000..8def41c8e --- /dev/null +++ b/src/cudacompat/DataFormats/FEDRawData.h @@ -0,0 +1,55 @@ +#ifndef FEDRawData_FEDRawData_h +#define FEDRawData_FEDRawData_h + +/** \class FEDRawData + * + * Class representing the raw data for one FED. + * The raw data is owned as a binary buffer. It is required that the + * lenght of the data is a multiple of the S-Link64 word lenght (8 byte). + * The FED data should include the standard FED header and trailer. + * + * \author G. Bruno - CERN, EP Division + * \author S. Argiro - CERN and INFN - + * Refactoring and Modifications to fit into CMSSW + */ + +#include +#include + +class FEDRawData { +public: + typedef std::vector Data; + typedef Data::iterator iterator; + + /// Default ctor + FEDRawData(); + + /// Ctor specifying the size to be preallocated, in bytes. + /// It is required that the size is a multiple of the size of a FED + /// word (8 bytes) + FEDRawData(size_t newsize); + + /// Copy constructor + FEDRawData(const FEDRawData &); + + /// Dtor + ~FEDRawData(); + + /// Return a const pointer to the beginning of the data buffer + const unsigned char *data() const; + + /// Return a pointer to the beginning of the data buffer + unsigned char *data(); + + /// Lenght of the data buffer in bytes + size_t size() const { return data_.size(); } + + /// Resize to the specified size in bytes. It is required that + /// the size is a multiple of the size of a FED word (8 bytes) + void resize(size_t newsize); + +private: + Data data_; +}; + +#endif diff --git a/src/cudacompat/DataFormats/FEDRawDataCollection.cc b/src/cudacompat/DataFormats/FEDRawDataCollection.cc new file mode 100644 index 000000000..9b179d121 --- /dev/null +++ b/src/cudacompat/DataFormats/FEDRawDataCollection.cc @@ -0,0 +1,17 @@ +/** \file + * implementation of DaqRawDataCollection + * + * \author N. Amapane - S. 
Argiro' + */ + +#include "DataFormats/FEDRawDataCollection.h" +#include "DataFormats/FEDNumbering.h" + +FEDRawDataCollection::FEDRawDataCollection() : data_(FEDNumbering::lastFEDId() + 1) {} + +FEDRawDataCollection::FEDRawDataCollection(const FEDRawDataCollection& in) : data_(in.data_) {} +FEDRawDataCollection::~FEDRawDataCollection() {} + +const FEDRawData& FEDRawDataCollection::FEDData(int fedid) const { return data_[fedid]; } + +FEDRawData& FEDRawDataCollection::FEDData(int fedid) { return data_[fedid]; } diff --git a/src/cudacompat/DataFormats/FEDRawDataCollection.h b/src/cudacompat/DataFormats/FEDRawDataCollection.h new file mode 100644 index 000000000..d38319bd8 --- /dev/null +++ b/src/cudacompat/DataFormats/FEDRawDataCollection.h @@ -0,0 +1,38 @@ +#ifndef FEDRawData_FEDRawDataCollection_h +#define FEDRawData_FEDRawDataCollection_h + +/** \class FEDRawDataCollection + * An EDCollection storing the raw data for all FEDs in a Event. + * + * Reference: DaqPrototype/DaqPersistentData/interface/DaqFEDOpaqueData.h + * + * \author N. Amapane - S. Argiro' + */ + +#include "DataFormats/FEDRawData.h" + +#include + +class FEDRawDataCollection { +public: + FEDRawDataCollection(); + + virtual ~FEDRawDataCollection(); + + /// retrieve data for fed @param fedid + const FEDRawData& FEDData(int fedid) const; + + /// retrieve data for fed @param fedid + FEDRawData& FEDData(int fedid); + + FEDRawDataCollection(const FEDRawDataCollection&); + + void swap(FEDRawDataCollection& other) { data_.swap(other.data_); } + +private: + std::vector data_; ///< the raw data +}; + +inline void swap(FEDRawDataCollection& a, FEDRawDataCollection& b) { a.swap(b); } + +#endif diff --git a/src/cudacompat/DataFormats/FEDTrailer.cc b/src/cudacompat/DataFormats/FEDTrailer.cc new file mode 100644 index 000000000..dab3ef05e --- /dev/null +++ b/src/cudacompat/DataFormats/FEDTrailer.cc @@ -0,0 +1,47 @@ +/** \file + * + * \author N. Amapane - CERN, R. Mommsen - FNAL + */ + +#include "DataFormats/FEDTrailer.h" +#include "DataFormats/fed_trailer.h" + +FEDTrailer::FEDTrailer(const unsigned char* trailer) : theTrailer(reinterpret_cast(trailer)) {} + +FEDTrailer::~FEDTrailer() {} + +uint32_t FEDTrailer::fragmentLength() const { return FED_EVSZ_EXTRACT(theTrailer->eventsize); } + +uint16_t FEDTrailer::crc() const { return FED_CRCS_EXTRACT(theTrailer->conscheck); } + +uint8_t FEDTrailer::evtStatus() const { return FED_STAT_EXTRACT(theTrailer->conscheck); } + +uint8_t FEDTrailer::ttsBits() const { return FED_TTSI_EXTRACT(theTrailer->conscheck); } + +bool FEDTrailer::moreTrailers() const { return (FED_MORE_TRAILERS_EXTRACT(theTrailer->conscheck) != 0); } + +bool FEDTrailer::crcModified() const { return (FED_CRC_MODIFIED_EXTRACT(theTrailer->conscheck) != 0); } + +bool FEDTrailer::slinkError() const { return (FED_SLINK_ERROR_EXTRACT(theTrailer->conscheck) != 0); } + +bool FEDTrailer::wrongFedId() const { return (FED_WRONG_FEDID_EXTRACT(theTrailer->conscheck) != 0); } + +uint32_t FEDTrailer::conscheck() const { return theTrailer->conscheck; } + +void FEDTrailer::set( + unsigned char* trailer, uint32_t lenght, uint16_t crc, uint8_t evtStatus, uint8_t ttsBits, bool moreTrailers) { + // FIXME: should check that input ranges are OK!!! 
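+  // (one possible sanity check, sketched here only and assuming fed_trailer.h follows the same
+  //  WIDTH/SHIFT/MASK convention as fed_header.h: verify each field fits its mask before packing, e.g.
+  //    assert((lenght & ~(FED_EVSZ_MASK >> FED_EVSZ_SHIFT)) == 0);
+  //  and similarly for crc, evtStatus and ttsBits)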
+ fedt_t* t = reinterpret_cast(trailer); + + t->eventsize = (FED_SLINK_END_MARKER << FED_TCTRLID_SHIFT) | ((lenght << FED_EVSZ_SHIFT) & FED_EVSZ_MASK); + + t->conscheck = ((crc << FED_CRCS_SHIFT) & FED_CRCS_MASK) | ((evtStatus << FED_STAT_SHIFT) & FED_STAT_MASK) | + ((ttsBits << FED_TTSI_SHIFT) & FED_TTSI_MASK); + + if (moreTrailers) + t->conscheck |= (FED_MORE_TRAILERS_WIDTH << FED_MORE_TRAILERS_SHIFT); +} + +bool FEDTrailer::check() const { return (FED_TCTRLID_EXTRACT(theTrailer->eventsize) == FED_SLINK_END_MARKER); } + +const uint32_t FEDTrailer::length = sizeof(fedt_t); diff --git a/src/cudacompat/DataFormats/FEDTrailer.h b/src/cudacompat/DataFormats/FEDTrailer.h new file mode 100644 index 000000000..44781eea0 --- /dev/null +++ b/src/cudacompat/DataFormats/FEDTrailer.h @@ -0,0 +1,62 @@ +#ifndef DataFormats_FEDRawData_FEDTrailer_h +#define DataFormats_FEDRawData_FEDTrailer_h + +/** \class FEDTrailer + * Helper class to interpret/create FED trailer words. + * + * \author N. Amapane - CERN, R. Mommsen - FNAL + */ + +#include + +struct fedt_struct; + +class FEDTrailer { +public: + /// Constructor + FEDTrailer(const unsigned char* trailer); + + /// Destructor + virtual ~FEDTrailer(); + + /// The length of the event fragment counted in 64-bit words including header and trailer + uint32_t fragmentLength() const; + + /// Cyclic Redundancy Code of the event fragment including header and trailer + uint16_t crc() const; + + /// Event fragment status information + uint8_t evtStatus() const; + + /// Current value of the Trigger Throttling System bits + uint8_t ttsBits() const; + + /// 0 -> the current trailer word is the last one. + /// 1 -> other trailer words can follow + bool moreTrailers() const; + + /// True if the CRC value has been modified by the S-link sender card + bool crcModified() const; + + /// True if the FRL has detected a transmission error over the s-link cable + bool slinkError() const; + + /// True if the FED_ID given by the FED is not the one expected by the FRL + bool wrongFedId() const; + + /// Check that the trailer is OK + bool check() const; + + /// Return the word containing the consistency checks + uint32_t conscheck() const; + + /// Set all fields in the trailer + static void set( + unsigned char* trailer, uint32_t lenght, uint16_t crc, uint8_t evt_stat, uint8_t tts, bool moreTrailers = false); + + static const uint32_t length; + +private: + const fedt_struct* theTrailer; +}; +#endif // DataFormats_FEDRawData_FEDTrailer_h diff --git a/src/cudacompat/DataFormats/PixelErrors.h b/src/cudacompat/DataFormats/PixelErrors.h new file mode 100644 index 000000000..797fec768 --- /dev/null +++ b/src/cudacompat/DataFormats/PixelErrors.h @@ -0,0 +1,21 @@ +#ifndef DataFormats_SiPixelDigi_interface_PixelErrors_h +#define DataFormats_SiPixelDigi_interface_PixelErrors_h + +#include +#include +#include + +#include "DataFormats/SiPixelRawDataError.h" + +// Better ideas for the placement of these? 
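+// (for reference: PixelErrorCompact below is a small POD, two uint32_t plus two uint8_t,
+//  i.e. 12 bytes after padding, which keeps per-event error buffers compact)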
+ +struct PixelErrorCompact { + uint32_t rawId; + uint32_t word; + uint8_t errorType; + uint8_t fedId; +}; + +using PixelFormatterErrors = std::map>; + +#endif // DataFormats_SiPixelDigi_interface_PixelErrors_h diff --git a/src/cudacompat/DataFormats/SOARotation.h b/src/cudacompat/DataFormats/SOARotation.h new file mode 100644 index 000000000..d75efef47 --- /dev/null +++ b/src/cudacompat/DataFormats/SOARotation.h @@ -0,0 +1,140 @@ +#ifndef DataFormats_GeometrySurface_SOARotation_h +#define DataFormats_GeometrySurface_SOARotation_h + +template +class TkRotation; + +// to be moved in an external common library??? + +/** Rotation matrix used by SOA (as in GPU) + */ + +template +class SOARotation { +public: + constexpr inline SOARotation() {} + + constexpr inline explicit SOARotation(T) : R11(1), R12(0), R13(0), R21(0), R22(1), R23(0), R31(0), R32(0), R33(1) {} + + constexpr inline SOARotation(T xx, T xy, T xz, T yx, T yy, T yz, T zx, T zy, T zz) + : R11(xx), R12(xy), R13(xz), R21(yx), R22(yy), R23(yz), R31(zx), R32(zy), R33(zz) {} + + constexpr inline SOARotation(const T *p) + : R11(p[0]), R12(p[1]), R13(p[2]), R21(p[3]), R22(p[4]), R23(p[5]), R31(p[6]), R32(p[7]), R33(p[8]) {} + + template + constexpr inline SOARotation(const TkRotation &a) + : R11(a.xx()), + R12(a.xy()), + R13(a.xz()), + R21(a.yx()), + R22(a.yy()), + R23(a.yz()), + R31(a.zx()), + R32(a.zy()), + R33(a.zz()) {} + + constexpr inline SOARotation transposed() const { return SOARotation(R11, R21, R31, R12, R22, R32, R13, R23, R33); } + + // if frame this is to local + constexpr inline void multiply(T const vx, T const vy, T const vz, T &ux, T &uy, T &uz) const { + ux = R11 * vx + R12 * vy + R13 * vz; + uy = R21 * vx + R22 * vy + R23 * vz; + uz = R31 * vx + R32 * vy + R33 * vz; + } + + // if frame this is to global + constexpr inline void multiplyInverse(T const vx, T const vy, T const vz, T &ux, T &uy, T &uz) const { + ux = R11 * vx + R21 * vy + R31 * vz; + uy = R12 * vx + R22 * vy + R32 * vz; + uz = R13 * vx + R23 * vy + R33 * vz; + } + + // if frame this is to global + constexpr inline void multiplyInverse(T const vx, T const vy, T &ux, T &uy, T &uz) const { + ux = R11 * vx + R21 * vy; + uy = R12 * vx + R22 * vy; + uz = R13 * vx + R23 * vy; + } + + constexpr inline T const &xx() const { return R11; } + constexpr inline T const &xy() const { return R12; } + constexpr inline T const &xz() const { return R13; } + constexpr inline T const &yx() const { return R21; } + constexpr inline T const &yy() const { return R22; } + constexpr inline T const &yz() const { return R23; } + constexpr inline T const &zx() const { return R31; } + constexpr inline T const &zy() const { return R32; } + constexpr inline T const &zz() const { return R33; } + +private: + T R11, R12, R13; + T R21, R22, R23; + T R31, R32, R33; +}; + +template +class SOAFrame { +public: + constexpr inline SOAFrame() {} + + constexpr inline SOAFrame(T ix, T iy, T iz, SOARotation const &irot) : px(ix), py(iy), pz(iz), rot(irot) {} + + constexpr inline SOARotation const &rotation() const { return rot; } + + constexpr inline void toLocal(T const vx, T const vy, T const vz, T &ux, T &uy, T &uz) const { + rot.multiply(vx - px, vy - py, vz - pz, ux, uy, uz); + } + + constexpr inline void toGlobal(T const vx, T const vy, T const vz, T &ux, T &uy, T &uz) const { + rot.multiplyInverse(vx, vy, vz, ux, uy, uz); + ux += px; + uy += py; + uz += pz; + } + + constexpr inline void toGlobal(T const vx, T const vy, T &ux, T &uy, T &uz) const { + rot.multiplyInverse(vx, vy, ux, uy, uz); + ux 
+= px; + uy += py; + uz += pz; + } + + constexpr inline void toGlobal(T cxx, T cxy, T cyy, T *gl) const { + auto const &r = rot; + gl[0] = r.xx() * (r.xx() * cxx + r.yx() * cxy) + r.yx() * (r.xx() * cxy + r.yx() * cyy); + gl[1] = r.xx() * (r.xy() * cxx + r.yy() * cxy) + r.yx() * (r.xy() * cxy + r.yy() * cyy); + gl[2] = r.xy() * (r.xy() * cxx + r.yy() * cxy) + r.yy() * (r.xy() * cxy + r.yy() * cyy); + gl[3] = r.xx() * (r.xz() * cxx + r.yz() * cxy) + r.yx() * (r.xz() * cxy + r.yz() * cyy); + gl[4] = r.xy() * (r.xz() * cxx + r.yz() * cxy) + r.yy() * (r.xz() * cxy + r.yz() * cyy); + gl[5] = r.xz() * (r.xz() * cxx + r.yz() * cxy) + r.yz() * (r.xz() * cxy + r.yz() * cyy); + } + + constexpr inline void toLocal(T const *ge, T &lxx, T &lxy, T &lyy) const { + auto const &r = rot; + + T cxx = ge[0]; + T cyx = ge[1]; + T cyy = ge[2]; + T czx = ge[3]; + T czy = ge[4]; + T czz = ge[5]; + + lxx = r.xx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.xy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.xz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lxy = r.yx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.yy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.yz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lyy = r.yx() * (r.yx() * cxx + r.yy() * cyx + r.yz() * czx) + + r.yy() * (r.yx() * cyx + r.yy() * cyy + r.yz() * czy) + r.yz() * (r.yx() * czx + r.yy() * czy + r.yz() * czz); + } + + constexpr inline T x() const { return px; } + constexpr inline T y() const { return py; } + constexpr inline T z() const { return pz; } + +private: + T px, py, pz; + SOARotation rot; +}; + +#endif // DataFormats_GeometrySurface_SOARotation_h diff --git a/src/cudacompat/DataFormats/SiPixelDigisSoA.cc b/src/cudacompat/DataFormats/SiPixelDigisSoA.cc new file mode 100644 index 000000000..600d79b02 --- /dev/null +++ b/src/cudacompat/DataFormats/SiPixelDigisSoA.cc @@ -0,0 +1,12 @@ +#include "DataFormats/SiPixelDigisSoA.h" + +#include + +SiPixelDigisSoA::SiPixelDigisSoA( + size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus) + : pdigi_(pdigi, pdigi + nDigis), + rawIdArr_(rawIdArr, rawIdArr + nDigis), + adc_(adc, adc + nDigis), + clus_(clus, clus + nDigis) { + assert(pdigi_.size() == nDigis); +} diff --git a/src/cudacompat/DataFormats/SiPixelDigisSoA.h b/src/cudacompat/DataFormats/SiPixelDigisSoA.h new file mode 100644 index 000000000..50e863f03 --- /dev/null +++ b/src/cudacompat/DataFormats/SiPixelDigisSoA.h @@ -0,0 +1,33 @@ +#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h +#define DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h + +#include +#include + +class SiPixelDigisSoA { +public: + SiPixelDigisSoA() = default; + explicit SiPixelDigisSoA( + size_t nDigis, const uint32_t* pdigi, const uint32_t* rawIdArr, const uint16_t* adc, const int32_t* clus); + ~SiPixelDigisSoA() = default; + + auto size() const { return pdigi_.size(); } + + uint32_t pdigi(size_t i) const { return pdigi_[i]; } + uint32_t rawIdArr(size_t i) const { return rawIdArr_[i]; } + uint16_t adc(size_t i) const { return adc_[i]; } + int32_t clus(size_t i) const { return clus_[i]; } + + const std::vector& pdigiVector() const { return pdigi_; } + const std::vector& rawIdArrVector() const { return rawIdArr_; } + const std::vector& adcVector() const { return adc_; } + const std::vector& clusVector() const { return clus_; } + +private: + std::vector pdigi_; + std::vector rawIdArr_; + std::vector adc_; + std::vector clus_; +}; + +#endif diff --git 
a/src/cudacompat/DataFormats/SiPixelRawDataError.cc b/src/cudacompat/DataFormats/SiPixelRawDataError.cc new file mode 100644 index 000000000..3e2cbf47e --- /dev/null +++ b/src/cudacompat/DataFormats/SiPixelRawDataError.cc @@ -0,0 +1,101 @@ +#include "DataFormats/SiPixelRawDataError.h" + +//--------------------------------------------------------------------------- +//! \class SiPixelRawDataError +//! \brief Pixel error -- collection of errors and error information +//! +//! Class to contain and store all information about errors +//! +//! +//! \author Andrew York, University of Tennessee +//--------------------------------------------------------------------------- + +//Constructors + +SiPixelRawDataError::SiPixelRawDataError() {} + +SiPixelRawDataError::SiPixelRawDataError(uint32_t errorWord32, const int errorType, int fedId) + : errorWord32_(errorWord32), errorType_(errorType), fedId_(fedId) { + setMessage(); +} + +SiPixelRawDataError::SiPixelRawDataError(uint64_t errorWord64, const int errorType, int fedId) + : errorWord64_(errorWord64), errorType_(errorType), fedId_(fedId) { + setMessage(); +} + +//Destructor + +SiPixelRawDataError::~SiPixelRawDataError() {} + +//functions to get error words and types + +void SiPixelRawDataError::setWord32(uint32_t errorWord32) { errorWord32_ = errorWord32; } + +void SiPixelRawDataError::setWord64(uint64_t errorWord64) { errorWord64_ = errorWord64; } + +void SiPixelRawDataError::setType(int errorType) { + errorType_ = errorType; + setMessage(); +} + +void SiPixelRawDataError::setFedId(int fedId) { fedId_ = fedId; } + +void SiPixelRawDataError::setMessage() { + switch (errorType_) { + case (25): { + errorMessage_ = "Error: Disabled FED channel (ROC=25)"; + break; + } + case (26): { + errorMessage_ = "Error: Gap word"; + break; + } + case (27): { + errorMessage_ = "Error: Dummy word"; + break; + } + case (28): { + errorMessage_ = "Error: FIFO nearly full"; + break; + } + case (29): { + errorMessage_ = "Error: Timeout"; + break; + } + case (30): { + errorMessage_ = "Error: Trailer"; + break; + } + case (31): { + errorMessage_ = "Error: Event number mismatch"; + break; + } + case (32): { + errorMessage_ = "Error: Invalid or missing header"; + break; + } + case (33): { + errorMessage_ = "Error: Invalid or missing trailer"; + break; + } + case (34): { + errorMessage_ = "Error: Size mismatch"; + break; + } + case (35): { + errorMessage_ = "Error: Invalid channel"; + break; + } + case (36): { + errorMessage_ = "Error: Invalid ROC number"; + break; + } + case (37): { + errorMessage_ = "Error: Invalid dcol/pixel address"; + break; + } + default: + errorMessage_ = "Error: Unknown error type"; + }; +} diff --git a/src/cudacompat/DataFormats/SiPixelRawDataError.h b/src/cudacompat/DataFormats/SiPixelRawDataError.h new file mode 100644 index 000000000..49783a6a9 --- /dev/null +++ b/src/cudacompat/DataFormats/SiPixelRawDataError.h @@ -0,0 +1,57 @@ +#ifndef DataFormats_SiPixelRawDataError_h +#define DataFormats_SiPixelRawDataError_h + +//--------------------------------------------------------------------------- +//! \class SiPixelRawDataError +//! \brief Pixel error -- collection of errors and error information +//! +//! Class to contain and store all information about errors +//! +//! +//! 
\author Andrew York, University of Tennessee +//--------------------------------------------------------------------------- + +#include +#include + +class SiPixelRawDataError { +public: + /// Default constructor + SiPixelRawDataError(); + /// Constructor for 32-bit error word + SiPixelRawDataError(uint32_t errorWord32, const int errorType, int fedId); + /// Constructor with 64-bit error word and type included (header or trailer word) + SiPixelRawDataError(uint64_t errorWord64, const int errorType, int fedId); + /// Destructor + ~SiPixelRawDataError(); + + void setWord32( + uint32_t errorWord32); // function to allow user to input the error word (if 32-bit) after instantiation + void setWord64( + uint64_t errorWord64); // function to allow user to input the error word (if 64-bit) after instantiation + void setType(int errorType); // function to allow user to input the error type after instantiation + void setFedId(int fedId); // function to allow user to input the fedID after instantiation + void setMessage(); // function to create an error message based on errorType + + inline uint32_t getWord32() const { return errorWord32_; } // the 32-bit word that contains the error information + inline uint64_t getWord64() const { return errorWord64_; } // the 64-bit word that contains the error information + inline int getType() const { + return errorType_; + } // the number associated with the error type (26-31 for ROC number errors, 32-33 for calibration errors) + inline int getFedId() const { return fedId_; } // the fedId where the error occured + inline std::string getMessage() const { return errorMessage_; } // the error message to be displayed with the error + +private: + uint32_t errorWord32_; + uint64_t errorWord64_; + int errorType_; + int fedId_; + std::string errorMessage_; +}; + +// Comparison operators +inline bool operator<(const SiPixelRawDataError& one, const SiPixelRawDataError& other) { + return one.getFedId() < other.getFedId(); +} + +#endif diff --git a/src/cudacompat/DataFormats/TrackCount.h b/src/cudacompat/DataFormats/TrackCount.h new file mode 100644 index 000000000..1984387ab --- /dev/null +++ b/src/cudacompat/DataFormats/TrackCount.h @@ -0,0 +1,14 @@ +#ifndef DataFormats_TrackCount_h +#define DataFormats_TrackCount_h + +class TrackCount { +public: + explicit TrackCount(unsigned int n) : tracks_(n) {} + + unsigned int nTracks() const { return tracks_; } + +private: + unsigned int tracks_; +}; + +#endif diff --git a/src/cudacompat/DataFormats/VertexCount.h b/src/cudacompat/DataFormats/VertexCount.h new file mode 100644 index 000000000..f800facb7 --- /dev/null +++ b/src/cudacompat/DataFormats/VertexCount.h @@ -0,0 +1,14 @@ +#ifndef DataFormats_VertexCount_h +#define DataFormats_VertexCount_h + +class VertexCount { +public: + explicit VertexCount(unsigned int n) : vertices_(n) {} + + unsigned int nVertices() const { return vertices_; } + +private: + unsigned int vertices_; +}; + +#endif diff --git a/src/cudacompat/DataFormats/approx_atan2.h b/src/cudacompat/DataFormats/approx_atan2.h new file mode 100644 index 000000000..4508a35cc --- /dev/null +++ b/src/cudacompat/DataFormats/approx_atan2.h @@ -0,0 +1,290 @@ +#ifndef DataFormatsMathAPPROX_ATAN2_H +#define DataFormatsMathAPPROX_ATAN2_H + +/* + * approximate atan2 evaluations + * + * Polynomials were obtained using Sollya scripts (in comments below) + * + * +*/ + +/* +f= atan((1-x)/(1+x))-atan(1); +I=[-1+10^(-4);1.0]; +filename="atan.txt"; +print("") > filename; +for deg from 3 to 11 do begin + p = fpminimax(f, deg,[|1,23...|],I, 
floating, absolute); + display=decimal; + acc=floor(-log2(sup(supnorm(p, f, I, absolute, 2^(-20))))); + print( " // degree = ", deg, + " => absolute accuracy is ", acc, "bits" ) >> filename; + print("template<> constexpr float approx_atan2f_P<", deg, ">(float x){") >> filename; + display=hexadecimal; + print(" return ", horner(p) , ";") >> filename; + print("}") >> filename; +end; +*/ + +#include +#include +#include +#include + +// float + +template +constexpr float approx_atan2f_P(float x); + +// degree = 3 => absolute accuracy is 7 bits +template <> +constexpr float approx_atan2f_P<3>(float x) { + return x * (float(-0xf.8eed2p-4) + x * x * float(0x3.1238p-4)); +} + +// degree = 5 => absolute accuracy is 10 bits +template <> +constexpr float approx_atan2f_P<5>(float x) { + auto z = x * x; + return x * (float(-0xf.ecfc8p-4) + z * (float(0x4.9e79dp-4) + z * float(-0x1.44f924p-4))); +} + +// degree = 7 => absolute accuracy is 13 bits +template <> +constexpr float approx_atan2f_P<7>(float x) { + auto z = x * x; + return x * (float(-0xf.fcc7ap-4) + z * (float(0x5.23886p-4) + z * (float(-0x2.571968p-4) + z * float(0x9.fb05p-8)))); +} + +// degree = 9 => absolute accuracy is 16 bits +template <> +constexpr float approx_atan2f_P<9>(float x) { + auto z = x * x; + return x * (float(-0xf.ff73ep-4) + + z * (float(0x5.48ee1p-4) + + z * (float(-0x2.e1efe8p-4) + z * (float(0x1.5cce54p-4) + z * float(-0x5.56245p-8))))); +} + +// degree = 11 => absolute accuracy is 19 bits +template <> +constexpr float approx_atan2f_P<11>(float x) { + auto z = x * x; + return x * (float(-0xf.ffe82p-4) + + z * (float(0x5.526c8p-4) + + z * (float(-0x3.18bea8p-4) + + z * (float(0x1.dce3bcp-4) + z * (float(-0xd.7a64ap-8) + z * float(0x3.000eap-8)))))); +} + +// degree = 13 => absolute accuracy is 21 bits +template <> +constexpr float approx_atan2f_P<13>(float x) { + auto z = x * x; + return x * (float(-0xf.fffbep-4) + + z * (float(0x5.54adp-4) + + z * (float(-0x3.2b4df8p-4) + + z * (float(0x2.1df79p-4) + + z * (float(-0x1.46081p-4) + z * (float(0x8.99028p-8) + z * float(-0x1.be0bc4p-8))))))); +} + +// degree = 15 => absolute accuracy is 24 bits +template <> +constexpr float approx_atan2f_P<15>(float x) { + auto z = x * x; + return x * (float(-0xf.ffff4p-4) + + z * (float(0x5.552f9p-4 + z * (float(-0x3.30f728p-4) + + z * (float(0x2.39826p-4) + + z * (float(-0x1.8a880cp-4) + + z * (float(0xe.484d6p-8) + + z * (float(-0x5.93d5p-8) + z * float(0x1.0875dcp-8))))))))); +} + +template +constexpr float unsafe_atan2f_impl(float y, float x) { + constexpr float pi4f = 3.1415926535897932384626434 / 4; + constexpr float pi34f = 3.1415926535897932384626434 * 3 / 4; + + auto r = (std::abs(x) - std::abs(y)) / (std::abs(x) + std::abs(y)); + if (x < 0) + r = -r; + + auto angle = (x >= 0) ? pi4f : pi34f; + angle += approx_atan2f_P(r); + + return ((y < 0)) ? -angle : angle; +} + +template +constexpr float unsafe_atan2f(float y, float x) { + return unsafe_atan2f_impl(y, x); +} + +template +constexpr float safe_atan2f(float y, float x) { + return unsafe_atan2f_impl(y, ((y == 0.f) & (x == 0.f)) ? 0.2f : x); + // return (y==0.f)&(x==0.f) ? 0.f : unsafe_atan2f_impl( y, x); +} + +// integer... 
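+// (the integer flavour below maps the full [-pi, pi) range onto the full int32_t range, one unit
+//  being pi / 2^31 radians, consistent with phi2int()/int2phi() at the end of this file;
+//  for example unsafe_atan2i<15>(1.f, 1.f) comes out as 2^29, i.e. pi/4)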
+/* + f= (2^31/pi)*(atan((1-x)/(1+x))-atan(1)); + I=[-1+10^(-4);1.0]; + p = fpminimax(f, [|1,3,5,7,9,11|],[|23...|],I, floating, absolute); + */ + +template +constexpr float approx_atan2i_P(float x); + +// degree = 3 => absolute accuracy is 6*10^6 +template <> +constexpr float approx_atan2i_P<3>(float x) { + auto z = x * x; + return x * (-664694912.f + z * 131209024.f); +} + +// degree = 5 => absolute accuracy is 4*10^5 +template <> +constexpr float approx_atan2i_P<5>(float x) { + auto z = x * x; + return x * (-680392064.f + z * (197338400.f + z * (-54233256.f))); +} + +// degree = 7 => absolute accuracy is 6*10^4 +template <> +constexpr float approx_atan2i_P<7>(float x) { + auto z = x * x; + return x * (-683027840.f + z * (219543904.f + z * (-99981040.f + z * 26649684.f))); +} + +// degree = 9 => absolute accuracy is 8000 +template <> +constexpr float approx_atan2i_P<9>(float x) { + auto z = x * x; + return x * (-683473920.f + z * (225785056.f + z * (-123151184.f + z * (58210592.f + z * (-14249276.f))))); +} + +// degree = 11 => absolute accuracy is 1000 +template <> +constexpr float approx_atan2i_P<11>(float x) { + auto z = x * x; + return x * + (-683549696.f + z * (227369312.f + z * (-132297008.f + z * (79584144.f + z * (-35987016.f + z * 8010488.f))))); +} + +// degree = 13 => absolute accuracy is 163 +template <> +constexpr float approx_atan2i_P<13>(float x) { + auto z = x * x; + return x * (-683562624.f + + z * (227746080.f + + z * (-135400128.f + z * (90460848.f + z * (-54431464.f + z * (22973256.f + z * (-4657049.f))))))); +} + +template <> +constexpr float approx_atan2i_P<15>(float x) { + auto z = x * x; + return x * (-683562624.f + + z * (227746080.f + + z * (-135400128.f + z * (90460848.f + z * (-54431464.f + z * (22973256.f + z * (-4657049.f))))))); +} + +template +constexpr int unsafe_atan2i_impl(float y, float x) { + constexpr long long maxint = (long long)(std::numeric_limits::max()) + 1LL; + constexpr int pi4 = int(maxint / 4LL); + constexpr int pi34 = int(3LL * maxint / 4LL); + + auto r = (std::abs(x) - std::abs(y)) / (std::abs(x) + std::abs(y)); + if (x < 0) + r = -r; + + auto angle = (x >= 0) ? pi4 : pi34; + angle += int(approx_atan2i_P(r)); + // angle += int(std::round(approx_atan2i_P(r))); + + return (y < 0) ? 
-angle : angle; +} + +template +constexpr int unsafe_atan2i(float y, float x) { + return unsafe_atan2i_impl(y, x); +} + +// short (16bits) + +template +constexpr float approx_atan2s_P(float x); + +// degree = 3 => absolute accuracy is 53 +template <> +constexpr float approx_atan2s_P<3>(float x) { + auto z = x * x; + return x * ((-10142.439453125f) + z * 2002.0908203125f); +} +// degree = 5 => absolute accuracy is 7 +template <> +constexpr float approx_atan2s_P<5>(float x) { + auto z = x * x; + return x * ((-10381.9609375f) + z * ((3011.1513671875f) + z * (-827.538330078125f))); +} +// degree = 7 => absolute accuracy is 2 +template <> +constexpr float approx_atan2s_P<7>(float x) { + auto z = x * x; + return x * ((-10422.177734375f) + z * (3349.97412109375f + z * ((-1525.589599609375f) + z * 406.64190673828125f))); +} +// degree = 9 => absolute accuracy is 1 +template <> +constexpr float approx_atan2s_P<9>(float x) { + auto z = x * x; + return x * ((-10428.984375f) + z * (3445.20654296875f + z * ((-1879.137939453125f) + + z * (888.22314453125f + z * (-217.42669677734375f))))); +} + +template +constexpr short unsafe_atan2s_impl(float y, float x) { + constexpr int maxshort = (int)(std::numeric_limits::max()) + 1; + constexpr short pi4 = short(maxshort / 4); + constexpr short pi34 = short(3 * maxshort / 4); + + auto r = (std::abs(x) - std::abs(y)) / (std::abs(x) + std::abs(y)); + if (x < 0) + r = -r; + + auto angle = (x >= 0) ? pi4 : pi34; + angle += short(approx_atan2s_P(r)); + + return (y < 0) ? -angle : angle; +} + +template +constexpr short unsafe_atan2s(float y, float x) { + return unsafe_atan2s_impl(y, x); +} + +constexpr int phi2int(float x) { + constexpr float p2i = ((long long)(std::numeric_limits::max()) + 1LL) / M_PI; + return std::round(x * p2i); +} + +constexpr float int2phi(int x) { + constexpr float i2p = M_PI / ((long long)(std::numeric_limits::max()) + 1LL); + return float(x) * i2p; +} + +constexpr double int2dphi(int x) { + constexpr double i2p = M_PI / ((long long)(std::numeric_limits::max()) + 1LL); + return x * i2p; +} + +constexpr short phi2short(float x) { + constexpr float p2i = ((int)(std::numeric_limits::max()) + 1) / M_PI; + return std::round(x * p2i); +} + +constexpr float short2phi(short x) { + constexpr float i2p = M_PI / ((int)(std::numeric_limits::max()) + 1); + return float(x) * i2p; +} + +#endif diff --git a/src/cudacompat/DataFormats/fed_header.h b/src/cudacompat/DataFormats/fed_header.h new file mode 100644 index 000000000..073ce14e1 --- /dev/null +++ b/src/cudacompat/DataFormats/fed_header.h @@ -0,0 +1,66 @@ +#ifndef DataFormats_FEDRawData_fed_header_h +#define DataFormats_FEDRawData_fed_header_h + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/************************************************************************* + * + * data structures and associated typedefs + * + *************************************************************************/ + +/* + * FED header - in front of each FED block + */ + +typedef struct fedh_struct { + uint32_t sourceid; + uint32_t eventid; +} fedh_t; + +#define FED_SLINK_START_MARKER 0x5 + +#define FED_HCTRLID_WIDTH 0x0000000f +#define FED_HCTRLID_SHIFT 28 +#define FED_HCTRLID_MASK (FED_HCTRLID_WIDTH << FED_HCTRLID_SHIFT) +#define FED_HCTRLID_EXTRACT(a) (((a) >> FED_HCTRLID_SHIFT) & FED_HCTRLID_WIDTH) + +#define FED_EVTY_WIDTH 0x0000000f +#define FED_EVTY_SHIFT 24 +#define FED_EVTY_MASK (FED_EVTY_WIDTH << FED_EVTY_SHIFT) +#define FED_EVTY_EXTRACT(a) (((a) >> FED_EVTY_SHIFT) & FED_EVTY_WIDTH) + +#define FED_LVL1_WIDTH 
0x00ffffff +#define FED_LVL1_SHIFT 0 +#define FED_LVL1_MASK (FED_LVL1_WIDTH << FED_LVL1_SHIFT) +#define FED_LVL1_EXTRACT(a) (((a) >> FED_LVL1_SHIFT) & FED_LVL1_WIDTH) + +#define FED_BXID_WIDTH 0x00000fff +#define FED_BXID_SHIFT 20 +#define FED_BXID_MASK (FED_BXID_WIDTH << FED_BXID_SHIFT) +#define FED_BXID_EXTRACT(a) (((a) >> FED_BXID_SHIFT) & FED_BXID_WIDTH) + +#define FED_SOID_WIDTH 0x00000fff +#define FED_SOID_SHIFT 8 +#define FED_SOID_MASK (FED_SOID_WIDTH << FED_SOID_SHIFT) +#define FED_SOID_EXTRACT(a) (((a) >> FED_SOID_SHIFT) & FED_SOID_WIDTH) + +#define FED_VERSION_WIDTH 0x0000000f +#define FED_VERSION_SHIFT 4 +#define FED_VERSION_MASK (FED_VERSION_WIDTH << FED_VERSION_SHIFT) +#define FED_VERSION_EXTRACT(a) (((a) >> FED_VERSION_SHIFT) & FED_VERSION_WIDTH) + +#define FED_MORE_HEADERS_WIDTH 0x00000001 +#define FED_MORE_HEADERS_SHIFT 3 +#define FED_MORE_HEADERS_MASK (FED_MORE_HEADERS_WIDTH << FED_MORE_HEADERS_SHIFT) +#define FED_MORE_HEADERS_EXTRACT(a) (((a) >> FED_MORE_HEADERS_SHIFT) & FED_MORE_HEADERS_WIDTH) + +#ifdef __cplusplus +} +#endif + +#endif // DataFormats_FEDRawData_fed_header_h diff --git a/src/cudacompat/DataFormats/fed_trailer.h b/src/cudacompat/DataFormats/fed_trailer.h new file mode 100644 index 000000000..d1f9e70ce --- /dev/null +++ b/src/cudacompat/DataFormats/fed_trailer.h @@ -0,0 +1,76 @@ +#ifndef DataFormats_FEDRawData_fed_trailer_h +#define DataFormats_FEDRawData_fed_trailer_h + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/************************************************************************* + * + * data structures and associated typedefs + * + *************************************************************************/ + +/* + * FED trailer - at the end of each FED block + */ + +typedef struct fedt_struct { + uint32_t conscheck; + uint32_t eventsize; +} fedt_t; + +#define FED_SLINK_END_MARKER 0xa + +#define FED_TCTRLID_WIDTH 0x0000000f +#define FED_TCTRLID_SHIFT 28 +#define FED_TCTRLID_MASK (FED_TCTRLID_WIDTH << FED_TCTRLID_SHIFT) +#define FED_TCTRLID_EXTRACT(a) (((a) >> FED_TCTRLID_SHIFT) & FED_TCTRLID_WIDTH) + +#define FED_EVSZ_WIDTH 0x00ffffff +#define FED_EVSZ_SHIFT 0 +#define FED_EVSZ_MASK (FED_EVSZ_WIDTH << FED_EVSZ_SHIFT) +#define FED_EVSZ_EXTRACT(a) (((a) >> FED_EVSZ_SHIFT) & FED_EVSZ_WIDTH) + +#define FED_CRCS_WIDTH 0x0000ffff +#define FED_CRCS_SHIFT 16 +#define FED_CRCS_MASK (FED_CRCS_WIDTH << FED_CRCS_SHIFT) +#define FED_CRCS_EXTRACT(a) (((a) >> FED_CRCS_SHIFT) & FED_CRCS_WIDTH) + +#define FED_STAT_WIDTH 0x0000000f +#define FED_STAT_SHIFT 8 +#define FED_STAT_MASK (FED_STAT_WIDTH << FED_STAT_SHIFT) +#define FED_STAT_EXTRACT(a) (((a) >> FED_STAT_SHIFT) & FED_STAT_WIDTH) + +#define FED_TTSI_WIDTH 0x0000000f +#define FED_TTSI_SHIFT 4 +#define FED_TTSI_MASK (FED_TTSI_WIDTH << FED_TTSI_SHIFT) +#define FED_TTSI_EXTRACT(a) (((a) >> FED_TTSI_SHIFT) & FED_TTSI_WIDTH) + +#define FED_MORE_TRAILERS_WIDTH 0x00000001 +#define FED_MORE_TRAILERS_SHIFT 3 +#define FED_MORE_TRAILERS_MASK (FED_MORE_TRAILERS_WIDTH << FED_MORE_TRAILERS_SHIFT) +#define FED_MORE_TRAILERS_EXTRACT(a) (((a) >> FED_MORE_TRAILERS_SHIFT) & FED_MORE_TRAILERS_WIDTH) + +#define FED_CRC_MODIFIED_WIDTH 0x00000001 +#define FED_CRC_MODIFIED_SHIFT 2 +#define FED_CRC_MODIFIED_MASK (FED_CRC_MODIFIED_WIDTH << FED_CRC_MODIFIED_SHIFT) +#define FED_CRC_MODIFIED_EXTRACT(a) (((a) >> FED_CRC_MODIFIED_SHIFT) & FED_CRC_MODIFIED_WIDTH) + +#define FED_SLINK_ERROR_WIDTH 0x00000001 +#define FED_SLINK_ERROR_SHIFT 14 +#define FED_SLINK_ERROR_MASK (FED_SLINK_ERROR_WIDTH << FED_SLINK_ERROR_SHIFT) +#define 
FED_SLINK_ERROR_EXTRACT(a) (((a) >> FED_SLINK_ERROR_SHIFT) & FED_SLINK_ERROR_WIDTH) + +#define FED_WRONG_FEDID_WIDTH 0x00000001 +#define FED_WRONG_FEDID_SHIFT 15 +#define FED_WRONG_FEDID_MASK (FED_WRONG_FEDID_WIDTH << FED_WRONG_FEDID_SHIFT) +#define FED_WRONG_FEDID_EXTRACT(a) (((a) >> FED_WRONG_FEDID_SHIFT) & FED_WRONG_FEDID_WIDTH) + +#ifdef __cplusplus +} +#endif + +#endif // DataFormats_FEDRawData_fed_trailer_h diff --git a/src/cudacompat/Framework/EDGetToken.h b/src/cudacompat/Framework/EDGetToken.h new file mode 100644 index 000000000..7d64df7a6 --- /dev/null +++ b/src/cudacompat/Framework/EDGetToken.h @@ -0,0 +1,85 @@ +#ifndef FWCore_Utilities_EDGetToken_h +#define FWCore_Utilities_EDGetToken_h +// -*- C++ -*- +// +// Package: FWCore/Utilities +// Class : EDGetToken +// +/**\class EDGetToken EDGetToken.h "FWCore/Utilities/interface/EDGetToken.h" + + Description: A Token used to get data from the EDM + + Usage: + A EDGetToken is created by calls to 'consumes' or 'mayConsume' from an EDM module. + The EDGetToken can then be used to quickly retrieve data from the edm::Event, edm::LuminosityBlock or edm::Run. + +The templated form, EDGetTokenT, is the same as EDGetToken except when used to get data the framework + will skip checking that the type being requested matches the type specified during the 'consumes' or 'mayConsume' call. + +*/ +// +// Original Author: Chris Jones +// Created: Wed, 03 Apr 2013 17:54:11 GMT +// + +// system include files + +// user include files + +// forward declarations +namespace edm { + template + class EDGetTokenT; + class ProductRegistry; + + class EDGetToken { + friend class ProductRegistry; + + public: + EDGetToken() : m_value{s_uninitializedValue} {} + + template + EDGetToken(EDGetTokenT iOther) : m_value{iOther.m_value} {} + + // ---------- const member functions --------------------- + unsigned int index() const { return m_value; } + bool isUninitialized() const { return m_value == s_uninitializedValue; } + + private: + //for testing + friend class TestEDGetToken; + + static const unsigned int s_uninitializedValue = 0xFFFFFFFF; + + explicit EDGetToken(unsigned int iValue) : m_value(iValue) {} + + // ---------- member data -------------------------------- + unsigned int m_value; + }; + + template + class EDGetTokenT { + friend class ProductRegistry; + friend class EDGetToken; + + public: + EDGetTokenT() : m_value{s_uninitializedValue} {} + + // ---------- const member functions --------------------- + unsigned int index() const { return m_value; } + bool isUninitialized() const { return m_value == s_uninitializedValue; } + + private: + //for testing + friend class TestEDGetToken; + + static const unsigned int s_uninitializedValue = 0xFFFFFFFF; + + explicit EDGetTokenT(unsigned int iValue) : m_value(iValue) {} + + // ---------- member data -------------------------------- + unsigned int m_value; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/EDProducer.h b/src/cudacompat/Framework/EDProducer.h new file mode 100644 index 000000000..8160b8c96 --- /dev/null +++ b/src/cudacompat/Framework/EDProducer.h @@ -0,0 +1,53 @@ +#ifndef EDProducerBase_h +#define EDProducerBase_h + +#include "Framework/WaitingTaskWithArenaHolder.h" + +namespace edm { + class Event; + class EventSetup; + + class EDProducer { + public: + EDProducer() = default; + virtual ~EDProducer() = default; + + bool hasAcquire() const { return false; } + + void doAcquire(Event const& event, EventSetup const& eventSetup, WaitingTaskWithArenaHolder holder) {} + + void 
doProduce(Event& event, EventSetup const& eventSetup) { produce(event, eventSetup); } + + virtual void produce(Event& event, EventSetup const& eventSetup) = 0; + + void doEndJob() { endJob(); } + + virtual void endJob() {} + + private: + }; + + class EDProducerExternalWork { + public: + EDProducerExternalWork() = default; + virtual ~EDProducerExternalWork() = default; + + bool hasAcquire() const { return true; } + + void doAcquire(Event const& event, EventSetup const& eventSetup, WaitingTaskWithArenaHolder holder) { + acquire(event, eventSetup, std::move(holder)); + } + + void doProduce(Event& event, EventSetup const& eventSetup) { produce(event, eventSetup); } + + virtual void acquire(Event const& event, EventSetup const& eventSetup, WaitingTaskWithArenaHolder holder) = 0; + virtual void produce(Event& event, EventSetup const& eventSetup) = 0; + + void doEndJob() { endJob(); } + virtual void endJob() {} + + private: + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/EDPutToken.h b/src/cudacompat/Framework/EDPutToken.h new file mode 100644 index 000000000..8aab86f0f --- /dev/null +++ b/src/cudacompat/Framework/EDPutToken.h @@ -0,0 +1,89 @@ +#ifndef FWCore_Utilities_EDPutToken_h +#define FWCore_Utilities_EDPutToken_h +// -*- C++ -*- +// +// Package: FWCore/Utilities +// Class : EDPutToken +// +/**\class EDPutToken EDPutToken.h "FWCore/Utilities/interface/EDPutToken.h" + + Description: A Token used to put data into the EDM + + Usage: + A EDPutToken is created by calls to 'produces'from an EDProducer or EDFilter. + The EDPutToken can then be used to quickly put data into the edm::Event, edm::LuminosityBlock or edm::Run. + +The templated form, EDPutTokenT, is the same as EDPutToken except when used to get data the framework + will skip checking that the type being requested matches the type specified during the 'produces'' call. 
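   As an editorial illustration (not in the original header), the typical flow in this
   mini-framework is:
   \code
   // at module construction, 'reg' being the ProductRegistry
   edm::EDPutTokenT<MyProduct> putToken = reg.produces<MyProduct>();
   // later, in produce(), construct the product directly in the Event
   event.emplace(putToken, /* MyProduct constructor arguments */);
   \endcode
   where MyProduct is a placeholder product type.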
+ +*/ +// +// Original Author: Chris Jones +// Created: Mon, 18 Sep 2017 17:54:11 GMT +// + +// system include files + +// user include files + +// forward declarations +namespace edm { + template + class EDPutTokenT; + class ProductRegistry; + + class EDPutToken { + friend class ProductRegistry; + + public: + using value_type = unsigned int; + + EDPutToken() : m_value{s_uninitializedValue} {} + + template + EDPutToken(EDPutTokenT iOther) : m_value{iOther.m_value} {} + + // ---------- const member functions --------------------- + value_type index() const { return m_value; } + bool isUninitialized() const { return m_value == s_uninitializedValue; } + + private: + //for testing + friend class TestEDPutToken; + + static const unsigned int s_uninitializedValue = 0xFFFFFFFF; + + explicit EDPutToken(unsigned int iValue) : m_value(iValue) {} + + // ---------- member data -------------------------------- + value_type m_value; + }; + + template + class EDPutTokenT { + friend class ProductRegistry; + friend class EDPutToken; + + public: + using value_type = EDPutToken::value_type; + + EDPutTokenT() : m_value{s_uninitializedValue} {} + + // ---------- const member functions --------------------- + value_type index() const { return m_value; } + bool isUninitialized() const { return m_value == s_uninitializedValue; } + + private: + //for testing + friend class TestEDPutToken; + + static const unsigned int s_uninitializedValue = 0xFFFFFFFF; + + explicit EDPutTokenT(unsigned int iValue) : m_value(iValue) {} + + // ---------- member data -------------------------------- + value_type m_value; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/ESPluginFactory.cc b/src/cudacompat/Framework/ESPluginFactory.cc new file mode 100644 index 000000000..7289dacf2 --- /dev/null +++ b/src/cudacompat/Framework/ESPluginFactory.cc @@ -0,0 +1,34 @@ +#include "ESPluginFactory.h" + +#include + +namespace edm { + namespace ESPluginFactory { + namespace impl { + void Registry::add(std::string const& name, std::unique_ptr maker) { + auto found = pluginRegistry_.find(name); + if (found != pluginRegistry_.end()) { + throw std::logic_error("Plugin " + name + " is already registered"); + } + pluginRegistry_.emplace(name, std::move(maker)); + } + + MakerBase const* Registry::get(std::string const& name) { + auto found = pluginRegistry_.find(name); + if (found == pluginRegistry_.end()) { + throw std::logic_error("Plugin " + name + " is not registered"); + } + return found->second.get(); + } + + Registry& getGlobalRegistry() { + static Registry reg; + return reg; + } + }; // namespace impl + + std::unique_ptr create(std::string const& name, std::filesystem::path const& datadir) { + return impl::getGlobalRegistry().get(name)->create(datadir); + } + } // namespace ESPluginFactory +} // namespace edm diff --git a/src/cudacompat/Framework/ESPluginFactory.h b/src/cudacompat/Framework/ESPluginFactory.h new file mode 100644 index 000000000..6c32ff230 --- /dev/null +++ b/src/cudacompat/Framework/ESPluginFactory.h @@ -0,0 +1,60 @@ +#ifndef PluginFactory_h +#define PluginFactory_h + +#include +#include +#include +#include + +#include "Framework/ESProducer.h" + +class ProductRegistry; + +// Nothing here is thread safe +namespace edm { + namespace ESPluginFactory { + namespace impl { + class MakerBase { + public: + virtual ~MakerBase() = default; + + virtual std::unique_ptr create(std::filesystem::path const& datadir) const = 0; + }; + + template + class Maker : public MakerBase { + public: + virtual std::unique_ptr 
create(std::filesystem::path const& datadir) const override { + return std::make_unique(datadir); + }; + }; + + class Registry { + public: + void add(std::string const& name, std::unique_ptr maker); + MakerBase const* get(std::string const& name); + + private: + std::unordered_map> pluginRegistry_; + }; + + Registry& getGlobalRegistry(); + + template + class Registrar { + public: + Registrar(std::string const& name) { getGlobalRegistry().add(name, std::make_unique>()); } + }; + } // namespace impl + + std::unique_ptr create(std::string const& name, std::filesystem::path const& datadir); + } // namespace ESPluginFactory +} // namespace edm + +#define EDM_ES_PLUGIN_SYM(x, y) EDM_ES_PLUGIN_SYM2(x, y) +#define EDM_ES_PLUGIN_SYM2(x, y) x##y + +#define DEFINE_FWK_EVENTSETUP_MODULE(type) \ + static edm::ESPluginFactory::impl::Registrar EDM_ES_PLUGIN_SYM(maker, __LINE__)(#type); + +#endif diff --git a/src/cudacompat/Framework/ESProducer.h b/src/cudacompat/Framework/ESProducer.h new file mode 100644 index 000000000..3ce5f83da --- /dev/null +++ b/src/cudacompat/Framework/ESProducer.h @@ -0,0 +1,16 @@ +#ifndef ESProducer_h +#define ESProducer_h + +namespace edm { + class EventSetup; + + class ESProducer { + public: + ESProducer() = default; + virtual ~ESProducer() = default; + + virtual void produce(EventSetup& eventSetup) = 0; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/EmptyWaitingTask.h b/src/cudacompat/Framework/EmptyWaitingTask.h new file mode 100644 index 000000000..c8277e60d --- /dev/null +++ b/src/cudacompat/Framework/EmptyWaitingTask.h @@ -0,0 +1,27 @@ +#ifndef EmptyWaitingTask_h +#define EmptyWaitingTask_h + +// from FWCore/Concurrency/interface/WaitingTaskList.h +#include "Framework/WaitingTask.h" + +namespace edm { + class EmptyWaitingTask : public WaitingTask { + public: + EmptyWaitingTask() = default; + + tbb::task* execute() override { return nullptr; } + }; + + namespace waitingtask { + struct TaskDestroyer { + void operator()(tbb::task* iTask) const { tbb::task::destroy(*iTask); } + }; + } // namespace waitingtask + ///Create an EmptyWaitingTask which will properly be destroyed + inline std::unique_ptr make_empty_waiting_task() { + return std::unique_ptr(new (tbb::task::allocate_root()) + edm::EmptyWaitingTask{}); + } +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/Event.h b/src/cudacompat/Framework/Event.h new file mode 100644 index 000000000..9f952c492 --- /dev/null +++ b/src/cudacompat/Framework/Event.h @@ -0,0 +1,56 @@ +#ifndef Event_h +#define Event_h + +#include +#include +#include + +#include "Framework/ProductRegistry.h" + +// type erasure +namespace edm { + using StreamID = int; + + class WrapperBase { + public: + virtual ~WrapperBase() = default; + }; + + template + class Wrapper : public WrapperBase { + public: + template + explicit Wrapper(Args&&... args) : obj_{std::forward(args)...} {} + + T const& product() const { return obj_; } + + private: + T obj_; + }; + + class Event { + public: + explicit Event(int streamId, int eventId, ProductRegistry const& reg) + : streamId_(streamId), eventId_(eventId), products_(reg.size()) {} + + StreamID streamID() const { return streamId_; } + int eventID() const { return eventId_; } + + template + T const& get(EDGetTokenT const& token) const { + return static_cast const&>(*products_[token.index()]).product(); + } + + template + void emplace(EDPutTokenT const& token, Args&&... 
args) { + products_[token.index()] = std::make_unique>(std::forward(args)...); + } + + private: + StreamID streamId_; + int eventId_; + std::vector> products_; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/EventSetup.h b/src/cudacompat/Framework/EventSetup.h new file mode 100644 index 000000000..c5a6c0e68 --- /dev/null +++ b/src/cudacompat/Framework/EventSetup.h @@ -0,0 +1,56 @@ +#ifndef EventSetup_h +#define EventSetup_h + +#include +#include +#include + +#include + +namespace edm { + // This is very different from CMSSW, but (hopefully) good-enough + // for this test + class ESWrapperBase { + public: + virtual ~ESWrapperBase() = default; + }; + + template + class ESWrapper : public ESWrapperBase { + public: + explicit ESWrapper(std::unique_ptr obj) : obj_{std::move(obj)} {} + + T const& product() const { return *obj_; } + + private: + std::unique_ptr obj_; + }; + + class EventSetup { + public: + explicit EventSetup() {} + + template + void put(std::unique_ptr prod) { + auto succeeded = + typeToProduct_.try_emplace(std::type_index(typeid(T)), std::make_unique>(std::move(prod))); + if (not succeeded.second) { + throw std::runtime_error(std::string("Product of type ") + typeid(T).name() + " already exists"); + } + } + + template + T const& get() const { + const auto found = typeToProduct_.find(std::type_index(typeid(T))); + if (found == typeToProduct_.end()) { + throw std::runtime_error(std::string("Product of type ") + typeid(T).name() + " is not produced"); + } + return static_cast const&>(*(found->second)).product(); + } + + private: + std::unordered_map> typeToProduct_; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/FunctorTask.h b/src/cudacompat/Framework/FunctorTask.h new file mode 100644 index 000000000..bee041d43 --- /dev/null +++ b/src/cudacompat/Framework/FunctorTask.h @@ -0,0 +1,52 @@ +#ifndef FWCore_Concurrency_FunctorTask_h +#define FWCore_Concurrency_FunctorTask_h +// -*- C++ -*- +// +// Package: Concurrency +// Class : FunctorTask +// +/**\class FunctorTask FunctorTask.h FWCore/Concurrency/interface/FunctorTask.h + + Description: Builds a tbb::task from a lambda. 
+ + Usage: + +*/ +// +// Original Author: Chris Jones +// Created: Thu Feb 21 13:46:31 CST 2013 +// $Id$ +// + +// system include files +#include +#include +#include +#include "tbb/task.h" + +// user include files + +// forward declarations + +namespace edm { + template + class FunctorTask : public tbb::task { + public: + explicit FunctorTask(F f) : func_(std::move(f)) {} + + task* execute() override { + func_(); + return nullptr; + }; + + private: + F func_; + }; + + template + FunctorTask* make_functor_task(ALLOC&& iAlloc, F f) { + return new (iAlloc) FunctorTask(std::move(f)); + } +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/PluginFactory.cc b/src/cudacompat/Framework/PluginFactory.cc new file mode 100644 index 000000000..6e8a33eb6 --- /dev/null +++ b/src/cudacompat/Framework/PluginFactory.cc @@ -0,0 +1,34 @@ +#include "PluginFactory.h" + +#include + +namespace edm { + namespace PluginFactory { + namespace impl { + void Registry::add(std::string const& name, std::unique_ptr maker) { + auto found = pluginRegistry_.find(name); + if (found != pluginRegistry_.end()) { + throw std::logic_error("Plugin " + name + " is already registered"); + } + pluginRegistry_.emplace(name, std::move(maker)); + } + + MakerBase const* Registry::get(std::string const& name) { + auto found = pluginRegistry_.find(name); + if (found == pluginRegistry_.end()) { + throw std::logic_error("Plugin " + name + " is not registered"); + } + return found->second.get(); + } + + Registry& getGlobalRegistry() { + static Registry reg; + return reg; + } + }; // namespace impl + + std::unique_ptr create(std::string const& name, ProductRegistry& reg) { + return impl::getGlobalRegistry().get(name)->create(reg); + } + } // namespace PluginFactory +} // namespace edm diff --git a/src/cudacompat/Framework/PluginFactory.h b/src/cudacompat/Framework/PluginFactory.h new file mode 100644 index 000000000..dce323fd6 --- /dev/null +++ b/src/cudacompat/Framework/PluginFactory.h @@ -0,0 +1,58 @@ +#ifndef PluginFactory_h +#define PluginFactory_h + +#include +#include +#include + +#include "Framework/Worker.h" + +class ProductRegistry; + +// Nothing here is thread safe +namespace edm { + namespace PluginFactory { + namespace impl { + class MakerBase { + public: + virtual ~MakerBase() = default; + + virtual std::unique_ptr create(ProductRegistry& reg) const = 0; + }; + + template + class Maker : public MakerBase { + public: + virtual std::unique_ptr create(ProductRegistry& reg) const override { + return std::make_unique>(reg); + }; + }; + + class Registry { + public: + void add(std::string const& name, std::unique_ptr maker); + MakerBase const* get(std::string const& name); + + private: + std::unordered_map> pluginRegistry_; + }; + + Registry& getGlobalRegistry(); + + template + class Registrar { + public: + Registrar(std::string const& name) { getGlobalRegistry().add(name, std::make_unique>()); } + }; + } // namespace impl + + std::unique_ptr create(std::string const& name, ProductRegistry& reg); + } // namespace PluginFactory +} // namespace edm + +#define EDM_PLUGIN_SYM(x, y) EDM_PLUGIN_SYM2(x, y) +#define EDM_PLUGIN_SYM2(x, y) x##y + +#define DEFINE_FWK_MODULE(type) static edm::PluginFactory::impl::Registrar EDM_PLUGIN_SYM(maker, __LINE__)(#type); + +#endif diff --git a/src/cudacompat/Framework/ProductRegistry.h b/src/cudacompat/Framework/ProductRegistry.h new file mode 100644 index 000000000..91f00de4e --- /dev/null +++ b/src/cudacompat/Framework/ProductRegistry.h @@ -0,0 +1,73 @@ +#ifndef ProductRegistry_h +#define 
ProductRegistry_h + +#include +#include +#include +#include +#include +#include + +#include "Framework/EDGetToken.h" +#include "Framework/EDPutToken.h" + +namespace edm { + class ProductRegistry { + public: + constexpr static int kSourceIndex = 0; + + ProductRegistry() = default; + + // public interface + template + EDPutTokenT produces() { + const std::type_index ti{typeid(T)}; + const unsigned int ind = typeToIndex_.size(); + auto succeeded = typeToIndex_.try_emplace(ti, currentModuleIndex_, ind); + if (not succeeded.second) { + throw std::runtime_error(std::string("Product of type ") + typeid(T).name() + " already exists"); + } + return EDPutTokenT{ind}; + } + + template + EDGetTokenT consumes() { + const auto found = typeToIndex_.find(std::type_index(typeid(T))); + if (found == typeToIndex_.end()) { + throw std::runtime_error(std::string("Product of type ") + typeid(T).name() + " is not produced"); + } + consumedModules_.insert(found->second.moduleIndex()); + return EDGetTokenT{found->second.productIndex()}; + } + + auto size() const { return typeToIndex_.size(); } + + // internal interface + void beginModuleConstruction(int i) { + currentModuleIndex_ = i; + consumedModules_.clear(); + } + + std::set const& consumedModules() { return consumedModules_; } + + private: + class Indices { + public: + explicit Indices(unsigned int mi, unsigned int pi) : moduleIndex_(mi), productIndex_(pi) {} + + unsigned int moduleIndex() const { return moduleIndex_; } + unsigned int productIndex() const { return productIndex_; } + + private: + unsigned int moduleIndex_; // index of producing module + unsigned int productIndex_; + }; + + unsigned int currentModuleIndex_ = kSourceIndex; + std::set consumedModules_; + + std::unordered_map typeToIndex_; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/ReusableObjectHolder.h b/src/cudacompat/Framework/ReusableObjectHolder.h new file mode 100644 index 000000000..4b8270e1b --- /dev/null +++ b/src/cudacompat/Framework/ReusableObjectHolder.h @@ -0,0 +1,167 @@ +#ifndef FWCore_Utilities_ReusableObjectHolder_h +#define FWCore_Utilities_ReusableObjectHolder_h + +// -*- C++ -*- +// +// Package: FWCore/Utilities +// Class : ReusableObjectHolder +// +/**\class edm::ReusableObjectHolder ReusableObjectHolder "ReusableObjectHolder.h" + + Description: Thread safe way to do create and reuse a group of the same object type. + + Usage: + This class can be used to safely reuse a series of objects created on demand. The reuse + of the objects is safe even across different threads since one can safely call all member + functions of this class on the same instance of this class from multiple threads. + + This class manages the cache of reusable objects and therefore an instance of this + class must live as long as you want the cache to live. + + The primary way of using the class it to call makeOrGetAndClear + An example use would be + \code + auto objectToUse = holder.makeOrGetAndClear( + []() { return new MyObject(10); }, //makes new one + [](MyObject* old) {old->reset(); } //resets old one + ); + \endcode + + If you always want to set the values you can use makeOrGet + \code + auto objectToUse = holder.makeOrGet( + []() { return new MyObject(); }); + objectToUse->setValue(3); + \endcode + + NOTE: If you hold onto the std::shared_ptr<> until another call to the ReusableObjectHolder, + make sure to release the shared_ptr before the call. That way the object you were just + using can go back into the cache and be reused for the call you are going to make. 
+ An example + \code + std::shared_ptr obj; + while(someCondition()) { + //release object so it can re-enter the cache + obj.release(); + obj = holder.makeOrGet([]{ return new MyObject();} ); + obj->setValue(someNewValue()); + useTheObject(obj); + } + \endcode + + The above example is very contrived, since the better way to do the above is + \code + while(someCondition()) { + auto obj = holder.makeOrGet([]{ return new MyObject();} ); + obj->setValue(someNewValue()); + useTheObject(obj); + //obj goes out of scope and returns the object to the cache + } + \endcode + + When a custom deleter is used, the deleter type must be the same to + all objects. The deleter is allowed to have state that depends on the + object. The deleter object is passed along the std::unique_ptr, and + is internally kept along the object. The deleter object must be copyable. + */ +// +// Original Author: Chris Jones +// Created: Fri, 31 July 2014 14:29:41 GMT +// + +#include +#include +#include +#include "tbb/task.h" +#include "tbb/concurrent_queue.h" + +namespace edm { + template > + class ReusableObjectHolder { + public: + using deleter_type = Deleter; + + ReusableObjectHolder() : m_outstandingObjects(0) {} + ReusableObjectHolder(ReusableObjectHolder&& iOther) + : m_availableQueue(std::move(iOther.m_availableQueue)), m_outstandingObjects(0) { + assert(0 == iOther.m_outstandingObjects); + } + ~ReusableObjectHolder() { + assert(0 == m_outstandingObjects); + std::unique_ptr item; + while (m_availableQueue.try_pop(item)) { + item.reset(); + } + } + + ///Adds the item to the cache. + /// Use this function if you know ahead of time + /// how many cached items you will need. + void add(std::unique_ptr iItem) { + if (nullptr != iItem) { + m_availableQueue.push(std::move(iItem)); + } + } + + ///Tries to get an already created object, + /// if none are available, returns an empty shared_ptr. + /// Use this function in conjunction with add() + std::shared_ptr tryToGet() { + std::unique_ptr item; + m_availableQueue.try_pop(item); + if (nullptr == item) { + return std::shared_ptr{}; + } + //instead of deleting, hand back to queue + auto pHolder = this; + auto deleter = item.get_deleter(); + ++m_outstandingObjects; + return std::shared_ptr{item.release(), [pHolder, deleter](T* iItem) { + pHolder->addBack(std::unique_ptr{iItem, deleter}); + }}; + } + + ///If there isn't an object already available, creates a new one using iFunc + template + std::shared_ptr makeOrGet(F iFunc) { + std::shared_ptr returnValue; + while (!(returnValue = tryToGet())) { + add(makeUnique(iFunc())); + } + return returnValue; + } + + ///If there is an object already available, passes the object to iClearFunc and then + /// returns the object. 
+ ///If there is not an object already available, creates a new one using iMakeFunc + template + std::shared_ptr makeOrGetAndClear(FM iMakeFunc, FC iClearFunc) { + std::shared_ptr returnValue; + while (!(returnValue = tryToGet())) { + add(makeUnique(iMakeFunc())); + } + iClearFunc(returnValue.get()); + return returnValue; + } + + private: + std::unique_ptr makeUnique(T* ptr) { + static_assert(std::is_same_v>, + "Generating functions returning raw pointers are supported only with std::default_delete"); + return std::unique_ptr{ptr}; + } + + std::unique_ptr makeUnique(std::unique_ptr ptr) { return ptr; } + + void addBack(std::unique_ptr iItem) { + m_availableQueue.push(std::move(iItem)); + --m_outstandingObjects; + } + + tbb::concurrent_queue> m_availableQueue; + std::atomic m_outstandingObjects; + }; + +} // namespace edm + +#endif /* end of include guard: FWCore_Utilities_ReusableObjectHolder_h */ diff --git a/src/cudacompat/Framework/RunningAverage.h b/src/cudacompat/Framework/RunningAverage.h new file mode 100644 index 000000000..c60d51fbb --- /dev/null +++ b/src/cudacompat/Framework/RunningAverage.h @@ -0,0 +1,53 @@ +#ifndef FWCore_Utilities_RunningAverage_H +#define FWCore_Utilities_RunningAverage_H +#include +#include +#include + +// Function for testing RunningAverage +namespace test_average { + namespace running_average { + int test(); + } +} // namespace test_average + +namespace edm { + // keeps the running average of the last N entries + // thread safe, fast: does not garantee precise update in case of collision + class RunningAverage { + // For tests + friend int ::test_average::running_average::test(); + + public: + static constexpr int N = 16; // better be a power of 2 + explicit RunningAverage(unsigned int k = 4) : m_mean(N * k), m_curr(0) { + for (auto& i : m_buffer) + i = k; + } + + int mean() const { return m_mean / N; } + + int upper() const { + auto lm = mean(); + return lm += (std::abs(m_buffer[0] - lm) + std::abs(m_buffer[N / 2] - lm)); + } // about 2 sigma + + void update(unsigned int q) { + int e = m_curr; + while (!m_curr.compare_exchange_weak(e, e + 1)) + ; + int k = (N - 1) & e; + int old = m_buffer[k]; + if (!m_buffer[k].compare_exchange_strong(old, q)) + return; + m_mean += (q - old); + } + + private: + std::array, N> m_buffer; + std::atomic m_mean; + std::atomic m_curr; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/WaitingTask.h b/src/cudacompat/Framework/WaitingTask.h new file mode 100644 index 000000000..905bcf5ac --- /dev/null +++ b/src/cudacompat/Framework/WaitingTask.h @@ -0,0 +1,93 @@ +#ifndef FWCore_Concurrency_WaitingTask_h +#define FWCore_Concurrency_WaitingTask_h +// -*- C++ -*- +// +// Package: Concurrency +// Class : WaitingTask +// +/**\class WaitingTask WaitingTask.h FWCore/Concurrency/interface/WaitingTask.h + + Description: Task used by WaitingTaskList. + + Usage: + Used as a callback to happen after a task has been completed. Includes the ability to hold an exception which has occurred while waiting. 
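   As an editorial illustration (not in the original header), such a callback is usually
   created through make_waiting_task(), defined further below:
   \code
   auto* waitTask = edm::make_waiting_task(tbb::task::allocate_root(),
                                           [](std::exception_ptr const* iPtr) {
                                             if (iPtr) {
                                               handleFailure(*iPtr);  // hypothetical
                                             } else {
                                               runFollowUp();  // hypothetical
                                             }
                                           });
   \endcode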
+*/ +// +// Original Author: Chris Jones +// Created: Thu Feb 21 13:46:31 CST 2013 +// $Id$ +// + +// system include files +#include +#include +#include +#include "tbb/task.h" + +// user include files + +// forward declarations + +namespace edm { + class WaitingTaskList; + class WaitingTaskHolder; + class WaitingTaskWithArenaHolder; + + class WaitingTask : public tbb::task { + public: + friend class WaitingTaskList; + friend class WaitingTaskHolder; + friend class WaitingTaskWithArenaHolder; + + ///Constructor + WaitingTask() : m_ptr{nullptr} {} + ~WaitingTask() override { delete m_ptr.load(); }; + + // ---------- const member functions --------------------------- + + ///Returns exception thrown by dependent task + /** If the value is non-null then the dependent task failed. + */ + std::exception_ptr const* exceptionPtr() const { return m_ptr.load(); } + + private: + ///Called if waited for task failed + /**Allows transfer of the exception caused by the dependent task to be + * moved to another thread. + * This method should only be called by WaitingTaskList + */ + void dependentTaskFailed(std::exception_ptr iPtr) { + if (iPtr and not m_ptr) { + auto temp = std::make_unique(iPtr); + std::exception_ptr* expected = nullptr; + if (m_ptr.compare_exchange_strong(expected, temp.get())) { + temp.release(); + } + } + } + + std::atomic m_ptr; + }; + + template + class FunctorWaitingTask : public WaitingTask { + public: + explicit FunctorWaitingTask(F f) : func_(std::move(f)) {} + + task* execute() override { + func_(exceptionPtr()); + return nullptr; + }; + + private: + F func_; + }; + + template + FunctorWaitingTask* make_waiting_task(ALLOC&& iAlloc, F f) { + return new (iAlloc) FunctorWaitingTask(std::move(f)); + } + +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/WaitingTaskHolder.h b/src/cudacompat/Framework/WaitingTaskHolder.h new file mode 100644 index 000000000..d74c5d8ed --- /dev/null +++ b/src/cudacompat/Framework/WaitingTaskHolder.h @@ -0,0 +1,90 @@ +#ifndef FWCore_Concurrency_WaitingTaskHolder_h +#define FWCore_Concurrency_WaitingTaskHolder_h +// -*- C++ -*- +// +// Package: FWCore/Concurrency +// Class : WaitingTaskHolder +// +/**\class WaitingTaskHolder WaitingTaskHolder.h "WaitingTaskHolder.h" + + Description: [one line class summary] + + Usage: + + +*/ +// +// Original Author: FWCore +// Created: Fri, 18 Nov 2016 20:30:42 GMT +// + +// system include files +#include + +// user include files +#include "Framework/WaitingTask.h" + +// forward declarations + +namespace edm { + class WaitingTaskHolder { + public: + WaitingTaskHolder() : m_task(nullptr) {} + + explicit WaitingTaskHolder(edm::WaitingTask* iTask) : m_task(iTask) { m_task->increment_ref_count(); } + ~WaitingTaskHolder() { + if (m_task) { + doneWaiting(std::exception_ptr{}); + } + } + + WaitingTaskHolder(const WaitingTaskHolder& iHolder) : m_task(iHolder.m_task) { m_task->increment_ref_count(); } + + WaitingTaskHolder(WaitingTaskHolder&& iOther) : m_task(iOther.m_task) { iOther.m_task = nullptr; } + + WaitingTaskHolder& operator=(const WaitingTaskHolder& iRHS) { + WaitingTaskHolder tmp(iRHS); + std::swap(m_task, tmp.m_task); + return *this; + } + + // ---------- const member functions --------------------- + bool taskHasFailed() const { return m_task->exceptionPtr() != nullptr; } + + // ---------- static member functions -------------------- + + // ---------- member functions --------------------------- + + /** Use in the case where you need to inform the parent task of a + failure before some other child 
task which may be run later reports + a different, but related failure. You must later call doneWaiting + in the same thread passing the same exceptoin. + */ + void presetTaskAsFailed(std::exception_ptr iExcept) { + if (iExcept) { + m_task->dependentTaskFailed(iExcept); + } + } + + void doneWaiting(std::exception_ptr iExcept) { + if (iExcept) { + m_task->dependentTaskFailed(iExcept); + } + //spawn can run the task before we finish + // doneWaiting and some other thread might + // try to reuse this object. Resetting + // before spawn avoids problems + auto task = m_task; + m_task = nullptr; + if (0 == task->decrement_ref_count()) { + tbb::task::spawn(*task); + } + } + + private: + // ---------- member data -------------------------------- + WaitingTask* m_task; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/WaitingTaskList.cc b/src/cudacompat/Framework/WaitingTaskList.cc new file mode 100644 index 000000000..c0740e68d --- /dev/null +++ b/src/cudacompat/Framework/WaitingTaskList.cc @@ -0,0 +1,172 @@ +// -*- C++ -*- +// +// Package: Concurrency +// Class : WaitingTaskList +// +// Implementation: +// [Notes on implementation] +// +// Original Author: Chris Jones +// Created: Thu Feb 21 13:46:45 CST 2013 +// $Id$ +// + +// system include files + +// user include files +#include "tbb/task.h" +#include + +#include "WaitingTaskList.h" +#include "hardware_pause.h" + +using namespace edm; +// +// constants, enums and typedefs +// + +// +// static data member definitions +// + +// +// constructors and destructor +// +WaitingTaskList::WaitingTaskList(unsigned int iInitialSize) + : m_head{nullptr}, + m_nodeCache{new WaitNode[iInitialSize]}, + m_nodeCacheSize{iInitialSize}, + m_lastAssignedCacheIndex{0}, + m_waiting{true} { + auto nodeCache = m_nodeCache.get(); + for (auto it = nodeCache, itEnd = nodeCache + m_nodeCacheSize; it != itEnd; ++it) { + it->m_fromCache = true; + } +} + +// +// member functions +// +void WaitingTaskList::reset() { + m_exceptionPtr = std::exception_ptr{}; + unsigned int nSeenTasks = m_lastAssignedCacheIndex; + m_lastAssignedCacheIndex = 0; + assert(m_head == nullptr); + if (nSeenTasks > m_nodeCacheSize) { + //need to expand so next time we don't have to do any + // memory requests + m_nodeCacheSize = nSeenTasks; + m_nodeCache.reset(new WaitNode[nSeenTasks]); + auto nodeCache = m_nodeCache.get(); + for (auto it = nodeCache, itEnd = nodeCache + m_nodeCacheSize; it != itEnd; ++it) { + it->m_fromCache = true; + } + } + //this will make sure all cores see the changes + m_waiting = true; +} + +WaitingTaskList::WaitNode* WaitingTaskList::createNode(WaitingTask* iTask) { + unsigned int index = m_lastAssignedCacheIndex++; + + WaitNode* returnValue; + if (index < m_nodeCacheSize) { + returnValue = m_nodeCache.get() + index; + } else { + returnValue = new WaitNode; + returnValue->m_fromCache = false; + } + returnValue->m_task = iTask; + //No other thread can see m_next yet. 
The caller to create node + // will be doing a synchronization operation anyway which will + // make sure m_task and m_next are synched across threads + returnValue->m_next.store(returnValue, std::memory_order_relaxed); + + return returnValue; +} + +void WaitingTaskList::add(WaitingTask* iTask) { + iTask->increment_ref_count(); + if (!m_waiting) { + if (bool(m_exceptionPtr)) { + iTask->dependentTaskFailed(m_exceptionPtr); + } + if (0 == iTask->decrement_ref_count()) { + tbb::task::spawn(*iTask); + } + } else { + WaitNode* newHead = createNode(iTask); + //This exchange is sequentially consistent thereby + // ensuring ordering between it and setNextNode + WaitNode* oldHead = m_head.exchange(newHead); + newHead->setNextNode(oldHead); + + //For the case where oldHead != nullptr, + // even if 'm_waiting' changed, we don't + // have to recheck since we beat 'announce()' in + // the ordering of 'm_head.exchange' call so iTask + // is guaranteed to be in the link list + + if (nullptr == oldHead) { + if (!m_waiting) { + //if finished waiting right before we did the + // exchange our task will not be spawned. Also, + // additional threads may be calling add() and swapping + // heads and linking us to the new head. + // It is safe to call announce from multiple threads + announce(); + } + } + } +} + +void WaitingTaskList::presetTaskAsFailed(std::exception_ptr iExcept) { + if (iExcept and m_waiting) { + WaitNode* node = m_head.load(); + while (node) { + WaitNode* next; + while (node == (next = node->nextNode())) { + hardware_pause(); + } + node->m_task->dependentTaskFailed(iExcept); + node = next; + } + } +} + +void WaitingTaskList::announce() { + //Need a temporary storage since one of these tasks could + // cause the next event to start processing which would refill + // this waiting list after it has been reset + WaitNode* n = m_head.exchange(nullptr); + WaitNode* next; + while (n) { + //it is possible that 'WaitingTaskList::add' is running in a different + // thread and we have a new 'head' but the old head has not yet been + // attached to the new head (we identify this since 'nextNode' will return itself). + // In that case we have to wait until the link has been established before going on. + while (n == (next = n->nextNode())) { + hardware_pause(); + } + auto t = n->m_task; + if (bool(m_exceptionPtr)) { + t->dependentTaskFailed(m_exceptionPtr); + } + if (!n->m_fromCache) { + delete n; + } + n = next; + + //the task may indirectly call WaitingTaskList::reset + // so we need to call spawn after we are done using the node. + if (0 == t->decrement_ref_count()) { + tbb::task::spawn(*t); + } + } +} + +void WaitingTaskList::doneWaiting(std::exception_ptr iPtr) { + m_exceptionPtr = iPtr; + m_waiting = false; + announce(); +} diff --git a/src/cudacompat/Framework/WaitingTaskList.h b/src/cudacompat/Framework/WaitingTaskList.h new file mode 100644 index 000000000..4e2c5e008 --- /dev/null +++ b/src/cudacompat/Framework/WaitingTaskList.h @@ -0,0 +1,175 @@ +#ifndef FWCore_Concurrency_WaitingTaskList_h +#define FWCore_Concurrency_WaitingTaskList_h +// -*- C++ -*- +// +// Package: Concurrency +// Class : WaitingTaskList +// +/**\class WaitingTaskList WaitingTaskList.h FWCore/Concurrency/interface/WaitingTaskList.h + + Description: Handles starting tasks once some resource becomes available. + + Usage: + This class can be used to have tasks wait to be spawned until a resource is available. + Tasks that want to use the resource are added to the list by calling add(tbb::task*). 
+ When the resource becomes available one calls doneWaiting() and then any waiting tasks will + be spawned. If a call to add() is made after doneWaiting() the newly added task will + immediately be spawned. + The class can be reused by calling reset(). However, reset() is not thread-safe so one + must be certain neither add(...) nor doneWaiting() is called while reset() is running. + + An example usage would be if you had a task doing a long calculation (the resource) and + then several other tasks have been created in a different thread and before running those + new tasks you need the result of the long calculation. + \code + class CalcTask : public edm::WaitingTask { + public: + CalcTask(edm::WaitingTaskList* iWL, Value* v): + m_waitList(iWL), m_output(v) {} + + tbb::task* execute() { + std::exception_ptr ptr; + try { + *m_output = doCalculation(); + } catch(...) { + ptr = std::current_exception(); + } + m_waitList.doneWaiting(ptr); + return nullptr; + } + private: + edm::WaitingTaskList* m_waitList; + Value* m_output; + }; + \endcode + + In one part of the code we can setup the shared resource + \code + WaitingTaskList waitList; + Value v; + \endcode + + In another part we can start the calculation + \code + tbb::task* calc = new(tbb::task::allocate_root()) CalcTask(&waitList,&v); + tbb::task::spawn(calc); + \endcode + + Finally in some unrelated part of the code we can create tasks that need the calculation + \code + tbb::task* t1 = makeTask1(v); + waitList.add(t1); + tbb::task* t2 = makeTask2(v); + waitList.add(t2); + \endcode + +*/ +// +// Original Author: Chris Jones +// Created: Thu Feb 21 13:46:31 CST 2013 +// $Id$ +// + +// system include files +#include + +// user include files +#include "Framework/WaitingTask.h" + +// forward declarations + +namespace edm { + class EmptyWaitingTask : public WaitingTask { + public: + EmptyWaitingTask() = default; + + tbb::task* execute() override { return nullptr; } + }; + + namespace waitingtask { + struct TaskDestroyer { + void operator()(tbb::task* iTask) const { tbb::task::destroy(*iTask); } + }; + } // namespace waitingtask + ///Create an EmptyWaitingTask which will properly be destroyed + inline std::unique_ptr make_empty_waiting_task() { + return std::unique_ptr(new (tbb::task::allocate_root()) + edm::EmptyWaitingTask{}); + } + + class WaitingTaskList { + public: + ///Constructor + /**The WaitingTaskList is initial set to waiting. + * \param[in] iInitialSize specifies the initial size of the cache used to hold waiting tasks. + * The value is only useful for optimization as the object can resize itself. + */ + explicit WaitingTaskList(unsigned int iInitialSize = 2); + ~WaitingTaskList() = default; + + // ---------- member functions --------------------------- + + /** Use in the case where you need to inform the parent task of a + failure before some other child task which may be run later reports + a different, but related failure. You must later call doneWaiting + with same exception later in the same thread. + */ + void presetTaskAsFailed(std::exception_ptr iExcept); + + ///Adds task to the waiting list + /**If doneWaiting() has already been called then the added task will immediately be spawned. + * If that is not the case then the task will be held until doneWaiting() is called and will + * then be spawned. + * Calls to add() and doneWaiting() can safely be done concurrently. 
+ */ + void add(WaitingTask*); + + ///Signals that the resource is now available and tasks should be spawned + /**The owner of the resource calls this function to allow the waiting tasks to + * start accessing it. + * If the task fails, a non 'null' std::exception_ptr should be used. + * To have tasks wait again one must call reset(). + * Calls to add() and doneWaiting() can safely be done concurrently. + */ + void doneWaiting(std::exception_ptr iPtr); + + ///Resets access to the resource so that added tasks will wait. + /**The owner of the resouce calls reset() to make tasks wait. + * Calling reset() is NOT thread safe. The system must guarantee that no tasks are + * using the resource when reset() is called and neither add() nor doneWaiting() can + * be called concurrently with reset(). + */ + void reset(); + + private: + WaitingTaskList(const WaitingTaskList&) = delete; // stop default + const WaitingTaskList& operator=(const WaitingTaskList&) = delete; // stop default + + /**Handles spawning the tasks, + * safe to call from multiple threads + */ + void announce(); + + struct WaitNode { + WaitingTask* m_task; + std::atomic m_next; + bool m_fromCache; + + void setNextNode(WaitNode* iNext) { m_next = iNext; } + + WaitNode* nextNode() const { return m_next; } + }; + + WaitNode* createNode(WaitingTask* iTask); + + // ---------- member data -------------------------------- + std::atomic m_head; + std::unique_ptr m_nodeCache; + std::exception_ptr m_exceptionPtr; + unsigned int m_nodeCacheSize; + std::atomic m_lastAssignedCacheIndex; + std::atomic m_waiting; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/Framework/WaitingTaskWithArenaHolder.cc b/src/cudacompat/Framework/WaitingTaskWithArenaHolder.cc new file mode 100644 index 000000000..7c852d041 --- /dev/null +++ b/src/cudacompat/Framework/WaitingTaskWithArenaHolder.cc @@ -0,0 +1,96 @@ +// -*- C++ -*- +// +// Package: Concurrency +// Class : WaitingTaskWithArenaHolder +// +// Original Author: W. David Dagenhart +// Created: 6 December 2017 + +#include "WaitingTaskWithArenaHolder.h" +#include "WaitingTask.h" +#include "WaitingTaskHolder.h" + +namespace edm { + + WaitingTaskWithArenaHolder::WaitingTaskWithArenaHolder() : m_task(nullptr) {} + + // Note that the arena will be the one containing the thread + // that runs this constructor. This is the arena where you + // eventually intend for the task to be spawned. 
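An editorial aside, not part of the patch: a sketch of the ExternalWork-style flow this class enables, using a hypothetical MyAsyncProducer module and a hypothetical startAsyncWork() helper that runs a callback on some external (possibly non-TBB) thread.

#include <exception>
#include <functional>
#include <utility>
#include "Framework/EDProducer.h"  // declares edm::EDProducerExternalWork in this patch

// Hypothetical helper: runs 'work' later on an external thread.
void startAsyncWork(std::function<void()> work);

class MyAsyncProducer : public edm::EDProducerExternalWork {
public:
  void acquire(edm::Event const& event, edm::EventSetup const& es,
               edm::WaitingTaskWithArenaHolder holder) override {
    // Moving the holder into the callback keeps the waiting task alive; calling
    // doneWaiting() later, even from a non-TBB thread, re-enters the arena that
    // was captured when the holder was constructed and lets produce() run.
    startAsyncWork([holder = std::move(holder)]() mutable {
      holder.doneWaiting(std::exception_ptr{});  // pass a real exception_ptr on failure
    });
  }

  void produce(edm::Event& event, edm::EventSetup const& es) override {
    // runs after doneWaiting(), back inside the TBB arena
  }
};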
+ WaitingTaskWithArenaHolder::WaitingTaskWithArenaHolder(WaitingTask* iTask) + : m_task(iTask), m_arena(std::make_shared(tbb::task_arena::attach())) { + m_task->increment_ref_count(); + } + + WaitingTaskWithArenaHolder::~WaitingTaskWithArenaHolder() { + if (m_task) { + doneWaiting(std::exception_ptr{}); + } + } + + WaitingTaskWithArenaHolder::WaitingTaskWithArenaHolder(WaitingTaskWithArenaHolder const& iHolder) + : m_task(iHolder.m_task), m_arena(iHolder.m_arena) { + if (m_task != nullptr) { + m_task->increment_ref_count(); + } + } + + WaitingTaskWithArenaHolder::WaitingTaskWithArenaHolder(WaitingTaskWithArenaHolder&& iOther) + : m_task(iOther.m_task), m_arena(std::move(iOther.m_arena)) { + iOther.m_task = nullptr; + } + + WaitingTaskWithArenaHolder& WaitingTaskWithArenaHolder::operator=(const WaitingTaskWithArenaHolder& iRHS) { + WaitingTaskWithArenaHolder tmp(iRHS); + std::swap(m_task, tmp.m_task); + std::swap(m_arena, tmp.m_arena); + return *this; + } + + WaitingTaskWithArenaHolder& WaitingTaskWithArenaHolder::operator=(WaitingTaskWithArenaHolder&& iRHS) { + WaitingTaskWithArenaHolder tmp(std::move(iRHS)); + std::swap(m_task, tmp.m_task); + std::swap(m_arena, tmp.m_arena); + return *this; + } + + // This spawns the task. The arena is needed to get the task spawned + // into the correct arena of threads. Use of the arena allows doneWaiting + // to be called from a thread outside the arena of threads that will manage + // the task. doneWaiting can be called from a non-TBB thread. + void WaitingTaskWithArenaHolder::doneWaiting(std::exception_ptr iExcept) { + if (iExcept) { + m_task->dependentTaskFailed(iExcept); + } + //enqueue can run the task before we finish + // doneWaiting and some other thread might + // try to reuse this object. Resetting + // before enqueue avoids problems + auto task = m_task; + m_task = nullptr; + if (0 == task->decrement_ref_count()) { + // The enqueue call will cause a worker thread to be created in + // the arena if there is not one already. + m_arena->enqueue([task = task]() { tbb::task::spawn(*task); }); + } + } + + // This next function is useful if you know from the context that + // m_arena (which is set when the constructor was executes) is the + // same arena in which you want to execute the doneWaiting function. + // It allows an optimization which avoids the enqueue step in the + // doneWaiting function. + // + // Be warned though that in general this function cannot be used. + // Spawning a task outside the correct arena could create a new separate + // arena with its own extra TBB worker threads if this function is used + // in an inappropriate context (and silently such that you might not notice + // the problem quickly). + + WaitingTaskHolder WaitingTaskWithArenaHolder::makeWaitingTaskHolderAndRelease() { + WaitingTaskHolder holder(m_task); + m_task->decrement_ref_count(); + m_task = nullptr; + return holder; + } +} // namespace edm diff --git a/src/cudacompat/Framework/WaitingTaskWithArenaHolder.h b/src/cudacompat/Framework/WaitingTaskWithArenaHolder.h new file mode 100644 index 000000000..4b14febfb --- /dev/null +++ b/src/cudacompat/Framework/WaitingTaskWithArenaHolder.h @@ -0,0 +1,100 @@ +#ifndef FWCore_Concurrency_WaitingTaskWithArenaHolder_h +#define FWCore_Concurrency_WaitingTaskWithArenaHolder_h +// -*- C++ -*- +// +// Package: FWCore/Concurrency +// Class : WaitingTaskWithArenaHolder +// +/**\class edm::WaitingTaskWithArenaHolder + + Description: This holds a WaitingTask and can be passed to something + the WaitingTask is waiting for. 
That allows that something to call + doneWaiting to let the WaitingTask know it can run. The use of the + arena allows one to call doneWaiting from a thread external to the + arena where the task should run. The external thread might be a non-TBB + thread. +*/ +// +// Original Author: W. David Dagenhart +// Created: 9 November 2017 +// + +#include +#include + +#include "tbb/task_arena.h" + +namespace edm { + + class WaitingTask; + class WaitingTaskHolder; + + class WaitingTaskWithArenaHolder { + public: + WaitingTaskWithArenaHolder(); + + // Note that the arena will be the one containing the thread + // that runs this constructor. This is the arena where you + // eventually intend for the task to be spawned. + explicit WaitingTaskWithArenaHolder(WaitingTask* iTask); + + ~WaitingTaskWithArenaHolder(); + + WaitingTaskWithArenaHolder(WaitingTaskWithArenaHolder const& iHolder); + + WaitingTaskWithArenaHolder(WaitingTaskWithArenaHolder&& iOther); + + WaitingTaskWithArenaHolder& operator=(const WaitingTaskWithArenaHolder& iRHS); + + WaitingTaskWithArenaHolder& operator=(WaitingTaskWithArenaHolder&& iRHS); + + // This spawns the task. The arena is needed to get the task spawned + // into the correct arena of threads. Use of the arena allows doneWaiting + // to be called from a thread outside the arena of threads that will manage + // the task. doneWaiting can be called from a non-TBB thread. + void doneWaiting(std::exception_ptr iExcept); + + // This next function is useful if you know from the context that + // m_arena (which is set when the constructor was executes) is the + // same arena in which you want to execute the doneWaiting function. + // It allows an optimization which avoids the enqueue step in the + // doneWaiting function. + // + // Be warned though that in general this function cannot be used. + // Spawning a task outside the correct arena could create a new separate + // arena with its own extra TBB worker threads if this function is used + // in an inappropriate context (and silently such that you might not notice + // the problem quickly). + WaitingTaskHolder makeWaitingTaskHolderAndRelease(); + + private: + // ---------- member data -------------------------------- + WaitingTask* m_task; + std::shared_ptr m_arena; + }; + + template + auto make_lambda_with_holder(WaitingTaskWithArenaHolder h, F&& f) { + return [holder = std::move(h), func = std::forward(f)]() mutable { + try { + func(holder); + } catch (...) 
{ + holder.doneWaiting(std::current_exception()); + } + }; + } + + template + auto make_waiting_task_with_holder(ALLOC&& iAlloc, WaitingTaskWithArenaHolder h, F&& f) { + return make_waiting_task( + std::forward(iAlloc), + [holder = h, func = make_lambda_with_holder(h, std::forward(f))](std::exception_ptr const* excptr) mutable { + if (excptr) { + holder.doneWaiting(*excptr); + return; + } + func(); + }); + } +} // namespace edm +#endif diff --git a/src/cudacompat/Framework/Worker.cc b/src/cudacompat/Framework/Worker.cc new file mode 100644 index 000000000..b3b3df74c --- /dev/null +++ b/src/cudacompat/Framework/Worker.cc @@ -0,0 +1,25 @@ +#include "Framework/Worker.h" + +namespace edm { + void Worker::prefetchAsync(Event& event, EventSetup const& eventSetup, WaitingTask* iTask) { + //std::cout << "prefetchAsync for " << this << " iTask " << iTask << std::endl; + bool expected = false; + if (prefetchRequested_.compare_exchange_strong(expected, true)) { + //std::cout << "first prefetch call" << std::endl; + //Need to be sure the ref count isn't set to 0 immediately + iTask->increment_ref_count(); + for (Worker* dep : itemsToGet_) { + //std::cout << "calling doWorkAsync for " << dep << " with " << iTask << std::endl; + dep->doWorkAsync(event, eventSetup, iTask); + } + + auto count = iTask->decrement_ref_count(); + //std::cout << "count " << count << std::endl; + if (0 == count) { + //std::cout << "spawning iTask for " << this << " task " << iTask << std::endl; + //if everything finishes before we leave this routine, we need to launch the task + tbb::task::spawn(*iTask); + } + } + } +} // namespace edm diff --git a/src/cudacompat/Framework/Worker.h b/src/cudacompat/Framework/Worker.h new file mode 100644 index 000000000..0a03670a4 --- /dev/null +++ b/src/cudacompat/Framework/Worker.h @@ -0,0 +1,112 @@ +#ifndef Worker_h +#define Worker_h + +#include +#include +//#include + +#include "Framework/WaitingTask.h" +#include "Framework/WaitingTaskHolder.h" +#include "Framework/WaitingTaskList.h" +#include "Framework/WaitingTaskWithArenaHolder.h" + +namespace edm { + class Event; + class EventSetup; + class ProductRegistry; + + class Worker { + public: + virtual ~Worker() = default; + + // not thread safe + void setItemsToGet(std::vector workers) { itemsToGet_ = std::move(workers); } + + // thread safe + void prefetchAsync(Event& event, EventSetup const& eventSetup, WaitingTask* iTask); + + // not thread safe + virtual void doWorkAsync(Event& event, EventSetup const& eventSetup, WaitingTask* iTask) = 0; + + // not thread safe + virtual void doEndJob() = 0; + + // not thread safe + void reset() { + prefetchRequested_ = false; + doReset(); + } + + protected: + virtual void doReset() = 0; + + private: + std::vector itemsToGet_; + std::atomic prefetchRequested_ = false; + }; + + template + class WorkerT : public Worker { + public: + explicit WorkerT(ProductRegistry& reg) : producer_(reg) {} + + void doWorkAsync(Event& event, EventSetup const& eventSetup, WaitingTask* iTask) override { + waitingTasksWork_.add(iTask); + //std::cout << "doWorkAsync for " << this << " with iTask " << iTask << std::endl; + bool expected = false; + if (workStarted_.compare_exchange_strong(expected, true)) { + //std::cout << "first doWorkAsync call" << std::endl; + + WaitingTask* moduleTask = make_waiting_task( + tbb::task::allocate_root(), [this, &event, &eventSetup](std::exception_ptr const* iPtr) mutable { + if (iPtr) { + waitingTasksWork_.doneWaiting(*iPtr); + } else { + std::exception_ptr exceptionPtr; + try { + //std::cout 
<< "calling doProduce " << this << std::endl; + producer_.doProduce(event, eventSetup); + } catch (...) { + exceptionPtr = std::current_exception(); + } + //std::cout << "waitingTasksWork_.doneWaiting " << this << std::endl; + waitingTasksWork_.doneWaiting(exceptionPtr); + } + }); + if (producer_.hasAcquire()) { + WaitingTaskWithArenaHolder runProduceHolder{moduleTask}; + moduleTask = make_waiting_task(tbb::task::allocate_root(), + [this, &event, &eventSetup, runProduceHolder = std::move(runProduceHolder)]( + std::exception_ptr const* iPtr) mutable { + if (iPtr) { + runProduceHolder.doneWaiting(*iPtr); + } else { + std::exception_ptr exceptionPtr; + try { + producer_.doAcquire(event, eventSetup, runProduceHolder); + } catch (...) { + exceptionPtr = std::current_exception(); + } + runProduceHolder.doneWaiting(exceptionPtr); + } + }); + } + //std::cout << "calling prefetchAsync " << this << " with moduleTask " << moduleTask << std::endl; + prefetchAsync(event, eventSetup, moduleTask); + } + } + + void doEndJob() override { producer_.doEndJob(); } + + private: + void doReset() override { + waitingTasksWork_.reset(); + workStarted_ = false; + } + + T producer_; + WaitingTaskList waitingTasksWork_; + std::atomic workStarted_ = false; + }; +} // namespace edm +#endif diff --git a/src/cudacompat/Framework/hardware_pause.h b/src/cudacompat/Framework/hardware_pause.h new file mode 100644 index 000000000..eb15d0516 --- /dev/null +++ b/src/cudacompat/Framework/hardware_pause.h @@ -0,0 +1,33 @@ +#ifndef FWCore_Concurrency_hardware_pause_h +#define FWCore_Concurrency_hardware_pause_h +// -*- C++ -*- +// +// Package: Concurrency +// Class : hardware_pause +// +/**\class hardware_pause hardware_pause.h FWCore/Concurrency/interface/hardware_pause.h + + Description: assembler instruction to allow a short pause + + Usage: + This hardware instruction tells the CPU to pause momentarily. This can be useful + in the case where one is doing a 'spin lock' on a quantity that you expect to change + within a few clock cycles. 
+ +*/ +// +// Original Author: Chris Jones +// Created: Thu Feb 21 13:55:57 CST 2013 +// $Id$ +// + +//NOTE: Taken from libdispatch shims/atomics.h +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) +#define hardware_pause() asm("") +#endif +#if defined(__x86_64__) || defined(__i386__) +#undef hardware_pause +#define hardware_pause() asm("pause") +#endif + +#endif diff --git a/src/cudacompat/Geometry/phase1PixelTopology.h b/src/cudacompat/Geometry/phase1PixelTopology.h new file mode 100644 index 000000000..409ebec3c --- /dev/null +++ b/src/cudacompat/Geometry/phase1PixelTopology.h @@ -0,0 +1,174 @@ +#ifndef Geometry_TrackerGeometryBuilder_phase1PixelTopology_h +#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h + +#include +#include + +namespace phase1PixelTopology { + + constexpr uint16_t numRowsInRoc = 80; + constexpr uint16_t numColsInRoc = 52; + constexpr uint16_t lastRowInRoc = numRowsInRoc - 1; + constexpr uint16_t lastColInRoc = numColsInRoc - 1; + + constexpr uint16_t numRowsInModule = 2 * numRowsInRoc; + constexpr uint16_t numColsInModule = 8 * numColsInRoc; + constexpr uint16_t lastRowInModule = numRowsInModule - 1; + constexpr uint16_t lastColInModule = numColsInModule - 1; + + constexpr int16_t xOffset = -81; + constexpr int16_t yOffset = -54 * 4; + + constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule); + + constexpr uint32_t numberOfModules = 1856; + constexpr uint32_t numberOfLayers = 10; + constexpr uint32_t layerStart[numberOfLayers + 1] = {0, + 96, + 320, + 672, // barrel + 1184, + 1296, + 1408, // positive endcap + 1520, + 1632, + 1744, // negative endcap + numberOfModules}; + constexpr char const* layerName[numberOfLayers] = { + "BL1", + "BL2", + "BL3", + "BL4", // barrel + "E+1", + "E+2", + "E+3", // positive endcap + "E-1", + "E-2", + "E-3" // negative endcap + }; + + constexpr uint32_t numberOfModulesInBarrel = 1184; + constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / 8; + + template + constexpr auto map_to_array_helper(Function f, std::index_sequence) + -> std::array::type, sizeof...(Indices)> { + return {{f(Indices)...}}; + } + + template + constexpr auto map_to_array(Function f) -> std::array::type, N> { + return map_to_array_helper(f, std::make_index_sequence{}); + } + + constexpr uint32_t findMaxModuleStride() { + bool go = true; + int n = 2; + while (go) { + for (uint8_t i = 1; i < 11; ++i) { + if (layerStart[i] % n != 0) { + go = false; + break; + } + } + if (!go) + break; + n *= 2; + } + return n / 2; + } + + constexpr uint32_t maxModuleStride = findMaxModuleStride(); + + constexpr uint8_t findLayer(uint32_t detId) { + for (uint8_t i = 0; i < 11; ++i) + if (detId < layerStart[i + 1]) + return i; + return 11; + } + + constexpr uint8_t findLayerFromCompact(uint32_t detId) { + detId *= maxModuleStride; + for (uint8_t i = 0; i < 11; ++i) + if (detId < layerStart[i + 1]) + return i; + return 11; + } + + constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; + constexpr std::array layer = map_to_array(findLayerFromCompact); + + constexpr bool validateLayerIndex() { + bool res = true; + for (auto i = 0U; i < numberOfModules; ++i) { + auto j = i / maxModuleStride; + res &= (layer[j] < 10); + res &= (i >= layerStart[layer[j]]); + res &= (i < layerStart[layer[j] + 1]); + } + return res; + } + + static_assert(validateLayerIndex(), "layer from detIndex algo is buggy"); + + // this is for the ROC n<512 (upgrade 1024) + constexpr inline uint16_t divu52(uint16_t n) { + n = n >> 2; 
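+    // 52 = 4 * 13: the shift above divides by 4; the shift-and-add sequence
+    // below approximates the remaining division by 13, and the last line
+    // corrects the quotient with the remainder r (valid for the column range
+    // noted above).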
+ uint16_t q = (n >> 1) + (n >> 4); + q = q + (q >> 4) + (q >> 5); + q = q >> 3; + uint16_t r = n - q * 13; + return q + ((r + 3) >> 4); + } + + constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); } + + constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); } + + constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; } + + constexpr inline uint16_t toRocY(uint16_t py) { + auto roc = divu52(py); + return py - 52 * roc; + } + + constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); } + + constexpr inline bool isBigPixY(uint16_t py) { + auto ly = toRocY(py); + return (ly == 0) | (ly == lastColInRoc); + } + + constexpr inline uint16_t localX(uint16_t px) { + auto shift = 0; + if (px > lastRowInRoc) + shift += 1; + if (px > numRowsInRoc) + shift += 1; + return px + shift; + } + + constexpr inline uint16_t localY(uint16_t py) { + auto roc = divu52(py); + auto shift = 2 * roc; + auto yInRoc = py - 52 * roc; + if (yInRoc > 0) + shift += 1; + return py + shift; + } + + //FIXME move it elsewhere? + struct AverageGeometry { + static constexpr auto numberOfLaddersInBarrel = phase1PixelTopology::numberOfLaddersInBarrel; + float ladderZ[numberOfLaddersInBarrel]; + float ladderX[numberOfLaddersInBarrel]; + float ladderY[numberOfLaddersInBarrel]; + float ladderR[numberOfLaddersInBarrel]; + float ladderMinZ[numberOfLaddersInBarrel]; + float ladderMaxZ[numberOfLaddersInBarrel]; + float endCapZ[2]; // just for pos and neg Layer1 + }; + +} // namespace phase1PixelTopology + +#endif // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h diff --git a/src/cudacompat/Makefile b/src/cudacompat/Makefile new file mode 100644 index 000000000..878193f42 --- /dev/null +++ b/src/cudacompat/Makefile @@ -0,0 +1,147 @@ +TARGET_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) +TARGET_NAME := $(notdir $(TARGET_DIR)) +TARGET := $(BASE_DIR)/$(TARGET_NAME) +include Makefile.deps +EXTERNAL_DEPENDS := $(cudacompat_EXTERNAL_DEPENDS) + +$(TARGET): +test_cpu: +test_nvidiagpu: $(TARGET) + @echo + @echo "Testing $(TARGET)" + $(TARGET) --maxEvents 2 + @echo "Succeeded" +test_intelagpu: +test_auto: +.PHONY: test_cpu test_nvidiagpu test_intelgpu test_auto + +EXE_SRC := $(wildcard $(TARGET_DIR)/bin/*.cc) +EXE_OBJ := $(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$(EXE_SRC:%=%.o)) +EXE_DEP := $(EXE_OBJ:$.o=$.d) + +LIBNAMES := $(filter-out plugin-% bin test Makefile% plugins.txt%,$(wildcard *)) +PLUGINNAMES := $(patsubst plugin-%,%,$(filter plugin-%,$(wildcard *))) +MY_CXXFLAGS := -I$(TARGET_DIR) -DSRC_DIR=$(TARGET_DIR) -DLIB_DIR=$(LIB_DIR)/$(TARGET_NAME) +MY_LDFLAGS := -ldl -Wl,-rpath,$(LIB_DIR)/$(TARGET_NAME) +LIB_LDFLAGS := -L$(LIB_DIR)/$(TARGET_NAME) + +ALL_DEPENDS := $(EXE_DEP) +# Files for libraries +LIBS := +define LIB_template +$(1)_SRC := $$(wildcard $(TARGET_DIR)/$(1)/*.cc) +$(1)_OBJ := $$(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$$($(1)_SRC:%=%.o)) +$(1)_DEP := $$($(1)_OBJ:$.o=$.d) +ALL_DEPENDS += $$($(1)_DEP) +$(1)_LIB := $(LIB_DIR)/$(TARGET_NAME)/lib$(1).so +LIBS += $$($(1)_LIB) +$(1)_LDFLAGS := -l$(1) +endef +$(foreach lib,$(LIBNAMES),$(eval $(call LIB_template,$(lib)))) + +# Files for plugins +PLUGINS := +define PLUGIN_template +$(1)_SRC := $$(wildcard $(TARGET_DIR)/plugin-$(1)/*.cc) +$(1)_CUSRC := $$(wildcard $(TARGET_DIR)/plugin-$(1)/*.cu) +$(1)_OBJ := $$(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$$($(1)_SRC:%=%.o)) +$(1)_CUOBJ := $$(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$$($(1)_CUSRC:%=%.o)) 
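+# Per-plugin dependency files, plugin shared library and, only when .cu sources
+# are present, the CUDA device-link object.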
+$(1)_DEP := $$($(1)_OBJ:$.o=$.d) +ALL_DEPENDS += $$($(1)_DEP) +$(1)_LIB := $(LIB_DIR)/$(TARGET_NAME)/plugin$(1).so +PLUGINS += $$($(1)_LIB) +$(1)_CUDADLINK := $$(if $$(strip $$($(1)_CUOBJ)),$(OBJ_DIR)/$(TARGET_NAME)/plugin-$(1)/plugin$(1)_cudadlink.o,) +endef +$(foreach lib,$(PLUGINNAMES),$(eval $(call PLUGIN_template,$(lib)))) + +# Files for unit tests +TESTS_SRC := $(wildcard $(TARGET_DIR)/test/*.cc) +TESTS_OBJ := $(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$(TESTS_SRC:%=%.o)) +TESTS_DEP := $(TESTS_OBJ:$.o=$.d) +TESTS_CUSRC := $(wildcard $(TARGET_DIR)/test/*.cu) +TESTS_CUOBJ := $(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$(TESTS_CUSRC:%=%.o)) +TESTS_CUDADLINK := $(TESTS_CUOBJ:$cu.o=$cudadlink.o) +TESTS_CUDEP := $(TESTS_CUOBJ:$.o=$.d) +TESTS_EXE_CPU := $(patsubst $(SRC_DIR)/$(TARGET_NAME)/test/%.cc,$(TEST_DIR)/$(TARGET_NAME)/%,$(TESTS_SRC)) +TESTS_EXE_CUDA := $(patsubst $(SRC_DIR)/$(TARGET_NAME)/test/%.cu,$(TEST_DIR)/$(TARGET_NAME)/%,$(TESTS_CUSRC)) +TESTS_EXE := $(TESTS_EXE_CPU) $(TESTS_EXE_CUDA) +ALL_DEPENDS += $(TESTS_DEP) $(TESTS_CUDEP) +# Needed to keep the unit test object files after building $(TARGET) +.SECONDARY: $(TESTS_OBJ) $(TESTS_CUOBJ) $(TESTS_CUDADLINK) + +define RUNTEST_template +run_$(1): $(1) + @echo + @echo "Running test $(1)" + @$(1) + @echo "Succeeded" +test_$(2): run_$(1) +endef +$(foreach test,$(TESTS_EXE_CPU),$(eval $(call RUNTEST_template,$(test),cpu))) +TEST_EXE_CUDA_RUN := $(filter-out $(TEST_DIR)/$(TARGET_NAME)/testEigenGPUNoFit,$(TESTS_EXE_CUDA)) +$(foreach test,$(TEST_EXE_CUDA_RUN),$(eval $(call RUNTEST_template,$(test),nvidiagpu))) + +-include $(ALL_DEPENDS) + +# Build targets +$(TARGET): $(EXE_OBJ) $(LIBS) $(PLUGINS) | $(TESTS_EXE) + $(CXX) $(EXE_OBJ) $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + +define BUILD_template +$(OBJ_DIR)/$(2)/%.cc.o: $(SRC_DIR)/$(2)/%.cc + @[ -d $$(@D) ] || mkdir -p $$(@D) + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD + @cp $(OBJ_DIR)/$(2)/$$*.cc.d $(OBJ_DIR)/$(2)/$$*.cc.d.tmp; \ + sed 's#\($(2)/$$*\)\.o[ :]*#\1.o \1.d : #g' < $(OBJ_DIR)/$(2)/$$*.cc.d.tmp > $(OBJ_DIR)/$(2)/$$*.cc.d; \ + sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$$$//' \ + -e '/^$$$$/ d' -e 's/$$$$/ :/' -e 's/ *//' < $(OBJ_DIR)/$(2)/$$*.cc.d.tmp >> $(OBJ_DIR)/$(2)/$$*.cc.d; \ + rm $(OBJ_DIR)/$(2)/$$*.cc.d.tmp + +$(OBJ_DIR)/$(2)/%.cu.o: $(SRC_DIR)/$(2)/%.cu + @[ -d $$(@D) ] || mkdir -p $$(@D) + $(CUDA_NVCC) $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(MY_CXXFLAGS) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD + +$$($(1)_CUDADLINK): $$($(1)_CUOBJ) + $(CUDA_NVCC) $(CUDA_DLINKFLAGS) $(CUDA_LDFLAGS) $$($(1)_CUOBJ) -o $$@ + +$$($(1)_LIB): $$($(1)_OBJ) $$($(1)_CUDADLINK) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_DEPS)) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_LIB)) + @[ -d $$(@D) ] || mkdir -p $$(@D) + $(CXX) $$($(1)_OBJ) $$($(1)_CUOBJ) $$($(1)_CUDADLINK) $(LDFLAGS) -shared $(SO_LDFLAGS) $(LIB_LDFLAGS) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_LDFLAGS)) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_LDFLAGS)) -o $$@ +endef + +$(foreach lib,$(LIBNAMES),$(eval $(call BUILD_template,$(lib),$(TARGET_NAME)/$(lib)))) +$(foreach lib,$(PLUGINNAMES),$(eval $(call BUILD_template,$(lib),$(TARGET_NAME)/plugin-$(lib)))) + +$(OBJ_DIR)/$(TARGET_NAME)/bin/%.cc.o: $(SRC_DIR)/$(TARGET_NAME)/bin/%.cc + @[ -d $(@D) ] || mkdir -p $(@D) + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) $(foreach 
dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + @cp $(@D)/$*.cc.d $(@D)/$*.cc.d.tmp; \ + sed 's#\($(TARGET_NAME)/$*\)\.o[ :]*#\1.o \1.d : #g' < $(@D)/$*.cc.d.tmp > $(@D)/$*.cc.d; \ + sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ + -e '/^$$/ d' -e 's/$$/ :/' -e 's/ *//' < $(@D)/$*.cc.d.tmp >> $(@D)/$*.cc.d; \ + rm $(@D)/$*.cc.d.tmp + +# Tests +$(OBJ_DIR)/$(TARGET_NAME)/test/%.cc.o: $(SRC_DIR)/$(TARGET_NAME)/test/%.cc + @[ -d $(@D) ] || mkdir -p $(@D) + $(CXX) $(CXXFLAGS) $(CUDA_TEST_CXXFLAGS) $(MY_CXXFLAGS) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + @cp $(@D)/$*.cc.d $(@D)/$*.cc.d.tmp; \ + sed 's#\($(TARGET_NAME)/$*\)\.o[ :]*#\1.o \1.d : #g' < $(@D)/$*.cc.d.tmp > $(@D)/$*.cc.d; \ + sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ + -e '/^$$/ d' -e 's/$$/ :/' -e 's/ *//' < $(@D)/$*.cc.d.tmp >> $(@D)/$*.cc.d; \ + rm $(@D)/$*.cc.d.tmp + +$(TEST_DIR)/$(TARGET_NAME)/%: $(OBJ_DIR)/$(TARGET_NAME)/test/%.cc.o | $(LIBS) + @[ -d $(@D) ] || mkdir -p $(@D) + $(CXX) $< $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + +$(OBJ_DIR)/$(TARGET_NAME)/test/%.cu.o: $(SRC_DIR)/$(TARGET_NAME)/test/%.cu + @[ -d $(@D) ] || mkdir -p $(@D) + $(CUDA_NVCC) $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(MY_CXXFLAGS) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + +$(OBJ_DIR)/$(TARGET_NAME)/test/%.cudadlink.o: $(OBJ_DIR)/$(TARGET_NAME)/test/%.cu.o + $(CUDA_NVCC) $(CUDA_DLINKFLAGS) $(CUDA_LDFLAGS) $< -o $@ + +$(TEST_DIR)/$(TARGET_NAME)/%: $(OBJ_DIR)/$(TARGET_NAME)/test/%.cu.o $(OBJ_DIR)/$(TARGET_NAME)/test/%.cudadlink.o | $(LIBS) + @[ -d $(@D) ] || mkdir -p $(@D) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) diff --git a/src/cudacompat/Makefile.deps b/src/cudacompat/Makefile.deps new file mode 100644 index 000000000..30e5ac5c7 --- /dev/null +++ b/src/cudacompat/Makefile.deps @@ -0,0 +1,12 @@ +cudacompat_EXTERNAL_DEPENDS := TBB CUDA EIGEN +BeamSpotProducer_DEPENDS := Framework CUDACore CUDADataFormats +CUDACore_DEPENDS := Framework +CUDADataFormats_DEPENDS := CUDACore DataFormats +CondFormats_DEPENDS := CUDACore +PixelTriplets_DEPENDS := Framework CUDACore CUDADataFormats +PixelTrackFitting_DEPENDS := Framework CUDACore CUDADataFormats +PixelVertexFinding_DEPENDS := Framework CUDACore CUDADataFormats +SiPixelClusterizer_DEPENDS := Framework CUDACore CUDADataFormats CondFormats DataFormats +SiPixelRawToDigi_DEPENDS := Framework CUDACore CUDADataFormats CondFormats DataFormats +SiPixelRecHits_DEPENDS := Framework CUDACore CUDADataFormats CondFormats +Validation_DEPENDS := Framework CUDACore CUDADataFormats diff --git a/src/cudacompat/bin/EventProcessor.cc b/src/cudacompat/bin/EventProcessor.cc new file mode 100644 index 000000000..a50cc6570 --- /dev/null +++ b/src/cudacompat/bin/EventProcessor.cc @@ -0,0 +1,45 @@ +#include "Framework/EmptyWaitingTask.h" +#include "Framework/ESPluginFactory.h" +#include "Framework/WaitingTask.h" +#include "Framework/WaitingTaskHolder.h" + +#include "EventProcessor.h" + +namespace edm { + EventProcessor::EventProcessor(int maxEvents, + int numberOfStreams, + std::vector const& path, + std::vector const& esproducers, + std::filesystem::path const& datadir, + bool validation) + : source_(maxEvents, registry_, datadir, validation) { + for (auto const& name : esproducers) { + pluginManager_.load(name); + auto esp = 
ESPluginFactory::create(name, datadir); + esp->produce(eventSetup_); + } + + //schedules_.reserve(numberOfStreams); + for (int i = 0; i < numberOfStreams; ++i) { + schedules_.emplace_back(registry_, pluginManager_, &source_, &eventSetup_, i, path); + } + } + + void EventProcessor::runToCompletion() { + // The task that waits for all other work + auto globalWaitTask = make_empty_waiting_task(); + globalWaitTask->increment_ref_count(); + for (auto& s : schedules_) { + s.runToCompletionAsync(WaitingTaskHolder(globalWaitTask.get())); + } + globalWaitTask->wait_for_all(); + if (globalWaitTask->exceptionPtr()) { + std::rethrow_exception(*(globalWaitTask->exceptionPtr())); + } + } + + void EventProcessor::endJob() { + // Only on the first stream... + schedules_[0].endJob(); + } +} // namespace edm diff --git a/src/cudacompat/bin/EventProcessor.h b/src/cudacompat/bin/EventProcessor.h new file mode 100644 index 000000000..614e60c38 --- /dev/null +++ b/src/cudacompat/bin/EventProcessor.h @@ -0,0 +1,39 @@ +#ifndef EventProcessor_h +#define EventProcessor_h + +#include +#include +#include + +#include "Framework/EventSetup.h" + +#include "PluginManager.h" +#include "StreamSchedule.h" +#include "Source.h" + +namespace edm { + class EventProcessor { + public: + explicit EventProcessor(int maxEvents, + int numberOfStreams, + std::vector const& path, + std::vector const& esproducers, + std::filesystem::path const& datadir, + bool validation); + + int maxEvents() const { return source_.maxEvents(); } + + void runToCompletion(); + + void endJob(); + + private: + edmplugin::PluginManager pluginManager_; + ProductRegistry registry_; + Source source_; + EventSetup eventSetup_; + std::vector schedules_; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/bin/PluginManager.cc b/src/cudacompat/bin/PluginManager.cc new file mode 100644 index 000000000..7977cdbc2 --- /dev/null +++ b/src/cudacompat/bin/PluginManager.cc @@ -0,0 +1,39 @@ +#include +#include + +#include "PluginManager.h" + +#ifndef SRC_DIR +#error "SRC_DIR undefined" +#endif +#ifndef LIB_DIR +#error "LIB_DIR undefined" +#endif + +#define STR_EXPAND(x) #x +#define STR(x) STR_EXPAND(x) + +namespace edmplugin { + PluginManager::PluginManager() { + std::ifstream pluginMap(STR(SRC_DIR) "/plugins.txt"); + std::string plugin, library; + while (pluginMap >> plugin >> library) { + //std::cout << "plugin " << plugin << " in " << library << std::endl; + pluginToLibrary_[plugin] = library; + } + } + + SharedLibrary const& PluginManager::load(std::string const& pluginName) { + std::lock_guard guard(mutex_); + + auto libName = pluginToLibrary_.at(pluginName); + + auto found = loadedPlugins_.find(libName); + if (found == loadedPlugins_.end()) { + auto ptr = std::make_shared(STR(LIB_DIR) "/" + libName); + loadedPlugins_[libName] = ptr; + return *ptr; + } + return *(found->second); + } +} // namespace edmplugin diff --git a/src/cudacompat/bin/PluginManager.h b/src/cudacompat/bin/PluginManager.h new file mode 100644 index 000000000..3a1042cf8 --- /dev/null +++ b/src/cudacompat/bin/PluginManager.h @@ -0,0 +1,26 @@ +#ifndef PluginManager_h +#define PluginManager_h + +#include +#include +#include +#include + +#include "SharedLibrary.h" + +namespace edmplugin { + class PluginManager { + public: + PluginManager(); + + SharedLibrary const& load(std::string const& pluginName); + + private: + std::unordered_map pluginToLibrary_; + + std::recursive_mutex mutex_; + std::unordered_map> loadedPlugins_; + }; +} // namespace edmplugin + +#endif diff --git 
a/src/cudacompat/bin/SharedLibrary.cc b/src/cudacompat/bin/SharedLibrary.cc new file mode 100644 index 000000000..b0e70d85a --- /dev/null +++ b/src/cudacompat/bin/SharedLibrary.cc @@ -0,0 +1,82 @@ +// -*- C++ -*- +// +// Package: PluginManager +// Class : SharedLibrary +// +// Implementation: +// +// +// Original Author: Chris Jones +// Created: Thu Apr 5 15:30:15 EDT 2007 +// + +// system include files +#include /*needed by the following include*/ +#include +#include +#include + +// user include files +#include "SharedLibrary.h" + +namespace edmplugin { + // + // constants, enums and typedefs + // + + // + // static data member definitions + // + + // + // constructors and destructor + // + SharedLibrary::SharedLibrary(const std::string& iName) + : libraryHandle_(::dlopen(iName.c_str(), RTLD_LAZY | RTLD_GLOBAL)), path_(iName) { + if (libraryHandle_ == nullptr) { + char const* err = dlerror(); + if (err == nullptr) { + throw std::runtime_error("unable to load " + iName); + } + throw std::runtime_error("unable to load " + iName + " because " + err); + } + } + + // SharedLibrary::SharedLibrary(const SharedLibrary& rhs) + // { + // // do actual copying here; + // } + + SharedLibrary::~SharedLibrary() {} + + // + // assignment operators + // + // const SharedLibrary& SharedLibrary::operator=(const SharedLibrary& rhs) + // { + // //An exception safe implementation is + // SharedLibrary temp(rhs); + // swap(rhs); + // + // return *this; + // } + + // + // member functions + // + + // + // const member functions + // + bool SharedLibrary::symbol(const std::string& iSymbolName, void*& iSymbol) const { + if (libraryHandle_ == nullptr) { + return false; + } + iSymbol = dlsym(libraryHandle_, iSymbolName.c_str()); + return (iSymbol != nullptr); + } + + // + // static member functions + // +} // namespace edmplugin diff --git a/src/cudacompat/bin/SharedLibrary.h b/src/cudacompat/bin/SharedLibrary.h new file mode 100644 index 000000000..345b2fe84 --- /dev/null +++ b/src/cudacompat/bin/SharedLibrary.h @@ -0,0 +1,53 @@ +#ifndef FWCore_PluginManager_SharedLibrary_h +#define FWCore_PluginManager_SharedLibrary_h +// -*- C++ -*- +// +// Package: PluginManager +// Class : SharedLibrary +// +/**\class SharedLibrary SharedLibrary.h FWCore/PluginManager/interface/SharedLibrary.h + + Description: Handles the loading of a SharedLibrary + + Usage: + + +*/ +// +// Original Author: Chris Jones +// Created: Thu Apr 5 15:30:08 EDT 2007 +// + +// system include files +#include + +// user include files + +// forward declarations + +namespace edmplugin { + class SharedLibrary { + public: + SharedLibrary(const std::string& iName); + ~SharedLibrary(); + + // ---------- const member functions --------------------- + bool symbol(const std::string& iSymbolName, void*& iSymbol) const; + const std::string& path() const { return path_; } + + // ---------- static member functions -------------------- + + // ---------- member functions --------------------------- + + private: + SharedLibrary(const SharedLibrary&) = delete; // stop default + + const SharedLibrary& operator=(const SharedLibrary&) = delete; // stop default + + // ---------- member data -------------------------------- + void* libraryHandle_; + std::string path_; + }; + +} // namespace edmplugin +#endif diff --git a/src/cudacompat/bin/Source.cc b/src/cudacompat/bin/Source.cc new file mode 100644 index 000000000..64d1a3b5c --- /dev/null +++ b/src/cudacompat/bin/Source.cc @@ -0,0 +1,100 @@ +#include +#include +#include +#include + +#include "Source.h" + +namespace { + 
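+  // Reads one event's worth of FED data from the binary stream: for each of
+  // the nfeds FEDs the file stores the FED id and the payload size (both as
+  // unsigned int), followed by the payload bytes themselves.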
FEDRawDataCollection readRaw(std::ifstream &is, unsigned int nfeds) { + FEDRawDataCollection rawCollection; + for (unsigned int ifed = 0; ifed < nfeds; ++ifed) { + unsigned int fedId; + is.read(reinterpret_cast(&fedId), sizeof(unsigned int)); + unsigned int fedSize; + is.read(reinterpret_cast(&fedSize), sizeof(unsigned int)); + FEDRawData &rawData = rawCollection.FEDData(fedId); + rawData.resize(fedSize); + is.read(reinterpret_cast(rawData.data()), fedSize); + } + return rawCollection; + } + +} // namespace + +namespace edm { + Source::Source(int maxEvents, ProductRegistry ®, std::filesystem::path const &datadir, bool validation) + : maxEvents_(maxEvents), numEvents_(0), rawToken_(reg.produces()), validation_(validation) { + std::ifstream in_raw(datadir / "raw.bin", std::ios::binary); + std::ifstream in_digiclusters; + std::ifstream in_tracks; + std::ifstream in_vertices; + + if (validation_) { + digiClusterToken_ = reg.produces(); + trackToken_ = reg.produces(); + vertexToken_ = reg.produces(); + + in_digiclusters = std::ifstream(datadir / "digicluster.bin", std::ios::binary); + in_tracks = std::ifstream(datadir / "tracks.bin", std::ios::binary); + in_vertices = std::ifstream(datadir / "vertices.bin", std::ios::binary); + in_digiclusters.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + in_tracks.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + in_vertices.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + } + + unsigned int nfeds; + in_raw.exceptions(std::ifstream::badbit); + in_raw.read(reinterpret_cast(&nfeds), sizeof(unsigned int)); + while (not in_raw.eof()) { + in_raw.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + + raw_.emplace_back(readRaw(in_raw, nfeds)); + + if (validation_) { + unsigned int nm, nd, nc, nt, nv; + in_digiclusters.read(reinterpret_cast(&nm), sizeof(unsigned int)); + in_digiclusters.read(reinterpret_cast(&nd), sizeof(unsigned int)); + in_digiclusters.read(reinterpret_cast(&nc), sizeof(unsigned int)); + in_tracks.read(reinterpret_cast(&nt), sizeof(unsigned int)); + in_vertices.read(reinterpret_cast(&nv), sizeof(unsigned int)); + digiclusters_.emplace_back(nm, nd, nc); + tracks_.emplace_back(nt); + vertices_.emplace_back(nv); + } + + // next event + in_raw.exceptions(std::ifstream::badbit); + in_raw.read(reinterpret_cast(&nfeds), sizeof(unsigned int)); + } + + if (validation_) { + assert(raw_.size() == digiclusters_.size()); + assert(raw_.size() == tracks_.size()); + assert(raw_.size() == vertices_.size()); + } + + if (maxEvents_ < 0) { + maxEvents_ = raw_.size(); + } + } + + std::unique_ptr Source::produce(int streamId, ProductRegistry const ®) { + const int old = numEvents_.fetch_add(1); + const int iev = old + 1; + if (old >= maxEvents_) { + return nullptr; + } + auto ev = std::make_unique(streamId, iev, reg); + const int index = old % raw_.size(); + + ev->emplace(rawToken_, raw_[index]); + if (validation_) { + ev->emplace(digiClusterToken_, digiclusters_[index]); + ev->emplace(trackToken_, tracks_[index]); + ev->emplace(vertexToken_, vertices_[index]); + } + + return ev; + } +} // namespace edm diff --git a/src/cudacompat/bin/Source.h b/src/cudacompat/bin/Source.h new file mode 100644 index 000000000..c13534f33 --- /dev/null +++ b/src/cudacompat/bin/Source.h @@ -0,0 +1,40 @@ +#ifndef Source_h +#define Source_h + +#include +#include +#include +#include + +#include "Framework/Event.h" +#include 
"DataFormats/FEDRawDataCollection.h" +#include "DataFormats/DigiClusterCount.h" +#include "DataFormats/TrackCount.h" +#include "DataFormats/VertexCount.h" + +namespace edm { + class Source { + public: + explicit Source(int maxEvents, ProductRegistry& reg, std::filesystem::path const& datadir, bool validation); + + int maxEvents() const { return maxEvents_; } + + // thread safe + std::unique_ptr produce(int streamId, ProductRegistry const& reg); + + private: + int maxEvents_; + std::atomic numEvents_; + EDPutTokenT const rawToken_; + EDPutTokenT digiClusterToken_; + EDPutTokenT trackToken_; + EDPutTokenT vertexToken_; + std::vector raw_; + std::vector digiclusters_; + std::vector tracks_; + std::vector vertices_; + bool const validation_; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/bin/StreamSchedule.cc b/src/cudacompat/bin/StreamSchedule.cc new file mode 100644 index 000000000..8636bce24 --- /dev/null +++ b/src/cudacompat/bin/StreamSchedule.cc @@ -0,0 +1,94 @@ +//#include + +#include + +#include "Framework/FunctorTask.h" +#include "Framework/PluginFactory.h" +#include "Framework/WaitingTask.h" +#include "Framework/Worker.h" + +#include "PluginManager.h" +#include "Source.h" +#include "StreamSchedule.h" + +namespace edm { + StreamSchedule::StreamSchedule(ProductRegistry reg, + edmplugin::PluginManager& pluginManager, + Source* source, + EventSetup const* eventSetup, + int streamId, + std::vector const& path) + : registry_(std::move(reg)), source_(source), eventSetup_(eventSetup), streamId_(streamId) { + path_.reserve(path.size()); + int modInd = 1; + for (auto const& name : path) { + pluginManager.load(name); + registry_.beginModuleConstruction(modInd); + path_.emplace_back(PluginFactory::create(name, registry_)); + //std::cout << "module " << modInd << " " << path_.back().get() << std::endl; + std::vector consumes; + for (unsigned int depInd : registry_.consumedModules()) { + if (depInd != ProductRegistry::kSourceIndex) { + //std::cout << "module " << modInd << " depends on " << (depInd-1) << " " << path_[depInd-1].get() << std::endl; + consumes.push_back(path_[depInd - 1].get()); + } + } + path_.back()->setItemsToGet(std::move(consumes)); + ++modInd; + } + } + + StreamSchedule::~StreamSchedule() = default; + StreamSchedule::StreamSchedule(StreamSchedule&&) = default; + StreamSchedule& StreamSchedule::operator=(StreamSchedule&&) = default; + + void StreamSchedule::runToCompletionAsync(WaitingTaskHolder h) { + auto task = + make_functor_task(tbb::task::allocate_root(), [this, h]() mutable { processOneEventAsync(std::move(h)); }); + if (streamId_ == 0) { + tbb::task::spawn(*task); + } else { + tbb::task::enqueue(*task); + } + } + + void StreamSchedule::processOneEventAsync(WaitingTaskHolder h) { + auto event = source_->produce(streamId_, registry_); + if (event) { + // Pass the event object ownership to the "end-of-event" task + // Pass a non-owning pointer to the event to preceding tasks + //std::cout << "Begin processing event " << event->eventID() << std::endl; + auto eventPtr = event.get(); + auto nextEventTask = + make_waiting_task(tbb::task::allocate_root(), + [this, h = std::move(h), ev = std::move(event)](std::exception_ptr const* iPtr) mutable { + ev.reset(); + if (iPtr) { + h.doneWaiting(*iPtr); + } else { + for (auto const& worker : path_) { + worker->reset(); + } + processOneEventAsync(std::move(h)); + } + }); + // To guarantee that the nextEventTask is spawned also in + // absence of Workers, and also to prevent spawning it before + // all workers have been 
processed (should not happen though) + auto nextEventTaskHolder = WaitingTaskHolder(nextEventTask); + + for (auto iWorker = path_.rbegin(); iWorker != path_.rend(); ++iWorker) { + //std::cout << "calling doWorkAsync for " << iWorker->get() << " with nextEventTask " << nextEventTask << std::endl; + (*iWorker)->doWorkAsync(*eventPtr, *eventSetup_, nextEventTask); + } + } else { + h.doneWaiting(std::exception_ptr{}); + } + } + + void StreamSchedule::endJob() { + for (auto& w : path_) { + w->doEndJob(); + } + } +} // namespace edm diff --git a/src/cudacompat/bin/StreamSchedule.h b/src/cudacompat/bin/StreamSchedule.h new file mode 100644 index 000000000..1bd364c70 --- /dev/null +++ b/src/cudacompat/bin/StreamSchedule.h @@ -0,0 +1,51 @@ +#ifndef StreamSchedule_h +#define StreamSchedule_h + +#include +#include +#include + +#include "Framework/ProductRegistry.h" +#include "Framework/WaitingTaskHolder.h" + +namespace edmplugin { + class PluginManager; +} + +namespace edm { + class EventSetup; + class Source; + class Worker; + + // Schedule of modules per stream (concurrent event) + class StreamSchedule { + public: + // copy ProductRegistry per stream + explicit StreamSchedule(ProductRegistry reg, + edmplugin::PluginManager& pluginManager, + Source* source, + EventSetup const* eventSetup, + int streamId, + std::vector const& path); + ~StreamSchedule(); + StreamSchedule(StreamSchedule const&) = delete; + StreamSchedule& operator=(StreamSchedule const&) = delete; + StreamSchedule(StreamSchedule&&); + StreamSchedule& operator=(StreamSchedule&&); + + void runToCompletionAsync(WaitingTaskHolder h); + + void endJob(); + + private: + void processOneEventAsync(WaitingTaskHolder h); + + ProductRegistry registry_; + Source* source_; + EventSetup const* eventSetup_; + std::vector> path_; + int streamId_; + }; +} // namespace edm + +#endif diff --git a/src/cudacompat/bin/main.cc b/src/cudacompat/bin/main.cc new file mode 100644 index 000000000..c8a76eee5 --- /dev/null +++ b/src/cudacompat/bin/main.cc @@ -0,0 +1,171 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "EventProcessor.h" + +namespace { + void print_help(std::string const& name) { + std::cout + << name + << ": [--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] [--transfer] [--validation] " + "[--histogram] [--empty]\n\n" + << "Options\n" + << " --numberOfThreads Number of threads to use (default 1)\n" + << " --numberOfStreams Number of concurrent events (default 0=numberOfThreads)\n" + << " --maxEvents Number of events to process (default -1 for all events in the input file)\n" + << " --data Path to the 'data' directory (default 'data' in the directory of the executable)\n" + << " --transfer Transfer results from GPU to CPU (default is to leave them on GPU)\n" + << " --validation Run (rudimentary) validation at the end (implies --transfer)\n" + << " --histogram Produce histograms at the end (implies --transfer)\n" + << " --empty Ignore all producers (for testing only)\n" + << std::endl; + } +} // namespace + +int main(int argc, char** argv) { + // Parse command line arguments + std::vector args(argv, argv + argc); + int numberOfThreads = 1; + int numberOfStreams = 0; + int maxEvents = -1; + std::filesystem::path datadir; + bool transfer = false; + bool validation = false; + bool histogram = false; + bool empty = false; + for (auto i = args.begin() + 1, e = args.end(); i != e; ++i) { + if (*i == "-h" or *i == "--help") { + print_help(args.front()); + return 
EXIT_SUCCESS; + } else if (*i == "--numberOfThreads") { + ++i; + numberOfThreads = std::stoi(*i); + } else if (*i == "--numberOfStreams") { + ++i; + numberOfStreams = std::stoi(*i); + } else if (*i == "--maxEvents") { + ++i; + maxEvents = std::stoi(*i); + } else if (*i == "--data") { + ++i; + datadir = *i; + } else if (*i == "--transfer") { + transfer = true; + } else if (*i == "--validation") { + transfer = true; + validation = true; + } else if (*i == "--histogram") { + transfer = true; + histogram = true; + } else if (*i == "--empty") { + empty = true; + } else { + std::cout << "Invalid parameter " << *i << std::endl << std::endl; + print_help(args.front()); + return EXIT_FAILURE; + } + } + if (numberOfStreams == 0) { + numberOfStreams = numberOfThreads; + } + if (datadir.empty()) { + datadir = std::filesystem::path(args[0]).parent_path() / "data"; + } + if (not std::filesystem::exists(datadir)) { + std::cout << "Data directory '" << datadir << "' does not exist" << std::endl; + return EXIT_FAILURE; + } + int numberOfDevices; + auto status = cudaGetDeviceCount(&numberOfDevices); + if (cudaSuccess != status) { + std::cout << "Failed to initialize the CUDA runtime"; + return EXIT_FAILURE; + } + std::cout << "Found " << numberOfDevices << " devices" << std::endl; + + // Initialize EventProcessor + std::vector edmodules; + std::vector esmodules; + if (not empty) { + edmodules = { + "BeamSpotToCUDA", "SiPixelRawToClusterCUDA", "SiPixelRecHitCUDA", "CAHitNtupletCUDA", "PixelVertexProducerCUDA"}; + esmodules = {"BeamSpotESProducer", + "SiPixelFedCablingMapGPUWrapperESProducer", + "SiPixelGainCalibrationForHLTGPUESProducer", + "PixelCPEFastESProducer"}; + if (transfer) { + auto capos = std::find(edmodules.begin(), edmodules.end(), "CAHitNtupletCUDA"); + assert(capos != edmodules.end()); + edmodules.insert(capos + 1, "PixelTrackSoAFromCUDA"); + auto vertpos = std::find(edmodules.begin(), edmodules.end(), "PixelVertexProducerCUDA"); + assert(vertpos != edmodules.end()); + edmodules.insert(vertpos + 1, "PixelVertexSoAFromCUDA"); + } + if (validation) { + edmodules.emplace_back("CountValidator"); + } + if (histogram) { + edmodules.emplace_back("HistoValidator"); + } + } + edm::EventProcessor processor( + maxEvents, numberOfStreams, std::move(edmodules), std::move(esmodules), datadir, validation); + maxEvents = processor.maxEvents(); + + std::cout << "Processing " << maxEvents << " events, of which " << numberOfStreams << " concurrently, with " + << numberOfThreads << " threads." << std::endl; + + // Initialize tasks scheduler (thread pool) + tbb::task_scheduler_init tsi(numberOfThreads); + + // Run work + auto start = std::chrono::high_resolution_clock::now(); + try { + processor.runToCompletion(); + } catch (std::runtime_error& e) { + std::cout << "\n----------\nCaught std::runtime_error" << std::endl; + std::cout << e.what() << std::endl; + return EXIT_FAILURE; + } catch (std::exception& e) { + std::cout << "\n----------\nCaught std::exception" << std::endl; + std::cout << e.what() << std::endl; + return EXIT_FAILURE; + } catch (...) 
{ + std::cout << "\n----------\nCaught exception of unknown type" << std::endl; + return EXIT_FAILURE; + } + auto stop = std::chrono::high_resolution_clock::now(); + + // Run endJob + try { + processor.endJob(); + } catch (std::runtime_error& e) { + std::cout << "\n----------\nCaught std::runtime_error" << std::endl; + std::cout << e.what() << std::endl; + return EXIT_FAILURE; + } catch (std::exception& e) { + std::cout << "\n----------\nCaught std::exception" << std::endl; + std::cout << e.what() << std::endl; + return EXIT_FAILURE; + } catch (...) { + std::cout << "\n----------\nCaught exception of unknown type" << std::endl; + return EXIT_FAILURE; + } + + // Work done, report timing + auto diff = stop - start; + auto time = static_cast(std::chrono::duration_cast(diff).count()) / 1e6; + std::cout << "Processed " << maxEvents << " events in " << std::scientific << time << " seconds, throughput " + << std::defaultfloat << (maxEvents / time) << " events/s." << std::endl; + return EXIT_SUCCESS; +} diff --git a/src/cudacompat/plugin-BeamSpotProducer/BeamSpotESProducer.cc b/src/cudacompat/plugin-BeamSpotProducer/BeamSpotESProducer.cc new file mode 100644 index 000000000..3c7f1a1f0 --- /dev/null +++ b/src/cudacompat/plugin-BeamSpotProducer/BeamSpotESProducer.cc @@ -0,0 +1,28 @@ +#include "DataFormats/BeamSpotPOD.h" +#include "Framework/ESProducer.h" +#include "Framework/EventSetup.h" +#include "Framework/ESPluginFactory.h" + +#include +#include +#include + +class BeamSpotESProducer : public edm::ESProducer { +public: + explicit BeamSpotESProducer(std::filesystem::path const& datadir) : data_(datadir) {} + void produce(edm::EventSetup& eventSetup); + +private: + std::filesystem::path data_; +}; + +void BeamSpotESProducer::produce(edm::EventSetup& eventSetup) { + auto bs = std::make_unique(); + + std::ifstream in(data_ / "beamspot.bin", std::ios::binary); + in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + in.read(reinterpret_cast(bs.get()), sizeof(BeamSpotPOD)); + eventSetup.put(std::move(bs)); +} + +DEFINE_FWK_EVENTSETUP_MODULE(BeamSpotESProducer); diff --git a/src/cudacompat/plugin-BeamSpotProducer/BeamSpotToCUDA.cc b/src/cudacompat/plugin-BeamSpotProducer/BeamSpotToCUDA.cc new file mode 100644 index 000000000..48badcabf --- /dev/null +++ b/src/cudacompat/plugin-BeamSpotProducer/BeamSpotToCUDA.cc @@ -0,0 +1,44 @@ +#include + +#include + +#include "CUDACore/Product.h" +#include "CUDACore/ScopedContext.h" +#include "CUDACore/copyAsync.h" +#include "CUDACore/host_noncached_unique_ptr.h" +#include "CUDADataFormats/BeamSpotCUDA.h" +#include "DataFormats/BeamSpotPOD.h" +#include "Framework/EDProducer.h" +#include "Framework/Event.h" +#include "Framework/EventSetup.h" +#include "Framework/PluginFactory.h" + +class BeamSpotToCUDA : public edm::EDProducer { +public: + explicit BeamSpotToCUDA(edm::ProductRegistry& reg); + ~BeamSpotToCUDA() override = default; + + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + +private: + const edm::EDPutTokenT> bsPutToken_; + + cms::cuda::host::noncached::unique_ptr bsHost; +}; + +BeamSpotToCUDA::BeamSpotToCUDA(edm::ProductRegistry& reg) + : bsPutToken_{reg.produces>()}, + bsHost{cms::cuda::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} + +void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + *bsHost = iSetup.get(); + + cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; + + BeamSpotCUDA bsDevice(ctx.stream()); + cms::cuda::copyAsync(bsDevice.ptr(), bsHost, 
ctx.stream()); + + ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice)); +} + +DEFINE_FWK_MODULE(BeamSpotToCUDA); diff --git a/src/cudacompat/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudacompat/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc new file mode 100644 index 000000000..66e93f818 --- /dev/null +++ b/src/cudacompat/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc @@ -0,0 +1,65 @@ +#include + +#include "CUDACore/Product.h" +#include "CUDACore/HostProduct.h" +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "Framework/EventSetup.h" +#include "Framework/Event.h" +#include "Framework/PluginFactory.h" +#include "Framework/EDProducer.h" +#include "CUDACore/ScopedContext.h" + +class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork { +public: + explicit PixelTrackSoAFromCUDA(edm::ProductRegistry& reg); + ~PixelTrackSoAFromCUDA() override = default; + +private: + void acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; + + cms::cuda::host::unique_ptr m_soa; +}; + +PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg) + : tokenCUDA_(reg.consumes>()), + tokenSOA_(reg.produces()) {} + +void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + m_soa = inputData.toHostAsync(ctx.stream()); +} + +void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + /* + auto const & tsoa = *m_soa; + auto maxTracks = tsoa.stride(); + std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + assert(nHits==int(tsoa.hitIndices.size(it))); + if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... + nt++; + } + std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl; + */ + + // DO NOT make a copy (actually TWO....) + iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(m_soa))); + + assert(!m_soa); +} + +DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/src/cudacompat/plugin-PixelTriplets/BrokenLine.h b/src/cudacompat/plugin-PixelTriplets/BrokenLine.h new file mode 100644 index 000000000..0a4b5f28f --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/BrokenLine.h @@ -0,0 +1,565 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h +#define RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h + +#include + +#include "FitUtils.h" + +namespace BrokenLine { + + //!< Karimäki's parameters: (phi, d, k=1/R) + /*!< covariance matrix: \n + |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n + |cov(phi, d )|cov( d , d )|cov( k , d )| \n + |cov(phi, k )|cov( d , k )|cov( k , k )| + */ + using karimaki_circle_fit = Rfit::circle_fit; + + /*! + \brief data needed for the Broken Line fit procedure. 
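+
+ The template parameter is the number of hits in the track candidate; the
+ vectors and matrices below are sized accordingly.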
+ */ + template + struct PreparedBrokenLineData { + int q; //!< particle charge + Rfit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + Rfit::VectorNd s; //!< total distance traveled in the transverse plane + // starting from the pre-fitted closest approach + Rfit::VectorNd S; //!< total distance traveled (three-dimensional) + Rfit::VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane + Rfit::VectorNd VarBeta; //!< kink angles in the SZ plane + }; + + /*! + \brief Computes the Coulomb multiple scattering variance of the planar angle. + + \param length length of the track in the material. + \param B magnetic field in Gev/cm/c. + \param R radius of curvature (needed to evaluate p). + \param Layer denotes which of the four layers of the detector is the endpoint of the multiple scattered track. For example, if Layer=3, then the particle has just gone through the material between the second and the third layer. + + \todo add another Layer variable to identify also the start point of the track, so if there are missing hits or multiple hits, the part of the detector that the particle has traversed can be exactly identified. + + \warning the formula used here assumes beta=1, and so neglects the dependence of theta_0 on the mass of the particle at fixed momentum. + + \return the variance of the planar angle ((theta_0)^2 /3). + */ + __host__ __device__ inline double MultScatt( + const double& length, const double B, const double R, int Layer, double slope) { + // limit R to 20GeV... + auto pt2 = std::min(20., B * R); + pt2 *= pt2; + constexpr double XXI_0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm + //if(Layer==1) XXI_0=0.06/16.; + // else XXI_0=0.06/16.; + //XX_0*=1; + constexpr double geometry_factor = + 0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned + constexpr double fact = geometry_factor * Rfit::sqr(13.6 / 1000.); + return fact / (pt2 * (1. + Rfit::sqr(slope))) * (std::abs(length) * XXI_0) * + Rfit::sqr(1. + 0.038 * log(std::abs(length) * XXI_0)); + } + + /*! + \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0. + + \param slope tangent of the angle of rotation. + + \return 2D rotation matrix. + */ + __host__ __device__ inline Rfit::Matrix2d RotationMatrix(double slope) { + Rfit::Matrix2d Rot; + Rot(0, 0) = 1. / sqrt(1. + Rfit::sqr(slope)); + Rot(0, 1) = slope * Rot(0, 0); + Rot(1, 0) = -Rot(0, 1); + Rot(1, 1) = Rot(0, 0); + return Rot; + } + + /*! + \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a translation of the coordinate system, such that the old origin has coordinates (x0,y0) in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. + + \param circle circle fit in the old coordinate system. + \param x0 x coordinate of the translation vector. + \param y0 y coordinate of the translation vector. + \param jacobian passed by reference in order to save stack. 
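+
+ On return the parameters are updated in place (the curvature k is unchanged by
+ a translation) and the covariance matrix is propagated as
+ cov -> jacobian * cov * jacobian^T.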
+ */ + __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, + double x0, + double y0, + Rfit::Matrix3d& jacobian) { + double A, U, BB, C, DO, DP, uu, xi, v, mu, lambda, zeta; + DP = x0 * cos(circle.par(0)) + y0 * sin(circle.par(0)); + DO = x0 * sin(circle.par(0)) - y0 * cos(circle.par(0)) + circle.par(1); + uu = 1 + circle.par(2) * circle.par(1); + C = -circle.par(2) * y0 + uu * cos(circle.par(0)); + BB = circle.par(2) * x0 + uu * sin(circle.par(0)); + A = 2. * DO + circle.par(2) * (Rfit::sqr(DO) + Rfit::sqr(DP)); + U = sqrt(1. + circle.par(2) * A); + xi = 1. / (Rfit::sqr(BB) + Rfit::sqr(C)); + v = 1. + circle.par(2) * DO; + lambda = (0.5 * A) / (U * Rfit::sqr(1. + U)); + mu = 1. / (U * (1. + U)) + circle.par(2) * lambda; + zeta = Rfit::sqr(DO) + Rfit::sqr(DP); + + jacobian << xi * uu * v, -xi * Rfit::sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, + mu * zeta - lambda * A, 0, 0, 1.; + + circle.par(0) = atan2(BB, C); + circle.par(1) = A / (1 + U); + // circle.par(2)=circle.par(2); + + circle.cov = jacobian * circle.cov * jacobian.transpose(); + } + + /*! + \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param B magnetic field in Gev/cm/c. + \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). + */ + template + __host__ __device__ inline void prepareBrokenLineData(const M3xN& hits, + const V4& fast_fit, + const double B, + PreparedBrokenLineData& results) { + constexpr auto n = N; + u_int i; + Rfit::Vector2d d; + Rfit::Vector2d e; + + d = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); + e = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); + results.q = Rfit::cross2D(d, e) > 0 ? -1 : 1; + + const double slope = -results.q / fast_fit(3); + + Rfit::Matrix2d R = RotationMatrix(slope); + + // calculate radii and s + results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * Rfit::MatrixXd::Constant(1, n, 1); + e = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); + for (i = 0; i < n; i++) { + d = results.radii.block(0, i, 2, 1); + results.s(i) = results.q * fast_fit(2) * atan2(Rfit::cross2D(d, e), d.dot(e)); // calculates the arc length + } + Rfit::VectorNd z = hits.block(2, 0, 1, n).transpose(); + + //calculate S and Z + Rfit::Matrix2xNd pointsSZ = Rfit::Matrix2xNd::Zero(); + for (i = 0; i < n; i++) { + pointsSZ(0, i) = results.s(i); + pointsSZ(1, i) = z(i); + pointsSZ.block(0, i, 2, 1) = R * pointsSZ.block(0, i, 2, 1); + } + results.S = pointsSZ.block(0, 0, 1, n).transpose(); + results.Z = pointsSZ.block(1, 0, 1, n).transpose(); + + //calculate VarBeta + results.VarBeta(0) = results.VarBeta(n - 1) = 0; + for (i = 1; i < n - 1; i++) { + results.VarBeta(i) = MultScatt(results.S(i + 1) - results.S(i), B, fast_fit(2), i + 2, slope) + + MultScatt(results.S(i) - results.S(i - 1), B, fast_fit(2), i + 1, slope); + } + } + + /*! + \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. This is the whole matrix in the case of the line fit and the main n-by-n block in the case of the circle fit. + + \param w weights of the first part of the cost function, the one with the measurements and not the angles (\sum_{i=1}^n w*(y_i-u_i)^2). + \param S total distance traveled by the particle from the pre-fitted closest approach. 
+ \param VarBeta kink angles' variance. + + \return the n-by-n matrix of the linear system + */ + template + __host__ __device__ inline Rfit::MatrixNd MatrixC_u(const Rfit::VectorNd& w, + const Rfit::VectorNd& S, + const Rfit::VectorNd& VarBeta) { + constexpr u_int n = N; + u_int i; + + Rfit::MatrixNd C_U = Rfit::MatrixNd::Zero(); + for (i = 0; i < n; i++) { + C_U(i, i) = w(i); + if (i > 1) + C_U(i, i) += 1. / (VarBeta(i - 1) * Rfit::sqr(S(i) - S(i - 1))); + if (i > 0 && i < n - 1) + C_U(i, i) += (1. / VarBeta(i)) * Rfit::sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + if (i < n - 2) + C_U(i, i) += 1. / (VarBeta(i + 1) * Rfit::sqr(S(i + 1) - S(i))); + + if (i > 0 && i < n - 1) + C_U(i, i + 1) = + 1. / (VarBeta(i) * (S(i + 1) - S(i))) * (-(S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + if (i < n - 2) + C_U(i, i + 1) += 1. / (VarBeta(i + 1) * (S(i + 1) - S(i))) * + (-(S(i + 2) - S(i)) / ((S(i + 2) - S(i + 1)) * (S(i + 1) - S(i)))); + + if (i < n - 2) + C_U(i, i + 2) = 1. / (VarBeta(i + 1) * (S(i + 2) - S(i + 1)) * (S(i + 1) - S(i))); + + C_U(i, i) *= 0.5; + } + return C_U + C_U.transpose(); + } + + /*! + \brief A very fast helix fit. + + \param hits the measured hits. + + \return (X0,Y0,R,tan(theta)). + + \warning sign of theta is (intentionally, for now) mistaken for negative charges. + */ + + template + __host__ __device__ inline void BL_Fast_fit(const M3xN& hits, V4& result) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; // get the number of hits + + const Rfit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Rfit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); + const Rfit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); + + auto tmp = 0.5 / Rfit::cross2D(c, a); + result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; + result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; + // check Wikipedia for these formulas + + result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(Rfit::cross2D(b, a))); + // Using Math Olympiad's formula R=abc/(4A) + + const Rfit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const Rfit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + + result(3) = result(2) * atan2(Rfit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); + // ds/dz slope between last and first point + } + + /*! + \brief Performs the Broken Line fit in the curved track case (that is, the fit parameters are the interceptions u and the curvature correction \Delta\kappa). + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param B magnetic field in Gev/cm/c. + \param data PreparedBrokenLineData. + \param circle_results struct to be filled with the results in this form: + -par parameter of the line in this form: (phi, d, k); \n + -cov covariance matrix of the fitted parameter; \n + -chi2 value of the cost function in the minimum. + + \details The function implements the steps 2 and 3 of the Broken Line fit with the curvature correction.\n + The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and \Delta\kappa and their covariance matrix. + The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. 
It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. + */ + template + __host__ __device__ inline void BL_Circle_fit(const M3xN& hits, + const M6xN& hits_ge, + const V4& fast_fit, + const double B, + PreparedBrokenLineData& data, + karimaki_circle_fit& circle_results) { + constexpr u_int n = N; + u_int i; + + circle_results.q = data.q; + auto& radii = data.radii; + const auto& s = data.s; + const auto& S = data.S; + auto& Z = data.Z; + auto& VarBeta = data.VarBeta; + const double slope = -circle_results.q / fast_fit(3); + VarBeta *= 1. + Rfit::sqr(slope); // the kink angles are projected! + + for (i = 0; i < n; i++) { + Z(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); + } + + Rfit::Matrix2d V; // covariance matrix + Rfit::VectorNd w; // weights + Rfit::Matrix2d RR; // rotation matrix point by point + //double Slope; // slope of the circle point by point + for (i = 0; i < n; i++) { + V(0, 0) = hits_ge.col(i)[0]; // x errors + V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy + V(1, 1) = hits_ge.col(i)[2]; // y errors + //Slope=-radii(0,i)/radii(1,i); + RR = RotationMatrix(-radii(0, i) / radii(1, i)); + w(i) = 1. / ((RR * V * RR.transpose())(1, 1)); // compute the orthogonal weight point by point + } + + Rfit::VectorNplusONEd r_u; + r_u(n) = 0; + for (i = 0; i < n; i++) { + r_u(i) = w(i) * Z(i); + } + + Rfit::MatrixNplusONEd C_U; + C_U.block(0, 0, n, n) = MatrixC_u(w, s, VarBeta); + C_U(n, n) = 0; + //add the border to the C_u matrix + for (i = 0; i < n; i++) { + C_U(i, n) = 0; + if (i > 0 && i < n - 1) { + C_U(i, n) += + -(s(i + 1) - s(i - 1)) * (s(i + 1) - s(i - 1)) / (2. * VarBeta(i) * (s(i + 1) - s(i)) * (s(i) - s(i - 1))); + } + if (i > 1) { + C_U(i, n) += (s(i) - s(i - 2)) / (2. * VarBeta(i - 1) * (s(i) - s(i - 1))); + } + if (i < n - 2) { + C_U(i, n) += (s(i + 2) - s(i)) / (2. * VarBeta(i + 1) * (s(i + 1) - s(i))); + } + C_U(n, i) = C_U(i, n); + if (i > 0 && i < n - 1) + C_U(n, n) += Rfit::sqr(s(i + 1) - s(i - 1)) / (4. * VarBeta(i)); + } + +#ifdef CPP_DUMP + std::cout << "CU5\n" << C_U << std::endl; +#endif + Rfit::MatrixNplusONEd I; + math::cholesky::invert(C_U, I); + // Rfit::MatrixNplusONEd I = C_U.inverse(); +#ifdef CPP_DUMP + std::cout << "I5\n" << I << std::endl; +#endif + + Rfit::VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system + + // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... + + radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); + radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); + + Rfit::Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); + Rfit::Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); + + circle_results.par << atan2((e - d)(1), (e - d)(0)), + -circle_results.q * (fast_fit(2) - sqrt(Rfit::sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), + circle_results.q * (1. 
/ fast_fit(2) + u(n)); + + assert(circle_results.q * circle_results.par(1) <= 0); + + Rfit::Vector2d eMinusd = e - d; + double tmp1 = eMinusd.squaredNorm(); + + Rfit::Matrix3d jacobian; + jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, + (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) / tmp1, 0, + (circle_results.q / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / + sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), + (circle_results.q / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / + sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), + 0, 0, 0, circle_results.q; + + circle_results.cov << I(0, 0), I(0, 1), I(0, n), I(1, 0), I(1, 1), I(1, n), I(n, 0), I(n, 1), I(n, n); + + circle_results.cov = jacobian * circle_results.cov * jacobian.transpose(); + + //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... + + TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), jacobian); + circle_results.cov(0, 0) += (1 + Rfit::sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); + + //...And translate back to the original system + + TranslateKarimaki(circle_results, d(0), d(1), jacobian); + + // compute chi2 + circle_results.chi2 = 0; + for (i = 0; i < n; i++) { + circle_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); + if (i > 0 && i < n - 1) + circle_results.chi2 += Rfit::sqr(u(i - 1) / (s(i) - s(i - 1)) - + u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + + u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / + VarBeta(i); + } + + // assert(circle_results.chi2>=0); + } + + /*! + \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u). + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param B magnetic field in Gev/cm/c. + \param data PreparedBrokenLineData. + \param line_results struct to be filled with the results in this form: + -par parameter of the line in this form: (cot(theta), Zip); \n + -cov covariance matrix of the fitted parameter; \n + -chi2 value of the cost function in the minimum. + + \details The function implements the steps 2 and 3 of the Broken Line fit without the curvature correction.\n + The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and their covariance matrix. + The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. 
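Step 2 of both the circle fit above and the line fit that follows boils down to building a symmetric positive-definite band matrix C_U and the weighted residual vector r_u(i) = w(i)*Z(i), then solving C_U * u = r_u. The patch does this with math::cholesky::invert from CUDACore; the sketch below is a minimal, hypothetical Cholesky solve on a toy 4x4 system, included only to make that linear-algebra step concrete.

// Minimal sketch of the "step 2" solve: C_U * u = r_u with C_U symmetric positive
// definite, factored as L * L^T and solved by substitution. Illustration only; the
// patch uses math::cholesky::invert, not this code, and the matrix here is a toy.
#include <array>
#include <cmath>
#include <cstdio>

constexpr int N = 4;

void choleskySolve(const std::array<std::array<double, N>, N>& A,
                   const std::array<double, N>& b,
                   std::array<double, N>& x) {
  std::array<std::array<double, N>, N> L{};
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j <= i; ++j) {
      double s = A[i][j];
      for (int k = 0; k < j; ++k)
        s -= L[i][k] * L[j][k];
      L[i][j] = (i == j) ? std::sqrt(s) : s / L[j][j];
    }
  }
  std::array<double, N> y{};
  for (int i = 0; i < N; ++i) {  // forward substitution: L y = b
    double s = b[i];
    for (int k = 0; k < i; ++k)
      s -= L[i][k] * y[k];
    y[i] = s / L[i][i];
  }
  for (int i = N - 1; i >= 0; --i) {  // backward substitution: L^T x = y
    double s = y[i];
    for (int k = i + 1; k < N; ++k)
      s -= L[k][i] * x[k];
    x[i] = s / L[i][i];
  }
}

int main() {
  // toy banded SPD matrix playing the role of C_U, and r_u(i) = w(i) * Z(i)
  std::array<std::array<double, N>, N> C_U = {{{4, 1, 0, 0}, {1, 4, 1, 0}, {0, 1, 4, 1}, {0, 0, 1, 4}}};
  std::array<double, N> r_u = {1.0, 0.5, -0.2, 0.3};
  std::array<double, N> u{};
  choleskySolve(C_U, r_u, u);
  std::printf("u = %f %f %f %f\n", u[0], u[1], u[2], u[3]);
  return 0;
}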
+ */ + template + __host__ __device__ inline void BL_Line_fit(const M6xN& hits_ge, + const V4& fast_fit, + const double B, + const PreparedBrokenLineData& data, + Rfit::line_fit& line_results) { + constexpr u_int n = N; + u_int i; + + const auto& radii = data.radii; + const auto& S = data.S; + const auto& Z = data.Z; + const auto& VarBeta = data.VarBeta; + + const double slope = -data.q / fast_fit(3); + Rfit::Matrix2d R = RotationMatrix(slope); + + Rfit::Matrix3d V = Rfit::Matrix3d::Zero(); // covariance matrix XYZ + Rfit::Matrix2x3d JacobXYZtosZ = Rfit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + Rfit::VectorNd w = Rfit::VectorNd::Zero(); + for (i = 0; i < n; i++) { + V(0, 0) = hits_ge.col(i)[0]; // x errors + V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy + V(0, 2) = V(2, 0) = hits_ge.col(i)[3]; // cov_xz + V(1, 1) = hits_ge.col(i)[2]; // y errors + V(2, 1) = V(1, 2) = hits_ge.col(i)[4]; // cov_yz + V(2, 2) = hits_ge.col(i)[5]; // z errors + auto tmp = 1. / radii.block(0, i, 2, 1).norm(); + JacobXYZtosZ(0, 0) = radii(1, i) * tmp; + JacobXYZtosZ(0, 1) = -radii(0, i) * tmp; + JacobXYZtosZ(1, 2) = 1.; + w(i) = 1. / ((R * JacobXYZtosZ * V * JacobXYZtosZ.transpose() * R.transpose())( + 1, 1)); // compute the orthogonal weight point by point + } + + Rfit::VectorNd r_u; + for (i = 0; i < n; i++) { + r_u(i) = w(i) * Z(i); + } +#ifdef CPP_DUMP + std::cout << "CU4\n" << MatrixC_u(w, S, VarBeta) << std::endl; +#endif + Rfit::MatrixNd I; + math::cholesky::invert(MatrixC_u(w, S, VarBeta), I); + // Rfit::MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); +#ifdef CPP_DUMP + std::cout << "I4\n" << I << std::endl; +#endif + + Rfit::VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system + + // line parameters in the system in which the first hit is the origin and with axis along SZ + line_results.par << (u(1) - u(0)) / (S(1) - S(0)), u(0); + auto idiff = 1. / (S(1) - S(0)); + line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * Rfit::sqr(idiff) + + MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope), + (I(0, 1) - I(0, 0)) * idiff, (I(0, 1) - I(0, 0)) * idiff, I(0, 0); + + // translate to the original SZ system + Rfit::Matrix2d jacobian; + jacobian(0, 0) = 1.; + jacobian(0, 1) = 0; + jacobian(1, 0) = -S(0); + jacobian(1, 1) = 1.; + line_results.par(1) += -line_results.par(0) * S(0); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); + + // rotate to the original sz system + auto tmp = R(0, 0) - line_results.par(0) * R(0, 1); + jacobian(1, 1) = 1. / tmp; + jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); + jacobian(0, 1) = 0; + jacobian(1, 0) = line_results.par(1) * R(0, 1) * jacobian(0, 0); + line_results.par(1) = line_results.par(1) * jacobian(1, 1); + line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * jacobian(1, 1); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); + + // compute chi2 + line_results.chi2 = 0; + for (i = 0; i < n; i++) { + line_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); + if (i > 0 && i < n - 1) + line_results.chi2 += Rfit::sqr(u(i - 1) / (S(i) - S(i - 1)) - + u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + + u(i + 1) / (S(i + 1) - S(i))) / + VarBeta(i); + } + + // assert(line_results.chi2>=0); + } + + /*! 
+ \brief Helix fit in three steps:
+ -fast pre-fit (see Fast_fit() for further info); \n
+ -circle fit of the hits projected in the transverse plane by Broken Line algorithm (see BL_Circle_fit() for further info); \n
+ -line fit of the hits projected on the (pre-fitted) cylinder surface by Broken Line algorithm (see BL_Line_fit() for further info); \n
+ Points must be passed ordered (from inner to outer layer).
+
+ \param hits Matrix3xNd hits coordinates in this form: \n
+ |x1|x2|x3|...|xn| \n
+ |y1|y2|y3|...|yn| \n
+ |z1|z2|z3|...|zn|
+ \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n
+ |(x1,x1)|(x2,x1)|(x3,x1)|(x4,x1)|.|(y1,x1)|(y2,x1)|(y3,x1)|(y4,x1)|.|(z1,x1)|(z2,x1)|(z3,x1)|(z4,x1)| \n
+ |(x1,x2)|(x2,x2)|(x3,x2)|(x4,x2)|.|(y1,x2)|(y2,x2)|(y3,x2)|(y4,x2)|.|(z1,x2)|(z2,x2)|(z3,x2)|(z4,x2)| \n
+ |(x1,x3)|(x2,x3)|(x3,x3)|(x4,x3)|.|(y1,x3)|(y2,x3)|(y3,x3)|(y4,x3)|.|(z1,x3)|(z2,x3)|(z3,x3)|(z4,x3)| \n
+ |(x1,x4)|(x2,x4)|(x3,x4)|(x4,x4)|.|(y1,x4)|(y2,x4)|(y3,x4)|(y4,x4)|.|(z1,x4)|(z2,x4)|(z3,x4)|(z4,x4)| \n
+ . . . . . . . . . . . . . . . \n
+ |(x1,y1)|(x2,y1)|(x3,y1)|(x4,y1)|.|(y1,y1)|(y2,y1)|(y3,y1)|(y4,y1)|.|(z1,y1)|(z2,y1)|(z3,y1)|(z4,y1)| \n
+ |(x1,y2)|(x2,y2)|(x3,y2)|(x4,y2)|.|(y1,y2)|(y2,y2)|(y3,y2)|(y4,y2)|.|(z1,y2)|(z2,y2)|(z3,y2)|(z4,y2)| \n
+ |(x1,y3)|(x2,y3)|(x3,y3)|(x4,y3)|.|(y1,y3)|(y2,y3)|(y3,y3)|(y4,y3)|.|(z1,y3)|(z2,y3)|(z3,y3)|(z4,y3)| \n
+ |(x1,y4)|(x2,y4)|(x3,y4)|(x4,y4)|.|(y1,y4)|(y2,y4)|(y3,y4)|(y4,y4)|.|(z1,y4)|(z2,y4)|(z3,y4)|(z4,y4)| \n
+ . . . . . . . . . . . . . . . \n
+ |(x1,z1)|(x2,z1)|(x3,z1)|(x4,z1)|.|(y1,z1)|(y2,z1)|(y3,z1)|(y4,z1)|.|(z1,z1)|(z2,z1)|(z3,z1)|(z4,z1)| \n
+ |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n
+ |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n
+ |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)|
+ \param B magnetic field in the center of the detector in GeV/cm/c, in order to perform the p_t calculation.
+
+ \warning see BL_Circle_fit(), BL_Line_fit() and Fast_fit() warnings.
+
+ \bug see BL_Circle_fit(), BL_Line_fit() and Fast_fit() bugs.
+
+ \return (phi,Tip,p_t,cot(theta),Zip), their covariance matrix and the chi2's of the circle and line fits.
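After the circle fit, the helix fit in the function that follows converts the fitted curvature k into p_t = B/|k| and propagates the covariance with the jacobian entry dp_t/dk = -B*|k|/k^3. A standalone sketch of that one-variable error propagation, with made-up values for B, k and var(k):

// Sketch of the k -> p_t change of variable done after the circle fit.
// The numerical inputs below are invented for illustration only.
#include <cmath>
#include <cstdio>

int main() {
  const double B = 0.0114;      // effective field in the GeV/cm/c convention used by the fit
  const double k = -0.0038;     // fitted signed curvature, 1/cm (made up)
  const double var_k = 1.6e-9;  // variance of k from the circle covariance (made up)

  const double pt = B / std::abs(k);
  const double dpt_dk = -B * std::abs(k) / (k * k * k);  // same expression as the jacobian(2,2) entry
  const double var_pt = dpt_dk * dpt_dk * var_k;

  std::printf("pt = %.3f GeV, sigma(pt) = %.3f GeV\n", pt, std::sqrt(var_pt));
  return 0;
}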
+ */ + template + inline Rfit::helix_fit BL_Helix_fit(const Rfit::Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double B) { + Rfit::helix_fit helix; + Rfit::Vector4d fast_fit; + BL_Fast_fit(hits, fast_fit); + + PreparedBrokenLineData data; + karimaki_circle_fit circle; + Rfit::line_fit line; + Rfit::Matrix3d jacobian; + + prepareBrokenLineData(hits, fast_fit, B, data); + BL_Line_fit(hits_ge, fast_fit, B, data, line); + BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + + // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix + jacobian << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (Rfit::sqr(circle.par(2)) * circle.par(2)); + circle.par(2) = B / std::abs(circle.par(2)); + circle.cov = jacobian * circle.cov * jacobian.transpose(); + + helix.par << circle.par, line.par; + helix.cov = Rfit::MatrixXd::Zero(5, 5); + helix.cov.block(0, 0, 3, 3) = circle.cov; + helix.cov.block(3, 3, 2, 2) = line.cov; + helix.q = circle.q; + helix.chi2_circle = circle.chi2; + helix.chi2_line = line.chi2; + + return helix; + } + +} // namespace BrokenLine + +#endif // RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h diff --git a/src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.cc b/src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.cc new file mode 100644 index 000000000..cc5865d97 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.cc @@ -0,0 +1,68 @@ +#include "BrokenLineFitOnGPU.h" + +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { + assert(tuples_d); + + // Fit internals + auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto fast_fit_resultsGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + kernelBLFastFit<3>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + + kernelBLFit<3>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + + // fit quads + kernelBLFastFit<4>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + + kernelBLFit<4>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + + if (fit5as4_) { + // fit penta (only first 4) + kernelBLFastFit<4>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + kernelBLFit<4>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } else { + // fit penta (all 5) + kernelBLFastFit<5>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + kernelBLFit<5>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } + + } // loop on concurrent fits +} diff --git a/src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.cu b/src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.cu new file 
mode 100644 index 000000000..c1ba97c29 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.cu @@ -0,0 +1,85 @@ +#include "BrokenLineFitOnGPU.h" +#include "CUDACore/device_unique_ptr.h" + +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + kernelBLFastFit<3><<>>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<3><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + cudaCheck(cudaGetLastError()); + + // fit quads + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<4><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // fit penta (only first 4) + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<4><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + cudaCheck(cudaGetLastError()); + } else { + // fit penta (all 5) + kernelBLFastFit<5><<>>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<5><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + cudaCheck(cudaGetLastError()); + } + + } // loop on concurrent fits +} diff --git a/src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.h b/src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.h new file mode 100644 index 000000000..a30c251b7 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/BrokenLineFitOnGPU.h @@ -0,0 +1,185 @@ +// +// Author: Felice Pantaleo, CERN +// + +// #define BROKENLINE_DEBUG + +#include + +#include + +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/cuda_assert.h" +#include "CondFormats/pixelCPEforGPU.h" + +#include "BrokenLine.h" +#include "HelixFitOnGPU.h" + +using HitsOnGPU = TrackingRecHit2DSOAView; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; + +// #define BL_DUMP_HITS + +template +__global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, + CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + HitsOnGPU const *__restrict__ hhp, + double 
*__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { + constexpr uint32_t hitsInFit = N; + + assert(hitsInFit <= nHits); + + assert(hhp); + assert(pfast_fit); + assert(foundNtuplets); + assert(tupleMultiplicity); + + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + +#ifdef BROKENLINE_DEBUG + if (0 == local_start) { + printf("%d total Ntuple\n", foundNtuplets->nbins()); + printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); + } +#endif + + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + assert(tkid < foundNtuplets->nbins()); + + assert(foundNtuplets->size(tkid) == nHits); + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + +#ifdef BL_DUMP_HITS + __shared__ int done; + done = 0; + __syncthreads(); + bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1)); +#endif + + // Prepare data structure + auto const *hitId = foundNtuplets->begin(tkid); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + float ge[6]; + hhp->cpeParams() + .detParams(hhp->detectorIndex(hit)) + .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); +#ifdef BL_DUMP_HITS + if (dump) { + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", + tkid, + hhp->detectorIndex(hit), + i, + hhp->xGlobal(hit), + hhp->yGlobal(hit), + hhp->zGlobal(hit)); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", + tkid, + hhp->detetectorIndex(hit), + i, + ge[0], + ge[1], + ge[2], + ge[3], + ge[4], + ge[5]); + } +#endif + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + BrokenLine::BL_Fast_fit(hits, fast_fit); + + // no NaN here.... + assert(fast_fit(0) == fast_fit(0)); + assert(fast_fit(1) == fast_fit(1)); + assert(fast_fit(2) == fast_fit(2)); + assert(fast_fit(3) == fast_fit(3)); + } +} + +template +__global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + double B, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { + assert(N <= nHits); + + assert(results); + assert(pfast_fit); + + // same as above... 
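Both kernels above and below walk the list of concurrent fits with a grid-stride loop plus a per-chunk offset. A minimal, self-contained CUDA sketch of that launch pattern follows; it is not the patch's kernels, and the buffer sizes, chunk size and block size are illustrative only.

// Grid-stride loop sketch: local index + per-chunk offset, advancing by the total
// number of threads in the grid, as in kernelBLFastFit / kernelBLFit.
#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void gridStrideScale(const float* in, float* out, unsigned int nWork, unsigned int offset) {
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < nWork; i += gridDim.x * blockDim.x) {
    unsigned int j = i + offset;  // same idea as tuple_idx = local_idx + offset
    out[j] = 2.f * in[j];
  }
}

int main() {
  const unsigned int n = 1 << 16;
  float *in = nullptr, *out = nullptr;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (unsigned int i = 0; i < n; ++i)
    in[i] = float(i);

  const unsigned int blockSize = 64;  // block size used for the broken-line kernels
  const unsigned int chunk = 4096;    // plays the role of maxNumberOfConcurrentFits_
  for (unsigned int offset = 0; offset < n; offset += chunk) {
    unsigned int work = std::min(chunk, n - offset);
    unsigned int blocks = (work + blockSize - 1) / blockSize;
    gridStrideScale<<<blocks, blockSize>>>(in, out, work, offset);
  }
  cudaDeviceSynchronize();
  std::printf("out[100] = %f (expected 200)\n", out[100]);
  cudaFree(in);
  cudaFree(out);
  return 0;
}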
+ + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it for the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + + BrokenLine::PreparedBrokenLineData data; + Rfit::Matrix3d Jacob; + + BrokenLine::karimaki_circle_fit circle; + Rfit::line_fit line; + + BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); + BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + + results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(B), tkid); + results->pt(tkid) = float(B) / float(std::abs(circle.par(2))); + results->eta(tkid) = asinhf(line.par(0)); + results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); + +#ifdef BROKENLINE_DEBUG + if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) + printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2); + printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle.par(0), + circle.par(1), + circle.par(2)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); + printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle.chi2, + line.chi2, + circle.cov(0, 0), + circle.cov(1, 1), + circle.cov(2, 2), + line.cov(0, 0), + line.cov(1, 1)); +#endif + } +} diff --git a/src/cudacompat/plugin-PixelTriplets/CAConstants.h b/src/cudacompat/plugin-PixelTriplets/CAConstants.h new file mode 100644 index 000000000..b063d0f6e --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAConstants.h @@ -0,0 +1,69 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h + +#include + +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/SimpleVector.h" +#include "CUDACore/VecArray.h" +#include "CUDADataFormats/gpuClusteringConstants.h" + +// #define ONLY_PHICUT + +namespace CAConstants { + + // constants +#ifndef ONLY_PHICUT +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumberOfTuples() { return 3 * 1024; } +#else + constexpr uint32_t maxNumberOfTuples() { return 24 * 1024; } +#endif +#else + constexpr uint32_t maxNumberOfTuples() { return 48 * 1024; } +#endif + constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } +#ifndef ONLY_PHICUT +#ifndef GPU_SMALL_EVENTS + constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; } + constexpr uint32_t maxCellsPerHit() { return 128; } +#else + constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } + constexpr uint32_t maxCellsPerHit() { return 128 / 2; } +#endif +#else + constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } + constexpr uint32_t maxCellsPerHit() { return 8 * 128; } +#endif + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; } + + constexpr uint32_t maxNumberOfLayerPairs() { return 20; } + constexpr uint32_t maxNumberOfLayers() { return 10; } + constexpr uint32_t maxTuples() { return maxNumberOfTuples(); } + + // types + using hindex_type = uint16_t; // FIXME from 
siPixelRecHitsHeterogeneousProduct + using tindex_type = uint16_t; // for tuples + +#ifndef ONLY_PHICUT + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; +#else + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; +#endif + + using CellNeighborsVector = cms::cuda::SimpleVector; + using CellTracksVector = cms::cuda::SimpleVector; + + using OuterHitOfCell = cms::cuda::VecArray; + using TuplesContainer = cms::cuda::OneToManyAssoc; + using HitToTuple = + cms::cuda::OneToManyAssoc; // 3.5 should be enough + using TupleMultiplicity = cms::cuda::OneToManyAssoc; + +} // namespace CAConstants + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc new file mode 100644 index 000000000..57baea007 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc @@ -0,0 +1,44 @@ +#include + +#include "CUDACore/Product.h" +#include "Framework/EventSetup.h" +#include "Framework/Event.h" +#include "Framework/PluginFactory.h" +#include "Framework/EDProducer.h" +#include "Framework/RunningAverage.h" +#include "CUDACore/ScopedContext.h" + +#include "CAHitNtupletGeneratorOnGPU.h" +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" + +class CAHitNtupletCUDA : public edm::EDProducer { +public: + explicit CAHitNtupletCUDA(edm::ProductRegistry& reg); + ~CAHitNtupletCUDA() override = default; + +private: + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> tokenHitGPU_; + edm::EDPutTokenT> tokenTrackGPU_; + + CAHitNtupletGeneratorOnGPU gpuAlgo_; +}; + +CAHitNtupletCUDA::CAHitNtupletCUDA(edm::ProductRegistry& reg) + : tokenHitGPU_{reg.consumes>()}, + tokenTrackGPU_{reg.produces>()}, + gpuAlgo_(reg) {} + +void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { + auto bf = 0.0114256972711507; // 1/fieldInGeV + + auto const& phits = iEvent.get(tokenHitGPU_); + cms::cuda::ScopedContextProduce ctx{phits}; + auto const& hits = ctx.get(phits); + + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); +} + +DEFINE_FWK_MODULE(CAHitNtupletCUDA); diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc new file mode 100644 index 000000000..4ea687af3 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc @@ -0,0 +1,184 @@ +#include "CAHitNtupletGeneratorKernelsImpl.h" + +template <> +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { + kernel_printCounters(counters); +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) { + kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... 
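The CPU ("cudacompat") specialization below obtains its workspaces from plain malloc and hands the pointers to the class's smart-pointer members via reset(). The sketch that follows shows the general idea, malloc-backed storage behind a unique_ptr with a free()-based deleter, using a hypothetical stand-in type; the actual deleter used by the cudacompat unique_ptr alias is defined elsewhere in the patch.

// Hypothetical sketch of a malloc-backed workspace behind a unique_ptr.
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <memory>

// free()-based deleter, so malloc'd storage can live behind a unique_ptr
struct FreeDeleter {
  void operator()(void* p) const { std::free(p); }
};
template <typename T>
using host_unique_ptr = std::unique_ptr<T, FreeDeleter>;

// trivial stand-in for GPUCACell::OuterHitOfCell, only so the sketch compiles
struct OuterHitOfCell {
  unsigned int data[8];
};

int main() {
  unsigned int nhits = 1000;
  // allocate at least one element, mirroring std::max(1U, nhits) in buildDoublets
  host_unique_ptr<OuterHitOfCell[]> isOuterHitOfCell(
      static_cast<OuterHitOfCell*>(std::malloc(std::max(1u, nhits) * sizeof(OuterHitOfCell))));
  if (!isOuterHitOfCell)
    return 1;
  isOuterHitOfCell[0].data[0] = 42;  // the type is trivial, so raw malloc'd storage is usable as-is
  std::printf("allocated %u elements, first word %u\n", std::max(1u, nhits), isOuterHitOfCell[0].data[0]);
  return 0;
}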
+ // overkill to use template here (std::make_unique would suffice) + // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); + device_isOuterHitOfCell_.reset( + (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); + assert(device_isOuterHitOfCell_.get()); + + cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + + gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); + + // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); + device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (m_params.minHitsPerNtuplet_ > 3) { + nActualPairs = 13; + } + + assert(nActualPairs <= gpuPixelDoublets::nPairs); + gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZ0Cut_, + m_params.doPtCut_, + m_params.maxNumberOfDoublets_); +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + assert(tuples_d && quality_d); + + // zero tuples + cms::cuda::launchZero(tuples_d, cudaStream); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + kernel_connect(device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + + if (nhits > 1 && m_params.earlyFishbone_) { + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + } + + kernel_find_ntuplets(hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_.get(), + tuples_d, + device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); + if (m_params.doStats_) + kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); + + cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + + // remove duplicates (tracks that share a doublet) + kernel_earlyDuplicateRemover(device_theCells_.get(), 
device_nCells_, tuples_d, quality_d); + + kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); + kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + + if (nhits > 1 && m_params.lateFishbone_) { + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + } + + if (m_params.doStats_) { + kernel_checkOverflows(tuples_d, + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + device_isOuterHitOfCell_.get(), + nhits, + m_params.maxNumberOfDoublets_, + counters_); + } +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + // classify tracks based on kinematics + kernel_classifyTracks(tuples_d, tracks_d, m_params.cuts_, quality_d); + + if (m_params.lateFishbone_) { + // apply fishbone cleaning to good tracks + kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); + } + + // remove duplicates (tracks that share a doublet) + kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + + // fill hit->track "map" + kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); + kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + + // remove duplicates (tracks that share a hit) + kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + + if (m_params.doStats_) { + // counters (add flag???) + kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); + kernel_doStatsForTracks(tuples_d, quality_d, counters_); + } + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); +#endif +} diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu new file mode 100644 index 000000000..08474c1af --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu @@ -0,0 +1,306 @@ +#include "CAHitNtupletGeneratorKernelsImpl.h" + +template <> +void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto blockSize = 128; + auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + + kernel_fillHitDetIndices<<>>( + &tracks_d->hitIndices, hv, &tracks_d->detIndices); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template <> +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! 
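The tuple-multiplicity and hit-to-track containers above are filled with a three-phase count / finalize / fill sequence: count entries per key, turn the counts into offsets with a prefix sum, then scatter the indices. A conceptual CPU sketch of that sequence using std::vector follows; it is not the cms::cuda::OneToManyAssoc implementation, only an illustration of the idea.

// Conceptual count / finalize / fill sketch for a hit -> track association.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // toy "tracks", each referencing a few hit ids (the keys of the association)
  std::vector<std::vector<int>> tracks = {{0, 1, 2}, {1, 2, 3}, {2, 3, 4, 5}};
  const int nHits = 6;

  // 1) count: how many tracks touch each hit
  std::vector<int> offsets(nHits + 1, 0);
  for (const auto& t : tracks)
    for (int hit : t)
      ++offsets[hit + 1];

  // 2) finalize: prefix sum turns the shifted counts into begin offsets
  std::partial_sum(offsets.begin(), offsets.end(), offsets.begin());

  // 3) fill: scatter the track indices into the flat content buffer
  std::vector<int> content(offsets.back());
  std::vector<int> cursor(offsets.begin(), offsets.end() - 1);
  for (int it = 0; it < (int)tracks.size(); ++it)
    for (int hit : tracks[it])
      content[cursor[hit]++] = it;

  for (int hit = 0; hit < nHits; ++hit) {
    std::printf("hit %d -> tracks:", hit);
    for (int i = offsets[hit]; i < offsets[hit + 1]; ++i)
      std::printf(" %d", content[i]);
    std::printf("\n");
  }
  return 0;
}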
+ auto *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + // zero tuples + cms::cuda::launchZero(tuples_d, cudaStream); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + auto nthTot = 64; + auto stride = 4; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + auto rescale = numberOfBlocks / 65536; + blockSize *= (rescale + 1); + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + assert(numberOfBlocks < 65536); + assert(blockSize > 0 && 0 == blockSize % 16); + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + + kernel_connect<<>>( + device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + cudaCheck(cudaGetLastError()); + + if (nhits > 1 && m_params.earlyFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + gpuPixelDoublets::fishbone<<>>( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + cudaCheck(cudaGetLastError()); + } + + blockSize = 64; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + kernel_find_ntuplets<<>>(hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_.get(), + tuples_d, + device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); + cudaCheck(cudaGetLastError()); + + if (m_params.doStats_) + kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + blockSize = 128; + numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; + cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + kernel_earlyDuplicateRemover<<>>( + device_theCells_.get(), device_nCells_, tuples_d, quality_d); + cudaCheck(cudaGetLastError()); + + blockSize = 128; + numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; + kernel_countMultiplicity<<>>( + tuples_d, quality_d, device_tupleMultiplicity_.get()); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); + kernel_fillMultiplicity<<>>( + tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudaCheck(cudaGetLastError()); + + if (nhits > 1 && m_params.lateFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + gpuPixelDoublets::fishbone<<>>( + hh.view(), device_theCells_.get(), device_nCells_, 
device_isOuterHitOfCell_.get(), nhits, true); + cudaCheck(cudaGetLastError()); + } + + if (m_params.doStats_) { + numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + kernel_checkOverflows<<>>(tuples_d, + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + device_isOuterHitOfCell_.get(), + nhits, + m_params.maxNumberOfDoublets_, + counters_); + cudaCheck(cudaGetLastError()); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // free space asap + // device_isOuterHitOfCell_.reset(); +} + +template <> +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); + assert(device_isOuterHitOfCell_.get()); + + cellStorage_ = cms::cuda::make_device_unique( + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks), + stream); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + + { + int threadsPerBlock = 128; + // at least one block! + int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; + gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); + cudaCheck(cudaGetLastError()); + } + + device_theCells_ = cms::cuda::make_device_unique(m_params.maxNumberOfDoublets_, stream); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (m_params.minHitsPerNtuplet_ > 3) { + nActualPairs = 13; + } + + assert(nActualPairs <= gpuPixelDoublets::nPairs); + int stride = 4; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; + int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; + dim3 blks(1, blocks, 1); + dim3 thrs(stride, threadsPerBlock, 1); + gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZ0Cut_, + m_params.doPtCut_, + m_params.maxNumberOfDoublets_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template <> +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! 
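The GPU launchKernels above places the doublet kernels on the grid's y dimension, which CUDA caps at 65535 blocks, so the block size is inflated whenever the naive block count would overflow that limit. A host-only sketch of that sizing logic follows, with illustrative numbers; it is not the patch's code.

// Launch-geometry sizing sketch: keep the y-dimension block count below 65536.
#include <cassert>
#include <cstdio>

struct LaunchGeom {
  unsigned int blocks;
  unsigned int threadsPerBlock;
};

LaunchGeom sizeForWork(unsigned int workItems, unsigned int blockSize) {
  unsigned int numberOfBlocks = (workItems + blockSize - 1) / blockSize;
  unsigned int rescale = numberOfBlocks / 65536;  // how far past the y-dimension limit we are
  blockSize *= (rescale + 1);                     // grow the block so the count fits
  numberOfBlocks = (workItems + blockSize - 1) / blockSize;
  assert(numberOfBlocks < 65536);
  return {numberOfBlocks, blockSize};
}

int main() {
  // 3/4 of the default doublet capacity, as used for kernel_connect's launch
  unsigned int work = 3 * (512 * 1024) / 4;
  LaunchGeom g = sizeForWork(work, 64 / 4);  // nthTot = 64, stride = 4, as in the patch
  std::printf("work %u -> %u blocks of %u threads\n", work, g.blocks, g.threadsPerBlock);
  return 0;
}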
+ auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + auto blockSize = 64; + + // classify tracks based on kinematics + auto numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + kernel_classifyTracks<<>>(tuples_d, tracks_d, m_params.cuts_, quality_d); + cudaCheck(cudaGetLastError()); + + if (m_params.lateFishbone_) { + // apply fishbone cleaning to good tracks + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + kernel_fishboneCleaner<<>>( + device_theCells_.get(), device_nCells_, quality_d); + cudaCheck(cudaGetLastError()); + } + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + kernel_fastDuplicateRemover<<>>( + device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + cudaCheck(cudaGetLastError()); + + if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { + // fill hit->track "map" + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + kernel_countHitInTracks<<>>( + tuples_d, quality_d, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); + cudaCheck(cudaGetLastError()); + kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + } + if (m_params.minHitsPerNtuplet_ < 4) { + // remove duplicates (tracks that share a hit) + numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + kernel_tripletCleaner<<>>( + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + } + + if (m_params.doStats_) { + // counters (add flag???) 
+ numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); + cudaCheck(cudaGetLastError()); + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); + cudaCheck(cudaGetLastError()); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); +#endif +} + +template <> +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { + kernel_printCounters<<<1, 1>>>(counters); +} diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h new file mode 100644 index 000000000..3c3e3d447 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h @@ -0,0 +1,207 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h + +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "GPUCACell.h" + +// #define DUMP_GPU_TK_TUPLES + +namespace cAHitNtupletGenerator { + + // counters + struct Counters { + unsigned long long nEvents; + unsigned long long nHits; + unsigned long long nCells; + unsigned long long nTuples; + unsigned long long nFitTracks; + unsigned long long nGoodTracks; + unsigned long long nUsedHits; + unsigned long long nDupHits; + unsigned long long nKilledCells; + unsigned long long nEmptyCells; + unsigned long long nZeroTrackCells; + }; + + using HitsView = TrackingRecHit2DSOAView; + using HitsOnGPU = TrackingRecHit2DSOAView; + + using HitToTuple = CAConstants::HitToTuple; + using TupleMultiplicity = CAConstants::TupleMultiplicity; + + using Quality = pixelTrack::Quality; + using TkSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + + struct QualityCuts { + // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) + float chi2Coeff[4]; + float chi2MaxPt; // GeV + float chi2Scale; + + struct region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + region triplet; + region quadruplet; + }; + + // params + struct Params { + Params(bool onGPU, + uint32_t minHitsPerNtuplet, + uint32_t maxNumberOfDoublets, + bool useRiemannFit, + bool fit5as4, + bool includeJumpingForwardDoublets, + bool earlyFishbone, + bool lateFishbone, + bool idealConditions, + bool doStats, + bool doClusterCut, + bool doZ0Cut, + bool doPtCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float hardCurvCut, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet, + QualityCuts const& cuts) + : onGPU_(onGPU), + minHitsPerNtuplet_(minHitsPerNtuplet), + maxNumberOfDoublets_(maxNumberOfDoublets), + useRiemannFit_(useRiemannFit), + fit5as4_(fit5as4), + includeJumpingForwardDoublets_(includeJumpingForwardDoublets), + earlyFishbone_(earlyFishbone), + lateFishbone_(lateFishbone), + idealConditions_(idealConditions), + doStats_(doStats), + doClusterCut_(doClusterCut), + doZ0Cut_(doZ0Cut), + doPtCut_(doPtCut), + ptmin_(ptmin), + CAThetaCutBarrel_(CAThetaCutBarrel), + CAThetaCutForward_(CAThetaCutForward), + hardCurvCut_(hardCurvCut), + 
dcaCutInnerTriplet_(dcaCutInnerTriplet), + dcaCutOuterTriplet_(dcaCutOuterTriplet), + cuts_(cuts) {} + + const bool onGPU_; + const uint32_t minHitsPerNtuplet_; + const uint32_t maxNumberOfDoublets_; + const bool useRiemannFit_; + const bool fit5as4_; + const bool includeJumpingForwardDoublets_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool idealConditions_; + const bool doStats_; + const bool doClusterCut_; + const bool doZ0Cut_; + const bool doPtCut_; + const float ptmin_; + const float CAThetaCutBarrel_; + const float CAThetaCutForward_; + const float hardCurvCut_; + const float dcaCutInnerTriplet_; + const float dcaCutOuterTriplet_; + + // quality cuts + QualityCuts cuts_{// polynomial coefficients for the pT-dependent chi2 cut + {0.68177776, 0.74609577, -0.08035491, 0.00315399}, + // max pT used to determine the chi2 cut + 10., + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + 30., + // regional cuts for triplets + { + 0.3, // |Tip| < 0.3 cm + 0.5, // pT > 0.5 GeV + 12.0 // |Zip| < 12.0 cm + }, + // regional cuts for quadruplets + { + 0.5, // |Tip| < 0.5 cm + 0.3, // pT > 0.3 GeV + 12.0 // |Zip| < 12.0 cm + }}; + + }; // Params + +} // namespace cAHitNtupletGenerator + +template +class CAHitNtupletGeneratorKernels { +public: + using Traits = TTraits; + + using QualityCuts = cAHitNtupletGenerator::QualityCuts; + using Params = cAHitNtupletGenerator::Params; + using Counters = cAHitNtupletGenerator::Counters; + + template + using unique_ptr = typename Traits::template unique_ptr; + + using HitsView = TrackingRecHit2DSOAView; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DHeterogeneous; + + using HitToTuple = CAConstants::HitToTuple; + using TupleMultiplicity = CAConstants::TupleMultiplicity; + + using Quality = pixelTrack::Quality; + using TkSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + + CAHitNtupletGeneratorKernels(Params const& params) : m_params(params) {} + ~CAHitNtupletGeneratorKernels() = default; + + TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } + + void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + + void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + + void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); + + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void allocateOnGPU(cudaStream_t stream); + void cleanup(cudaStream_t cudaStream); + + static void printCounters(Counters const* counters); + Counters* counters_ = nullptr; + +private: + // workspace + unique_ptr cellStorage_; + unique_ptr device_theCellNeighbors_; + CAConstants::CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + CAConstants::CellTracks* device_theCellTracksContainer_; + + unique_ptr device_theCells_; + unique_ptr device_isOuterHitOfCell_; + uint32_t* device_nCells_ = nullptr; + + unique_ptr device_hitToTuple_; + cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; + + cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr; + + unique_ptr device_tupleMultiplicity_; + + unique_ptr device_storage_; + // params + Params const& m_params; +}; + +using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git 
a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.cc b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.cc new file mode 100644 index 000000000..963816733 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.cc @@ -0,0 +1 @@ +#include "CAHitNtupletGeneratorKernelsAlloc.h" diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.cu b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.cu new file mode 100644 index 000000000..963816733 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.cu @@ -0,0 +1 @@ +#include "CAHitNtupletGeneratorKernelsAlloc.h" diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h new file mode 100644 index 000000000..fb505b126 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h @@ -0,0 +1,39 @@ +#include "CAHitNtupletGeneratorKernels.h" + +#include "CUDACore/cudaCheck.h" + +template <> +#ifdef __CUDACC__ +void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) { +#else +void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { +#endif + ////////////////////////////////////////////////////////// + // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) + ////////////////////////////////////////////////////////// + + device_theCellNeighbors_ = Traits::template make_unique(stream); + device_theCellTracks_ = Traits::template make_unique(stream); + + device_hitToTuple_ = Traits::template make_unique(stream); + + device_tupleMultiplicity_ = Traits::template make_unique(stream); + + device_storage_ = Traits::template make_unique(3, stream); + + device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); + device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; + device_nCells_ = (uint32_t*)(device_storage_.get() + 2); + + if +#ifndef __CUDACC__ + constexpr +#endif + (std::is_same::value) { + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); + } else { + *device_nCells_ = 0; + } + cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... 
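The pT-dependent chi2 cut documented in QualityCuts above is a cubic polynomial in pT, clamped at chi2MaxPt and scaled by chi2Scale. The sketch below evaluates it with the broken-line default coefficients from the Params initializer; how the resulting cut is applied to individual tracks is only sketched here, not taken from the patch.

// chi2 cut = chi2Scale * (c0 + pT*(c1 + pT*(c2 + pT*c3))), pT clamped at chi2MaxPt.
#include <algorithm>
#include <cstdio>

int main() {
  // broken-line defaults from the Params initializer above
  const float chi2Coeff[4] = {0.68177776f, 0.74609577f, -0.08035491f, 0.00315399f};
  const float chi2MaxPt = 10.f;  // GeV
  const float chi2Scale = 30.f;  // 30 for broken line fit, 45 for Riemann fit

  const float pts[] = {0.6f, 1.5f, 4.f, 20.f};
  for (float pt : pts) {
    float p = std::min(pt, chi2MaxPt);  // pT above chi2MaxPt uses the value at chi2MaxPt
    // Horner evaluation of the cubic in pT
    float cut = chi2Scale * (chi2Coeff[0] + p * (chi2Coeff[1] + p * (chi2Coeff[2] + p * chi2Coeff[3])));
    std::printf("pT = %5.2f GeV  ->  chi2 cut = %6.2f\n", pt, cut);
  }
  return 0;
}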
+} diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h new file mode 100644 index 000000000..e35e20be9 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h @@ -0,0 +1,605 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +// #define NTUPLE_DEBUG + +#include +#include + +#include + +#include "CUDACore/cudaCheck.h" +#include "CUDACore/cuda_assert.h" +#include "CondFormats/pixelCPEforGPU.h" + +#include "CAConstants.h" +#include "CAHitNtupletGeneratorKernels.h" +#include "GPUCACell.h" +#include "gpuFishbone.h" +#include "gpuPixelDoublets.h" + +using HitsOnGPU = TrackingRecHit2DSOAView; +using HitsOnCPU = TrackingRecHit2DCUDA; + +using HitToTuple = CAConstants::HitToTuple; +using TupleMultiplicity = CAConstants::TupleMultiplicity; + +using Quality = pixelTrack::Quality; +using TkSoA = pixelTrack::TrackSoA; +using HitContainer = pixelTrack::HitContainer; + +__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, + CAConstants::TupleMultiplicity *tupleMultiplicity, + cms::cuda::AtomicPairCounter *apc, + GPUCACell const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, + gpuPixelDoublets::CellTracksVector const *cellTracks, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + uint32_t nHits, + uint32_t maxNumberOfDoublets, + CAHitNtupletGeneratorKernelsGPU::Counters *counters) { + auto first = threadIdx.x + blockIdx.x * blockDim.x; + + auto &c = *counters; + // counters once per event + if (0 == first) { + atomicAdd(&c.nEvents, 1); + atomicAdd(&c.nHits, nHits); + atomicAdd(&c.nCells, *nCells); + atomicAdd(&c.nTuples, apc->get().m); + atomicAdd(&c.nFitTracks, tupleMultiplicity->size()); + } + +#ifdef NTUPLE_DEBUG + if (0 == first) { + printf("number of found cells %d, found tuples %d with total hits %d out of %d\n", + *nCells, + apc->get().m, + apc->get().n, + nHits); + if (apc->get().m < CAConstants::maxNumberOfQuadruplets()) { + assert(foundNtuplets->size(apc->get().m) == 0); + assert(foundNtuplets->size() == apc->get().n); + } + } + + for (int idx = first, nt = foundNtuplets->nbins(); idx < nt; idx += gridDim.x * blockDim.x) { + if (foundNtuplets->size(idx) > 5) + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + assert(foundNtuplets->size(idx) < 6); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + assert(*ih < nHits); + } +#endif + + if (0 == first) { + if (apc->get().m >= CAConstants::maxNumberOfQuadruplets()) + printf("Tuples overflow\n"); + if (*nCells >= maxNumberOfDoublets) + printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow\n"); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); + } + + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.theDoubletId < 0) + atomicAdd(&c.nKilledCells, 1); + if (0 == thisCell.theUsed) + atomicAdd(&c.nEmptyCells, 1); + if (thisCell.tracks().empty()) + atomicAdd(&c.nZeroTrackCells, 1); + } + + for (int 
idx = first, nt = nHits; idx < nt; idx += gridDim.x * blockDim.x) { + if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; + printf("OuterHitOfCell overflow %d\n", idx); + } +} + +__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) { + constexpr auto bad = trackQuality::bad; + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (thisCell.theDoubletId >= 0) + continue; + + for (auto it : thisCell.tracks()) + quality[it] = bad; + } +} + +__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, + uint32_t const *__restrict__ nCells, + HitContainer *foundNtuplets, + Quality *quality) { + // constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + // constexpr auto loose = trackQuality::loose; + + assert(nCells); + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + + if (thisCell.tracks().size() < 2) + continue; + //if (0==thisCell.theUsed) continue; + // if (thisCell.theDoubletId < 0) continue; + + uint32_t maxNh = 0; + + // find maxNh + for (auto it : thisCell.tracks()) { + auto nh = foundNtuplets->size(it); + maxNh = std::max(nh, maxNh); + } + + for (auto it : thisCell.tracks()) { + if (foundNtuplets->size(it) != maxNh) + quality[it] = dup; //no race: simple assignment of the same constant + } + } +} + +__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + HitContainer const *__restrict__ foundNtuplets, + TkSoA *__restrict__ tracks) { + constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + constexpr auto loose = trackQuality::loose; + + assert(nCells); + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (thisCell.tracks().size() < 2) + continue; + // if (thisCell.theDoubletId < 0) continue; + + float mc = 10000.f; + uint16_t im = 60000; + + auto score = [&](auto it) { + return std::abs(tracks->tip(it)); // tip + // return tracks->chi2(it); //chi2 + }; + + // find min socre + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == loose && score(it) < mc) { + mc = score(it); + im = it; + } + } + // mark all other duplicates + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) != bad && it != im) + tracks->quality(it) = dup; //no race: simple assignment of the same constant + } + } +} + +__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, + cms::cuda::AtomicPairCounter *apc2, // just to zero them, + GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *cells, + uint32_t const *__restrict__ nCells, + gpuPixelDoublets::CellNeighborsVector *cellNeighbors, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + float hardCurvCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet) { + auto const &hh = *hhp; + + auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + + if (0 == (firstCellIndex + first)) { + (*apc1) = 0; + (*apc2) = 0; + } // ready for next kernel + + for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y 
* blockDim.y) { + auto cellIndex = idx; + auto &thisCell = cells[idx]; + //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1) + // continue; + auto innerHitId = thisCell.get_inner_hit_id(); + int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); + + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; + auto ri = thisCell.get_inner_r(hh); + auto zi = thisCell.get_inner_z(hh); + + auto ro = thisCell.get_outer_r(hh); + auto zo = thisCell.get_outer_z(hh); + auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; + + for (int j = first; j < numberOfPossibleNeighbors; j += stride) { + auto otherCell = __ldg(vi + j); + auto &oc = cells[otherCell]; + // if (cells[otherCell].theDoubletId < 0 || + // cells[otherCell].theUsed>1 ) + // continue; + auto r1 = oc.get_inner_r(hh); + auto z1 = oc.get_inner_z(hh); + // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; + bool aligned = GPUCACell::areAlignedRZ( + r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && + thisCell.dcaCut(hh, + oc, + oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, + hardCurvCut)) { // FIXME tune cuts + oc.addOuterNeighbor(cellIndex, *cellNeighbors); + thisCell.theUsed |= 1; + oc.theUsed |= 1; + } + } // loop on inner cells + } // loop on outer cells +} + +__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells, + gpuPixelDoublets::CellTracksVector *cellTracks, + HitContainer *foundNtuplets, + cms::cuda::AtomicPairCounter *apc, + Quality *__restrict__ quality, + unsigned int minHitsPerNtuplet) { + // recursive: not obvious to widen + auto const &hh = *hhp; + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (thisCell.theDoubletId < 0) + continue; // cut by earlyFishbone + + auto pid = thisCell.theLayerPairId; + auto doit = minHitsPerNtuplet > 3 ? 
pid < 3 : pid < 8 || pid > 12; + if (doit) { + GPUCACell::TmpTuple stack; + stack.reset(); + thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); + assert(stack.empty()); + // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); + } + } +} + +__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells) { + // auto const &hh = *hhp; + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto &thisCell = cells[idx]; + if (!thisCell.tracks().empty()) + thisCell.theUsed |= 2; + } +} + +__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + CAConstants::TupleMultiplicity *tupleMultiplicity) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + continue; + if (quality[it] == trackQuality::dup) + continue; + assert(quality[it] == trackQuality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + assert(nhits < 8); + tupleMultiplicity->countDirect(nhits); + } +} + +__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + CAConstants::TupleMultiplicity *tupleMultiplicity) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + continue; + if (quality[it] == trackQuality::dup) + continue; + assert(quality[it] == trackQuality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + assert(nhits < 8); + tupleMultiplicity->fillDirect(nhits, it); + } +} + +__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, + TkSoA const *__restrict__ tracks, + CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, + Quality *__restrict__ quality) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int it = first, nt = tuples->nbins(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tuples->size(it); + if (nhits == 0) + break; // guard + + // if duplicate: not even fit + if (quality[it] == trackQuality::dup) + continue; + + assert(quality[it] == trackQuality::bad); + + // mark doublets as bad + if (nhits < 3) + continue; + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + } + if (isNaN) { +#ifdef NTUPLE_DEBUG + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); +#endif + continue; + } + + // compute a pT-dependent chi2 cut + // default parameters: + // - chi2MaxPt = 10 GeV + // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } + // - chi2Scale = 30 for broken line fit, 45 for Riemann fit + // (see CAHitNtupletGeneratorGPU.cc) + float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); + float chi2Cut = cuts.chi2Scale * + (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); + // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) + if (3.f * tracks->chi2(it) >= chi2Cut) { +#ifdef NTUPLE_DEBUG + printf("Bad 
fit %d size %d pt %f eta %f chi2 %f\n", + it, + tuples->size(it), + tracks->pt(it), + tracks->eta(it), + 3.f * tracks->chi2(it)); +#endif + continue; + } + + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nhits > 3) ? cuts.quadruplet : cuts.triplet; + bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and + (std::abs(tracks->zip(it)) < region.maxZip); + + if (isOk) + quality[it] = trackQuality::loose; + } +} + +__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::Counters *counters) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; //guard + if (quality[idx] != trackQuality::loose) + continue; + atomicAdd(&(counters->nGoodTracks), 1); + } +} + +__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; // guard + if (quality[idx] != trackQuality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->countDirect(*h); + } +} + +__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; // guard + if (quality[idx] != trackQuality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fillDirect(*h, idx); + } +} + +__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, + TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer *__restrict__ hitDetIndices) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + // copy offsets + for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + hitDetIndices->off[idx] = tuples->off[idx]; + } + // fill hit indices + auto const &hh = *hhp; + auto nhits = hh.nHits(); + for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tuples->bins[idx] < nhits); + hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); + } +} + +__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple, + CAHitNtupletGeneratorKernelsGPU::Counters *counters) { + auto &c = *counters; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple->size(idx) == 0) + continue; // SHALL NOT BE break + atomicAdd(&c.nUsedHits, 1); + if (hitToTuple->size(idx) > 1) + atomicAdd(&c.nDupHits, 1); + } +} + +__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, + 
HitContainer const *__restrict__ ptuples, + TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { + constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + // constexpr auto loose = trackQuality::loose; + + auto &hitToTuple = *phitToTuple; + auto const &foundNtuplets = *ptuples; + auto const &tracks = *ptracks; + + // auto const & hh = *hhp; + // auto l1end = hh.hitsLayerStart_d[1]; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; + + float mc = 10000.f; + uint16_t im = 60000; + uint32_t maxNh = 0; + + // find maxNh + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + maxNh = std::max(nh, maxNh); + } + // kill all tracks shorter than maxHn (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + if (maxNh != nh) + quality[*it] = dup; + } + + if (maxNh > 3) + continue; + // if (idx>=l1end) continue; // only for layer 1 + // for triplets choose best tip! + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } + } + // mark duplicates + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && it != im) + quality[it] = dup; //no race: simple assignment of the same constant + } + } // loop over hits +} + +__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const *__restrict__ ptracks, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, + uint32_t maxPrint, + int iev) { + auto const &foundNtuplets = *ptuples; + auto const &tracks = *ptracks; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, np = std::min(maxPrint, foundNtuplets.nbins()); i < np; i += blockDim.x * gridDim.x) { + auto nh = foundNtuplets.size(i); + if (nh < 3) + continue; + printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n", + 10000 * iev + i, + int(quality[i]), + nh, + tracks.charge(i), + tracks.pt(i), + tracks.eta(i), + tracks.phi(i), + tracks.tip(i), + tracks.zip(i), + // asinhf(fit_results[i].par(3)), + tracks.chi2(i), + *foundNtuplets.begin(i), + *(foundNtuplets.begin(i) + 1), + *(foundNtuplets.begin(i) + 2), + nh > 3 ? int(*(foundNtuplets.begin(i) + 3)) : -1, + nh > 4 ? 
int(*(foundNtuplets.begin(i) + 4)) : -1); + } +} + +__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) { + auto const &c = *counters; + printf( + "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nGoodTracks | nUsedHits | nDupHits | " + "nKilledCells | " + "nEmptyCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", + c.nEvents, + c.nHits, + c.nCells, + c.nTuples, + c.nGoodTracks, + c.nFitTracks, + c.nUsedHits, + c.nDupHits, + c.nKilledCells, + c.nEmptyCells, + c.nZeroTrackCells); + printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n", + c.nEvents, + c.nHits / double(c.nEvents), + c.nCells / double(c.nEvents), + c.nTuples / double(c.nEvents), + c.nFitTracks / double(c.nEvents), + c.nGoodTracks / double(c.nEvents), + c.nUsedHits / double(c.nEvents), + c.nDupHits / double(c.nEvents), + c.nKilledCells / double(c.nEvents), + c.nEmptyCells / double(c.nCells), + c.nZeroTrackCells / double(c.nCells)); +} diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc new file mode 100644 index 000000000..d0e428da6 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc @@ -0,0 +1,167 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +#include +#include +#include +#include + +#include "Framework/Event.h" + +#include "CAHitNtupletGeneratorOnGPU.h" + +namespace { + + template + T sqr(T x) { + return x * x; + } + + cAHitNtupletGenerator::QualityCuts makeQualityCuts() { + auto coeff = std::vector{0.68177776, 0.74609577, -0.08035491, 0.00315399}; // chi2Coeff + return cAHitNtupletGenerator::QualityCuts{// polynomial coefficients for the pT-dependent chi2 cut + {(float)coeff[0], (float)coeff[1], (float)coeff[2], (float)coeff[3]}, + // max pT used to determine the chi2 cut + 10.f, // chi2MaxPt + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + 30.f, // chi2Scale + // regional cuts for triplets + { + 0.3f, //tripletMaxTip + 0.5f, // tripletMinPt + 12.f // tripletMaxZip + }, + // regional cuts for quadruplets + { + 0.5f, // quadrupletMaxTip + 0.3f, // quadrupletMinPt + 12.f // quadrupletMaxZip + }}; + } +} // namespace + +using namespace std; +CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(edm::ProductRegistry& reg) + : m_params(true, // onGPU + 3, // minHitsPerNtuplet, + 458752, // maxNumberOfDoublets + false, //useRiemannFit + true, // fit5as4, + true, //includeJumpingForwardDoublets + true, // earlyFishbone + false, // lateFishbone + true, // idealConditions + false, //fillStatistics + true, // doClusterCut + true, // doZ0Cut + true, // doPtCut + 0.899999976158, // ptmin + 0.00200000009499, // CAThetaCutBarrel + 0.00300000002608, // CAThetaCutForward + 0.0328407224959, // hardCurvCut + 0.15000000596, // dcaCutInnerTriplet + 0.25, // dcaCutOuterTriplet + makeQualityCuts()) { +#ifdef DUMP_GPU_TK_TUPLES + printf("TK: %s %s % %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "tid", + "qual", + "nh", + "charge", + "pt", + "eta", + "phi", + "tip", + "zip", + "chi2", + "h1", + "h2", + "h3", + "h4", + "h5"); +#endif + + if (m_params.onGPU_) { + cudaCheck(cudaMalloc(&m_counters, sizeof(Counters))); + cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters))); + } else { + m_counters = new Counters(); + memset(m_counters, 0, sizeof(Counters)); + } +} + +CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU() { + if 
(m_params.onGPU_) { + if (m_params.doStats_) { + // crash on multi-gpu processes + CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); + } + cudaFree(m_counters); + } else { + if (m_params.doStats_) { + CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); + } + delete m_counters; + } +} + +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, + float bfield, + cudaStream_t stream) const { + PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); + + auto* soa = tracks.get(); + + CAHitNtupletGeneratorKernelsGPU kernels(m_params); + kernels.counters_ = m_counters; + + kernels.allocateOnGPU(stream); + + kernels.buildDoublets(hits_d, stream); + kernels.launchKernels(hits_d, soa, stream); + kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } else { + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } + kernels.classifyTuples(hits_d, soa, stream); + + return tracks; +} + +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { + PixelTrackHeterogeneous tracks(std::make_unique()); + + auto* soa = tracks.get(); + assert(soa); + + CAHitNtupletGeneratorKernelsCPU kernels(m_params); + kernels.counters_ = m_counters; + kernels.allocateOnGPU(nullptr); + + kernels.buildDoublets(hits_d, nullptr); + kernels.launchKernels(hits_d, soa, nullptr); + kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" + + if (0 == hits_d.nHits()) + return tracks; + + // now fit + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + } else { + fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + } + + kernels.classifyTuples(hits_d, soa, nullptr); + + return tracks; +} diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h new file mode 100644 index 000000000..823987658 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h @@ -0,0 +1,56 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h + +#include + +#include "CUDACore/SimpleVector.h" +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" + +#include "CAHitNtupletGeneratorKernels.h" +#include "GPUCACell.h" +#include "HelixFitOnGPU.h" + +namespace edm { + class Event; + class EventSetup; + class ProductRegistry; +} // namespace edm + +class CAHitNtupletGeneratorOnGPU { +public: + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + using hindex_type = TrackingRecHit2DSOAView::hindex_type; + + using Quality = pixelTrack::Quality; + using OutputSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; 
+ using Tuple = HitContainer; + + using QualityCuts = cAHitNtupletGenerator::QualityCuts; + using Params = cAHitNtupletGenerator::Params; + using Counters = cAHitNtupletGenerator::Counters; + +public: + CAHitNtupletGeneratorOnGPU(edm::ProductRegistry& reg); + + ~CAHitNtupletGeneratorOnGPU(); + + PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + + PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + +private: + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; + + void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); + + void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; + + Params m_params; + + Counters* m_counters = nullptr; +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h diff --git a/src/cudacompat/plugin-PixelTriplets/CircleEq.h b/src/cudacompat/plugin-PixelTriplets/CircleEq.h new file mode 100644 index 000000000..dfe7da010 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/CircleEq.h @@ -0,0 +1,107 @@ +#ifndef RecoPixelVertexingPixelTripletsCircleEq_H +#define RecoPixelVertexingPixelTripletsCircleEq_H +/** +| 1) circle is parameterized as: | +| C*[(X-Xp)**2+(Y-Yp)**2] - 2*alpha*(X-Xp) - 2*beta*(Y-Yp) = 0 | +| Xp,Yp is a point on the track; | +| C = 1/r0 is the curvature ( sign of C is charge of particle ); | +| alpha & beta are the direction cosines of the radial vector at Xp,Yp | +| i.e. alpha = C*(X0-Xp), | +| beta = C*(Y0-Yp), | +| where center of circle is at X0,Y0. | +| | +| Slope dy/dx of tangent at Xp,Yp is -alpha/beta. | +| 2) the z dimension of the helix is parameterized by gamma = dZ/dSperp | +| this is also the tangent of the pitch angle of the helix. | +| with this parameterization, (alpha,beta,gamma) rotate like a vector. | +| 3) For tracks going inward at (Xp,Yp), C, alpha, beta, and gamma change sign| +| +*/ + +#include + +template +class CircleEq { +public: + CircleEq() {} + + constexpr CircleEq(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); } + + constexpr void compute(T x1, T y1, T x2, T y2, T x3, T y3); + + // dca to origin divided by curvature + constexpr T dca0() const { + auto x = m_c * m_xp + m_alpha; + auto y = m_c * m_yp + m_beta; + return std::sqrt(x * x + y * y) - T(1); + } + + // dca to given point (divided by curvature) + constexpr T dca(T x, T y) const { + x = m_c * (m_xp - x) + m_alpha; + y = m_c * (m_yp - y) + m_beta; + return std::sqrt(x * x + y * y) - T(1); + } + + // curvature + constexpr auto curvature() const { return m_c; } + + // alpha and beta + constexpr std::pair cosdir() const { return std::make_pair(m_alpha, m_beta); } + + // alpha and beta af given point + constexpr std::pair cosdir(T x, T y) const { + return std::make_pair(m_alpha - m_c * (x - m_xp), m_beta - m_c * (y - m_yp)); + } + + // center + constexpr std::pair center() const { return std::make_pair(m_xp + m_alpha / m_c, m_yp + m_beta / m_c); } + + constexpr auto radius() const { return T(1) / m_c; } + + T m_xp = 0; + T m_yp = 0; + T m_c = 0; + T m_alpha = 0; + T m_beta = 0; +}; + +template +constexpr void CircleEq::compute(T x1, T y1, T x2, T y2, T x3, T y3) { + bool noflip = std::abs(x3 - x1) < std::abs(y3 - y1); + + auto x1p = noflip ? x1 - x2 : y1 - y2; + auto y1p = noflip ? y1 - y2 : x1 - x2; + auto d12 = x1p * x1p + y1p * y1p; + auto x3p = noflip ? x3 - x2 : y3 - y2; + auto y3p = noflip ? 
y3 - y2 : x3 - x2; + auto d32 = x3p * x3p + y3p * y3p; + + auto num = x1p * y3p - y1p * x3p; // num also gives correct sign for CT + auto det = d12 * y3p - d32 * y1p; + + /* + auto ct = num/det; + auto sn = det>0 ? T(1.) : T(-1.); + auto st2 = (d12*x3p-d32*x1p)/det; + auto seq = T(1.) +st2*st2; + auto al2 = sn/std::sqrt(seq); + auto be2 = -st2*al2; + ct *= T(2.)*al2; + */ + + auto st2 = (d12 * x3p - d32 * x1p); + auto seq = det * det + st2 * st2; + auto al2 = T(1.) / std::sqrt(seq); + auto be2 = -st2 * al2; + auto ct = T(2.) * num * al2; + al2 *= det; + + m_xp = x2; + m_yp = y2; + m_c = noflip ? ct : -ct; + m_alpha = noflip ? al2 : -be2; + m_beta = noflip ? be2 : -al2; +} + +#endif diff --git a/src/cudacompat/plugin-PixelTriplets/FitResult.h b/src/cudacompat/plugin-PixelTriplets/FitResult.h new file mode 100644 index 000000000..b97dda4e6 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/FitResult.h @@ -0,0 +1,65 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h +#define RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h + +#include +#include + +#include +#include +#include + +namespace Rfit { + + using Vector2d = Eigen::Vector2d; + using Vector3d = Eigen::Vector3d; + using Vector4d = Eigen::Vector4d; + using Vector5d = Eigen::Matrix; + using Matrix2d = Eigen::Matrix2d; + using Matrix3d = Eigen::Matrix3d; + using Matrix4d = Eigen::Matrix4d; + using Matrix5d = Eigen::Matrix; + using Matrix6d = Eigen::Matrix; + + template + using Matrix3xNd = Eigen::Matrix; // used for inputs hits + + struct circle_fit { + Vector3d par; //!< parameter: (X0,Y0,R) + Matrix3d cov; + /*!< covariance matrix: \n + |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n + |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n + |cov(X0, R)|cov(Y0, R)|cov( R, R)| + */ + int32_t q; //!< particle charge + float chi2; + }; + + struct line_fit { + Vector2d par; //!<(cotan(theta),Zip) + Matrix2d cov; + /*!< + |cov(c_t,c_t)|cov(Zip,c_t)| \n + |cov(c_t,Zip)|cov(Zip,Zip)| + */ + double chi2; + }; + + struct helix_fit { + Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) + Matrix5d cov; + /*!< ()->cov() \n + |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n + |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n + |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n + |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n + |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| + */ + float chi2_circle; + float chi2_line; + // Vector4d fast_fit; + int32_t q; //!< particle charge + }; // __attribute__((aligned(16))); + +} // namespace Rfit +#endif diff --git a/src/cudacompat/plugin-PixelTriplets/FitUtils.h b/src/cudacompat/plugin-PixelTriplets/FitUtils.h new file mode 100644 index 000000000..d69e03194 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/FitUtils.h @@ -0,0 +1,246 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h +#define RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h + +#include "CUDACore/cuda_assert.h" + +#include "choleskyInversion.h" +#include "FitResult.h" + +namespace Rfit { + + constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) + + using VectorXd = Eigen::VectorXd; + using MatrixXd = Eigen::MatrixXd; + template + using MatrixNd = Eigen::Matrix; + template + using MatrixNplusONEd = Eigen::Matrix; + template + using ArrayNd = Eigen::Array; + template + using Matrix2Nd = Eigen::Matrix; + template + using Matrix3Nd = Eigen::Matrix; + template + using Matrix2xNd = Eigen::Matrix; + template + using Array2xNd = Eigen::Array; + template + using 
MatrixNx3d = Eigen::Matrix; + template + using MatrixNx5d = Eigen::Matrix; + template + using VectorNd = Eigen::Matrix; + template + using VectorNplusONEd = Eigen::Matrix; + template + using Vector2Nd = Eigen::Matrix; + template + using Vector3Nd = Eigen::Matrix; + template + using RowVectorNd = Eigen::Matrix; + template + using RowVector2Nd = Eigen::Matrix; + + using Matrix2x3d = Eigen::Matrix; + + using Matrix3f = Eigen::Matrix3f; + using Vector3f = Eigen::Vector3f; + using Vector4f = Eigen::Vector4f; + using Vector6f = Eigen::Matrix; + + using u_int = unsigned int; + + template + __host__ __device__ void printIt(C* m, const char* prefix = "") { +#ifdef RFIT_DEBUG + for (u_int r = 0; r < m->rows(); ++r) { + for (u_int c = 0; c < m->cols(); ++c) { + printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r, c)); + } + } +#endif + } + + /*! + \brief raise to square. + */ + template + constexpr T sqr(const T a) { + return a * a; + } + + /*! + \brief Compute cross product of two 2D vector (assuming z component 0), + returning z component of the result. + \param a first 2D vector in the product. + \param b second 2D vector in the product. + \return z component of the cross product. + */ + + __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) { + return a.x() * b.y() - a.y() * b.x(); + } + + /*! + * load error in CMSSW format to our formalism + * + */ + template + __host__ __device__ void loadCovariance2D(M6xNf const& ge, M2Nd& hits_cov) { + // Index numerology: + // i: index of the hits/point (0,..,3) + // j: index of space component (x,y,z) + // l: index of space components (x,y,z) + // ge is always in sync with the index i and is formatted as: + // ge[] ==> [xx, xy, yy, xz, yz, zz] + // in (j,l) notation, we have: + // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] + // so the index ge_idx corresponds to the matrix elements: + // | 0 1 3 | + // | 1 2 4 | + // | 3 4 5 | + constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; + for (uint32_t i = 0; i < hits_in_fit; ++i) { + auto ge_idx = 0; + auto j = 0; + auto l = 0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 2; + j = 1; + l = 1; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 1; + j = 1; + l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } + } + + template + __host__ __device__ void loadCovariance(M6xNf const& ge, M3xNd& hits_cov) { + // Index numerology: + // i: index of the hits/point (0,..,3) + // j: index of space component (x,y,z) + // l: index of space components (x,y,z) + // ge is always in sync with the index i and is formatted as: + // ge[] ==> [xx, xy, yy, xz, yz, zz] + // in (j,l) notation, we have: + // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] + // so the index ge_idx corresponds to the matrix elements: + // | 0 1 3 | + // | 1 2 4 | + // | 3 4 5 | + constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; + for (uint32_t i = 0; i < hits_in_fit; ++i) { + auto ge_idx = 0; + auto j = 0; + auto l = 0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 2; + j = 1; + l = 1; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 5; + j = 2; + l = 2; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 1; + j = 1; + l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + 
ge.col(i)[ge_idx]; + ge_idx = 3; + j = 2; + l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + ge_idx = 4; + j = 2; + l = 1; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } + } + + /*! + \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,p_t) and + consequently covariance matrix. + \param circle_uvr parameter (X0,Y0,R), covariance matrix to + be transformed and particle charge. + \param B magnetic field in Gev/cm/c unit. + \param error flag for errors computation. + */ + __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, const bool error) { + Vector3d par_pak; + const double temp0 = circle.par.head(2).squaredNorm(); + const double temp1 = sqrt(temp0); + par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)), + circle.par(2) * B; + if (error) { + const double temp2 = sqr(circle.par(0)) * 1. / temp0; + const double temp3 = 1. / temp1 * circle.q; + Matrix3d J4; + J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.q, 0., 0., B; + circle.cov = J4 * circle.cov * J4.transpose(); + } + circle.par = par_pak; + } + + /*! + \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and + consequently covariance matrix. + \param circle_uvr parameter (X0,Y0,R), covariance matrix to + be transformed and particle charge. + */ + __host__ __device__ inline void fromCircleToPerigee(circle_fit& circle) { + Vector3d par_pak; + const double temp0 = circle.par.head(2).squaredNorm(); + const double temp1 = sqrt(temp0); + par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)), + circle.q / circle.par(2); + + const double temp2 = sqr(circle.par(0)) * 1. / temp0; + const double temp3 = 1. / temp1 * circle.q; + Matrix3d J4; + J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.q, 0., 0., -circle.q / (circle.par(2) * circle.par(2)); + circle.cov = J4 * circle.cov * J4.transpose(); + + circle.par = par_pak; + } + + // transformation between the "perigee" to cmssw localcoord frame + // the plane of the latter is the perigee plane... + // from //!<(phi,Tip,q/pt,cotan(theta)),Zip) + // to q/p,dx/dz,dy/dz,x,z + template + __host__ __device__ inline void transformToPerigeePlane(VI5 const& ip, MI5 const& icov, VO5& op, MO5& ocov) { + auto sinTheta2 = 1. / (1. 
+ ip(3) * ip(3)); + auto sinTheta = std::sqrt(sinTheta2); + auto cosTheta = ip(3) * sinTheta; + + op(0) = sinTheta * ip(2); + op(1) = 0.; + op(2) = -ip(3); + op(3) = ip(1); + op(4) = -ip(4); + + Matrix5d J = Matrix5d::Zero(); + + J(0, 2) = sinTheta; + J(0, 3) = -sinTheta2 * cosTheta * ip(2); + J(1, 0) = 1.; + J(2, 3) = -1.; + J(3, 1) = 1.; + J(4, 4) = -1; + + ocov = J * icov * J.transpose(); + } + +} // namespace Rfit + +#endif // RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h diff --git a/src/cudacompat/plugin-PixelTriplets/GPUCACell.h b/src/cudacompat/plugin-PixelTriplets/GPUCACell.h new file mode 100644 index 000000000..df4354e59 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/GPUCACell.h @@ -0,0 +1,348 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h +#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h + +// +// Author: Felice Pantaleo, CERN +// + +// #define ONLY_TRIPLETS_IN_HOLE + +#include + +#include "CUDACore/SimpleVector.h" +#include "CUDACore/VecArray.h" +#include "CUDACore/cuda_assert.h" +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" + +#include "CAConstants.h" +#include "CircleEq.h" + +class GPUCACell { +public: + using ptrAsInt = unsigned long long; + + static constexpr int maxCellsPerHit = CAConstants::maxCellsPerHit(); + using OuterHitOfCell = CAConstants::OuterHitOfCell; + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; + + using Hits = TrackingRecHit2DSOAView; + using hindex_type = Hits::hindex_type; + + using TmpTuple = cms::cuda::VecArray; + + using HitContainer = pixelTrack::HitContainer; + using Quality = trackQuality::Quality; + static constexpr auto bad = trackQuality::bad; + + GPUCACell() = default; + + __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, + CellTracksVector& cellTracks, + Hits const& hh, + int layerPairId, + int doubletId, + hindex_type innerHitId, + hindex_type outerHitId) { + theInnerHitId = innerHitId; + theOuterHitId = outerHitId; + theDoubletId = doubletId; + theLayerPairId = layerPairId; + theUsed = 0; + + // optimization that depends on access pattern + theInnerZ = hh.zGlobal(innerHitId); + theInnerR = hh.rGlobal(innerHitId); + + // link to default empty + theOuterNeighbors = &cellNeighbors[0]; + theTracks = &cellTracks[0]; + assert(outerNeighbors().empty()); + assert(tracks().empty()); + } + + __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + // use smart cache + if (outerNeighbors().empty()) { + auto i = cellNeighbors.extend(); // maybe waisted.... + if (i > 0) { + cellNeighbors[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellNeighbors[0]); + atomicCAS((ptrAsInt*)(&theOuterNeighbors), + zero, + (ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... +#else + theOuterNeighbors = &cellNeighbors[i]; +#endif + } else + return -1; + } + __threadfence(); + return outerNeighbors().push_back(t); + } + + __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + if (tracks().empty()) { + auto i = cellTracks.extend(); // maybe waisted.... 
+ if (i > 0) { + cellTracks[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellTracks[0]); + atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... +#else + theTracks = &cellTracks[i]; +#endif + } else + return -1; + } + __threadfence(); + return tracks().push_back(t); + } + + __device__ __forceinline__ CellTracks& tracks() { return *theTracks; } + __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } + __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } + __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } + __device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const& hh) const { return theInnerZ; } + // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const& hh) const { return theInnerR; } + // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float get_outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } + + __device__ __forceinline__ auto get_inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } + __device__ __forceinline__ auto get_outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } + + __device__ __forceinline__ float get_inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } + __device__ __forceinline__ float get_outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } + + constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } + constexpr unsigned int get_outer_hit_id() const { return theOuterHitId; } + + __device__ void print_cell() const { + printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d \n", + theDoubletId, + theLayerPairId, + theInnerHitId, + theOuterHitId); + } + + __device__ bool check_alignment(Hits const& hh, + GPUCACell const& otherCell, + const float ptmin, + const float hardCurvCut, + const float CAThetaCutBarrel, + const float CAThetaCutForward, + const float dcaCutInnerTriplet, + const float dcaCutOuterTriplet) const { + // detIndex of the layerStart for the Phase1 Pixel Detector: + // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] + // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; + auto ri = get_inner_r(hh); + auto zi = get_inner_z(hh); + + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); + + auto r1 = otherCell.get_inner_r(hh); + auto z1 = otherCell.get_inner_z(hh); + auto isBarrel = otherCell.get_outer_detIndex(hh) < last_barrel_detIndex; + bool aligned = areAlignedRZ(r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? 
CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + return (aligned && + dcaCut(hh, + otherCell, + otherCell.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, + hardCurvCut)); // FIXME tune cuts + } + + __device__ __forceinline__ static bool areAlignedRZ( + float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, const float thetaCut) { + float radius_diff = std::abs(r1 - ro); + float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo); + + float pMin = ptmin * std::sqrt(distance_13_squared); // this needs to be divided by + // radius_diff later + + float tan_12_13_half_mul_distance_13_squared = fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); + return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; + } + + __device__ inline bool dcaCut(Hits const& hh, + GPUCACell const& otherCell, + const float region_origin_radius_plus_tolerance, + const float maxCurv) const { + auto x1 = otherCell.get_inner_x(hh); + auto y1 = otherCell.get_inner_y(hh); + + auto x2 = get_inner_x(hh); + auto y2 = get_inner_y(hh); + + auto x3 = get_outer_x(hh); + auto y3 = get_outer_y(hh); + + CircleEq eq(x1, y1, x2, y2, x3, y3); + + if (eq.curvature() > maxCurv) + return false; + + return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); + } + + __device__ __forceinline__ static bool dcaCutH(float x1, + float y1, + float x2, + float y2, + float x3, + float y3, + const float region_origin_radius_plus_tolerance, + const float maxCurv) { + CircleEq eq(x1, y1, x2, y2, x3, y3); + + if (eq.curvature() > maxCurv) + return false; + + return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); + } + + __device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const { + constexpr uint32_t max_ladder_bpx0 = 12; + constexpr uint32_t first_ladder_bpx0 = 0; + constexpr float module_length = 6.7f; + constexpr float module_tolerance = 0.4f; // projection to cylinder is inaccurate on BPIX1 + int p = innerCell.get_inner_iphi(hh); + if (p < 0) + p += std::numeric_limits::max(); + p = (max_ladder_bpx0 * p) / std::numeric_limits::max(); + p %= max_ladder_bpx0; + auto il = first_ladder_bpx0 + p; + auto r0 = hh.averageGeometry().ladderR[il]; + auto ri = innerCell.get_inner_r(hh); + auto zi = innerCell.get_inner_z(hh); + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); + auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri); + auto z_in_ladder = std::abs(z0 - hh.averageGeometry().ladderZ[il]); + auto z_in_module = z_in_ladder - module_length * int(z_in_ladder / module_length); + auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); + return gap; + } + + __device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const { + constexpr uint32_t max_ladder_bpx4 = 64; + constexpr uint32_t first_ladder_bpx4 = 84; + // constexpr float radius_even_ladder = 15.815f; + // constexpr float radius_odd_ladder = 16.146f; + constexpr float module_length = 6.7f; + constexpr float module_tolerance = 0.2f; + // constexpr float barrel_z_length = 26.f; + // constexpr float forward_z_begin = 32.f; + int p = get_outer_iphi(hh); + if (p < 0) + p += std::numeric_limits::max(); + p = (max_ladder_bpx4 * p) / std::numeric_limits::max(); + p %= max_ladder_bpx4; + auto il = first_ladder_bpx4 + p; + auto r4 = hh.averageGeometry().ladderR[il]; + auto ri = innerCell.get_inner_r(hh); + 
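+    // note: p above folds the outer hit's iphi onto one of the 64 BPIX4 ladders; r4 and the
+    // ladderZ / ladderMinZ / ladderMaxZ / endCapZ entries used below describe the average
+    // geometry of that ladder, against which the doublet is extrapolated to look for module gaps.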
auto zi = innerCell.get_inner_z(hh); + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); + auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri); + auto z_in_ladder = std::abs(z4 - hh.averageGeometry().ladderZ[il]); + auto z_in_module = z_in_ladder - module_length * int(z_in_ladder / module_length); + auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); + auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0]; + auto holeN = z4 < hh.averageGeometry().ladderMinZ[il] && z4 > hh.averageGeometry().endCapZ[1]; + return gap || holeP || holeN; + } + + // trying to free the track building process from hardcoded layers, leaving + // the visit of the graph based on the neighborhood connections between cells. + __device__ inline void find_ntuplets(Hits const& hh, + GPUCACell* __restrict__ cells, + CellTracksVector& cellTracks, + HitContainer& foundNtuplets, + cms::cuda::AtomicPairCounter& apc, + Quality* __restrict__ quality, + TmpTuple& tmpNtuplet, + const unsigned int minHitsPerNtuplet, + bool startAt0) const { + // the building process for a track ends if: + // it has no right neighbor + // it has no compatible neighbor + // the ntuplets is then saved if the number of hits it contains is greater + // than a threshold + + tmpNtuplet.push_back_unsafe(theDoubletId); + assert(tmpNtuplet.size() <= 4); + + bool last = true; + for (int j = 0; j < outerNeighbors().size(); ++j) { + auto otherCell = outerNeighbors()[j]; + if (cells[otherCell].theDoubletId < 0) + continue; // killed by earlyFishbone + last = false; + cells[otherCell].find_ntuplets( + hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0); + } + if (last) { // if long enough save... + if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { +#ifdef ONLY_TRIPLETS_IN_HOLE + // triplets accepted only pointing to the hole + if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) || + ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) +#endif + { + hindex_type hits[6]; + auto nh = 0U; + for (auto c : tmpNtuplet) { + hits[nh++] = cells[c].theInnerHitId; + } + hits[nh] = theOuterHitId; + auto it = foundNtuplets.bulkFill(apc, hits, tmpNtuplet.size() + 1); + if (it >= 0) { // if negative is overflow.... 
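+          // a tuple slot was obtained from bulkFill: register the new track index on every
+          // cell of the chain (so the duplicate-removal kernels can see it) and start it as
+          // "bad" until the classification kernels promote it.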
+ for (auto c : tmpNtuplet) + cells[c].addTrack(it, cellTracks); + quality[it] = bad; // initialize to bad + } + } + } + } + tmpNtuplet.pop_back(); + assert(tmpNtuplet.size() < 4); + } + +private: + CellNeighbors* theOuterNeighbors; + CellTracks* theTracks; + +public: + int32_t theDoubletId; + int16_t theLayerPairId; + uint16_t theUsed; // tbd + +private: + float theInnerZ; + float theInnerR; + hindex_type theInnerHitId; + hindex_type theOuterHitId; +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h diff --git a/src/cudacompat/plugin-PixelTriplets/HelixFitOnGPU.cc b/src/cudacompat/plugin-PixelTriplets/HelixFitOnGPU.cc new file mode 100644 index 000000000..bae8a88b6 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/HelixFitOnGPU.cc @@ -0,0 +1,16 @@ +#include "CUDACore/cudaCheck.h" +#include "HelixFitOnGPU.h" + +void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, + TupleMultiplicity const *tupleMultiplicity, + OutputSoA *helix_fit_results) { + tuples_d = tuples; + tupleMultiplicity_d = tupleMultiplicity; + outputSoa_d = helix_fit_results; + + assert(tuples_d); + assert(tupleMultiplicity_d); + assert(outputSoa_d); +} + +void HelixFitOnGPU::deallocateOnGPU() {} diff --git a/src/cudacompat/plugin-PixelTriplets/HelixFitOnGPU.h b/src/cudacompat/plugin-PixelTriplets/HelixFitOnGPU.h new file mode 100644 index 000000000..77ce7719d --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/HelixFitOnGPU.h @@ -0,0 +1,68 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h +#define RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h + +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" + +#include "CAConstants.h" +#include "FitResult.h" + +namespace Rfit { + // in case of memory issue can be made smaller + constexpr uint32_t maxNumberOfConcurrentFits() { return CAConstants::maxNumberOfTuples(); } + constexpr uint32_t stride() { return maxNumberOfConcurrentFits(); } + using Matrix3x4d = Eigen::Matrix; + using Map3x4d = Eigen::Map >; + using Matrix6x4f = Eigen::Matrix; + using Map6x4f = Eigen::Map >; + + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()> >; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()> >; + // fast fit + using Map4d = Eigen::Map >; + +} // namespace Rfit + +class HelixFitOnGPU { +public: + using HitsView = TrackingRecHit2DSOAView; + + using Tuples = pixelTrack::HitContainer; + using OutputSoA = pixelTrack::TrackSoA; + + using TupleMultiplicity = CAConstants::TupleMultiplicity; + + explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} + ~HelixFitOnGPU() { deallocateOnGPU(); } + + void setBField(double bField) { bField_ = bField; } + void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + + void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + + void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void deallocateOnGPU(); + +private: + static constexpr uint32_t maxNumberOfConcurrentFits_ = 
Rfit::maxNumberOfConcurrentFits(); + + // fowarded + Tuples const *tuples_d = nullptr; + TupleMultiplicity const *tupleMultiplicity_d = nullptr; + OutputSoA *outputSoa_d; + float bField_; + + const bool fit5as4_; +}; + +#endif // RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h diff --git a/src/cudacompat/plugin-PixelTriplets/RiemannFit.h b/src/cudacompat/plugin-PixelTriplets/RiemannFit.h new file mode 100644 index 000000000..994b1dcf9 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/RiemannFit.h @@ -0,0 +1,1005 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h +#define RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h + +#include "FitUtils.h" + +namespace Rfit { + + /*! Compute the Radiation length in the uniform hypothesis + * + * The Pixel detector, barrel and forward, is considered as an omogeneous + * cilinder of material, whose radiation lengths has been derived from the TDR + * plot that shows that 16cm correspond to 0.06 radiation lengths. Therefore + * one radiation length corresponds to 16cm/0.06 =~ 267 cm. All radiation + * lengths are computed using this unique number, in both regions, barrel and + * endcap. + * + * NB: no angle corrections nor projections are computed inside this routine. + * It is therefore the responsibility of the caller to supply the proper + * lengths in input. These lenghts are the path travelled by the particle along + * its trajectory, namely the so called S of the helix in 3D space. + * + * \param length_values vector of incremental distances that will be translated + * into radiation length equivalent. Each radiation length i is computed + * incrementally with respect to the previous length i-1. The first lenght has + * no reference point (i.e. it has the dca). + * + * \return incremental radiation lengths that correspond to each segment. + */ + + template + __host__ __device__ inline void computeRadLenUniformMaterial(const VNd1& length_values, VNd2& rad_lengths) { + // Radiation length of the pixel detector in the uniform assumption, with + // 0.06 rad_len at 16 cm + constexpr double XX_0_inv = 0.06 / 16.; + u_int n = length_values.rows(); + rad_lengths(0) = length_values(0) * XX_0_inv; + for (u_int j = 1; j < n; ++j) { + rad_lengths(j) = std::abs(length_values(j) - length_values(j - 1)) * XX_0_inv; + } + } + + /*! + \brief Compute the covariance matrix along cartesian S-Z of points due to + multiple Coulomb scattering to be used in the line_fit, for the barrel + and forward cases. + The input covariance matrix is in the variables s-z, original and + unrotated. + The multiple scattering component is computed in the usual linear + approximation, using the 3D path which is computed as the squared root of + the squared sum of the s and z components passed in. + Internally a rotation by theta is performed and the covariance matrix + returned is the one in the direction orthogonal to the rotated S3D axis, + i.e. along the rotated Z axis. + The choice of the rotation is not arbitrary, but derived from the fact that + putting the horizontal axis along the S3D direction allows the usage of the + ordinary least squared fitting techiques with the trivial parametrization y + = mx + q, avoiding the patological case with m = +/- inf, that would + correspond to the case at eta = 0. 
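+    The per-step scattering variance follows the usual Highland-style parametrisation,
+    sigma^2 = (0.015 GeV / p)^2 * (x/X_0) * (1 + 0.038 * ln(x/X_0))^2,
+    which is what the body below implements (the constant 0.000225 is 0.015^2).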
+ */ + + template + __host__ __device__ inline auto Scatter_cov_line(Matrix2d const* cov_sz, + const V4& fast_fit, + VNd1 const& s_arcs, + VNd2 const& z_values, + const double theta, + const double B, + MatrixNd& ret) { +#ifdef RFIT_DEBUG + Rfit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); +#endif + constexpr u_int n = N; + double p_t = std::min(20., fast_fit(2) * B); // limit pt to avoid too small error!!! + double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); + VectorNd rad_lengths_S; + // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html + // Basically, to perform cwise operations on Matrices and Vectors, you need + // to transform them into Array-like objects. + VectorNd S_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); + S_values = S_values.array().sqrt(); + computeRadLenUniformMaterial(S_values, rad_lengths_S); + VectorNd sig2_S; + sig2_S = .000225 / p_2 * (1. + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); +#ifdef RFIT_DEBUG + Rfit::printIt(cov_sz, "Scatter_cov_line - cov_sz: "); +#endif + Matrix2Nd tmp = Matrix2Nd::Zero(); + for (u_int k = 0; k < n; ++k) { + tmp(k, k) = cov_sz[k](0, 0); + tmp(k + n, k + n) = cov_sz[k](1, 1); + tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1); + } + for (u_int k = 0; k < n; ++k) { + for (u_int l = k; l < n; ++l) { + for (u_int i = 0; i < std::min(k, l); ++i) { + tmp(k + n, l + n) += std::abs(S_values(k) - S_values(i)) * std::abs(S_values(l) - S_values(i)) * sig2_S(i); + } + tmp(l + n, k + n) = tmp(k + n, l + n); + } + } + // We are interested only in the errors orthogonal to the rotated s-axis + // which, in our formalism, are in the lower square matrix. +#ifdef RFIT_DEBUG + Rfit::printIt(&tmp, "Scatter_cov_line - tmp: "); +#endif + ret = tmp.block(n, n, n, n); + } + + /*! + \brief Compute the covariance matrix (in radial coordinates) of points in + the transverse plane due to multiple Coulomb scattering. + \param p2D 2D points in the transverse plane. + \param fast_fit fast_fit Vector4d result of the previous pre-fit + structured in this form:(X0, Y0, R, Tan(Theta))). + \param B magnetic field use to compute p + \return scatter_cov_rad errors due to multiple scattering. + \warning input points must be ordered radially from the detector center + (from inner layer to outer ones; points on the same layer must ordered too). + \details Only the tangential component is computed (the radial one is + negligible). + */ + template + __host__ __device__ inline MatrixNd Scatter_cov_rad(const M2xN& p2D, + const V4& fast_fit, + VectorNd const& rad, + double B) { + constexpr u_int n = N; + double p_t = std::min(20., fast_fit(2) * B); // limit pt to avoid too small error!!! + double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); + double theta = atan(fast_fit(3)); + theta = theta < 0. ? theta + M_PI : theta; + VectorNd s_values; + VectorNd rad_lengths; + const Vector2d o(fast_fit(0), fast_fit(1)); + + // associated Jacobian, used in weights and errors computation + for (u_int i = 0; i < n; ++i) { // x + Vector2d p = p2D.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + const double atan2_ = atan2(cross, dot); + s_values(i) = std::abs(atan2_ * fast_fit(2)); + } + computeRadLenUniformMaterial(s_values * sqrt(1. + 1. / (fast_fit(3) * fast_fit(3))), rad_lengths); + MatrixNd scatter_cov_rad = MatrixNd::Zero(); + VectorNd sig2 = (1. 
+ 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); + sig2 *= 0.000225 / (p_2 * sqr(sin(theta))); + for (u_int k = 0; k < n; ++k) { + for (u_int l = k; l < n; ++l) { + for (u_int i = 0; i < std::min(k, l); ++i) { + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i); + } + scatter_cov_rad(l, k) = scatter_cov_rad(k, l); + } + } +#ifdef RFIT_DEBUG + Rfit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); +#endif + return scatter_cov_rad; + } + + /*! + \brief Transform covariance matrix from radial (only tangential component) + to Cartesian coordinates (only transverse plane component). + \param p2D 2D points in the transverse plane. + \param cov_rad covariance matrix in radial coordinates. + \return cov_cart covariance matrix in Cartesian coordinates. +*/ + + template + __host__ __device__ inline Matrix2Nd cov_radtocart(const M2xN& p2D, + const MatrixNd& cov_rad, + const VectorNd& rad) { +#ifdef RFIT_DEBUG + printf("Address of p2D: %p\n", &p2D); +#endif + printIt(&p2D, "cov_radtocart - p2D:"); + constexpr u_int n = N; + Matrix2Nd cov_cart = Matrix2Nd::Zero(); + VectorNd rad_inv = rad.cwiseInverse(); + printIt(&rad_inv, "cov_radtocart - rad_inv:"); + for (u_int i = 0; i < n; ++i) { + for (u_int j = i; j < n; ++j) { + cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(j, i) = cov_cart(i, j); + cov_cart(j + n, i + n) = cov_cart(i + n, j + n); + cov_cart(j + n, i) = cov_cart(i, j + n); + cov_cart(j, i + n) = cov_cart(i + n, j); + } + } + return cov_cart; + } + + /*! + \brief Transform covariance matrix from Cartesian coordinates (only + transverse plane component) to radial coordinates (both radial and + tangential components, but only diagonal terms: correlations between different + points are not managed). + \param p2D 2D points in transverse plane. + \param cov_cart covariance matrix in Cartesian coordinates. + \return cov_rad covariance matrix in radial coordinates. + \warning correlations between different points are not computed. +*/ + template + __host__ __device__ inline VectorNd cov_carttorad(const M2xN& p2D, + const Matrix2Nd& cov_cart, + const VectorNd& rad) { + constexpr u_int n = N; + VectorNd cov_rad; + const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); + for (u_int i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); + else { + cov_rad(i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - + 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); + } + } + return cov_rad; + } + + /*! + \brief Transform covariance matrix from Cartesian coordinates (only + transverse plane component) to the coordinate system orthogonal to the + pre-fitted circle in each point. + Further information in attached documentation. + \param p2D 2D points in transverse plane. + \param cov_cart covariance matrix in Cartesian coordinates. + \param fast_fit Vector4d result of the previous pre-fit + structured in this form: (X0, Y0, R, tan(theta)). + \return cov_rad covariance matrix in the pre-fitted circle's + orthogonal system.
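+
+  Equivalently (this is only a rewriting of the expression implemented below,
+  spelled out for clarity): with C_i the 2x2 Cartesian covariance of point i
+  and u = (1, tan_c) / sqrt(1 + tan_c^2), the returned element is the
+  quadratic form
+    cov_rad(i) = u^T C_i u
+               = (C_xx + C_yy * tan_c^2 + 2 * C_xy * tan_c) / (1 + tan_c^2),
+  i.e. the variance of point i projected onto the direction defined by tan_c.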
+*/ + template + __host__ __device__ inline VectorNd cov_carttorad_prefit(const M2xN& p2D, + const Matrix2Nd& cov_cart, + V4& fast_fit, + const VectorNd& rad) { + constexpr u_int n = N; + VectorNd cov_rad; + for (u_int i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); // TO FIX + else { + Vector2d a = p2D.col(i); + Vector2d b = p2D.col(i) - fast_fit.head(2); + const double x2 = a.dot(b); + const double y2 = cross2D(a, b); + const double tan_c = -y2 / x2; + const double tan_c2 = sqr(tan_c); + cov_rad(i) = + 1. / (1. + tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); + } + } + return cov_rad; + } + + /*! + \brief Compute the points' weight vector for the circle fit when multiple + scattering is managed. + Further information in attached documentation. + \param cov_rad_inv covariance matrix inverse in radial coordinates + (or, better, in the pre-fitted circle's orthogonal system). + \return weight VectorNd the points' weight vector. + \bug I'm not sure this is the right way to compute the weights for a non- + diagonal cov matrix. Further investigation needed. +*/ + + template + __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { + return cov_rad_inv.colwise().sum().transpose(); + } + + /*! + \brief Find the particle charge q considering the sign of the cross product between + the particle's velocity (estimated by the first 2 hits) and the vector radius + between the first hit and the center of the fitted circle. + \param p2D 2D points in transverse plane. + \param par_uvr result of the circle fit in this form: (X0,Y0,R). + \return q int 1 or -1. +*/ + template + __host__ __device__ inline int32_t Charge(const M2xN& p2D, const Vector3d& par_uvr) { + return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > + 0) + ? -1 + : 1; + } + + /*! + \brief Compute the eigenvector associated to the minimum eigenvalue. + \param A the matrix whose eigenvector and eigenvalue are sought. + \param chi2 the double where the chi2-related quantity will be stored. + \return the eigenvector associated to the minimum eigenvalue. + \warning double precision is needed for a correct assessment of chi2. + \details The minimum eigenvalue is related to chi2. + We exploit the fact that the matrix is symmetrical and small (2x2 for line + fit and 3x3 for circle fit), so the SelfAdjointEigenSolver from the Eigen + library is used, with the computeDirect method (available only for 2x2 + and 3x3 matrices) which computes the eigendecomposition of the given matrix + using a fast closed-form algorithm. + For this optimization the matrix type must be known at compile time. +*/ + + __host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { +#ifdef RFIT_DEBUG + printf("min_eigen3D - enter\n"); +#endif + Eigen::SelfAdjointEigenSolver solver(3); + solver.computeDirect(A); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); +#ifdef RFIT_DEBUG + printf("min_eigen3D - exit\n"); +#endif + return solver.eigenvectors().col(min_index); + } + + /*! + \brief A faster version of min_eigen3D() where double precision is not + needed. + \param A the matrix whose eigenvector and eigenvalue are sought. + \param chi2 the double where the chi2-related quantity will be stored + \return the eigenvector associated to the minimum eigenvalue.
+ \detail The computeDirect() method of SelfAdjointEigenSolver for a 3x3 matrix + uses trigonometric functions (it solves a third-degree equation), which are + faster in single precision. +*/ + + __host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) { + Eigen::SelfAdjointEigenSolver solver(3); + solver.computeDirect(A.cast()); + int min_index; + solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index).cast(); + } + + /*! + \brief 2D version of min_eigen3D(). + \param A the matrix whose eigenvector and eigenvalue are sought. + \param chi2 the double where the chi2-related quantity will be stored + \return the eigenvector associated to the minimum eigenvalue. + \detail The computeDirect() method of SelfAdjointEigenSolver for a 2x2 matrix + does not use special math functions (just sqrt), therefore it does not speed up + significantly in single precision. +*/ + + __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { + Eigen::SelfAdjointEigenSolver solver(2); + solver.computeDirect(A); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index); + } + + /*! + \brief A very fast helix fit: it fits a circle through three points (first, middle + and last) and a line through two points (first and last). + \param hits points to be fitted + \return result in this form: (X0,Y0,R,tan(theta)). + \warning points must be passed ordered (from the innermost layer to the outermost) in + order to maximize accuracy and not to mistake the sign of tan(theta). + \details This fast fit is used as a pre-fit which is needed for: + - weights estimation and chi2 computation in the line fit (fundamental); + - weights estimation and chi2 computation in the circle fit (useful); + - computation of the error due to multiple scattering. +*/ + + template + __host__ __device__ inline void Fast_fit(const M3xN& hits, V4& result) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; // get the number of hits + printIt(&hits, "Fast_fit - hits: "); + + // CIRCLE FIT + // Make segments between middle-to-first (b) and last-to-first (c) hits + const Vector2d b = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Vector2d c = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); + printIt(&b, "Fast_fit - b: "); + printIt(&c, "Fast_fit - c: "); + // Compute their lengths + auto b2 = b.squaredNorm(); + auto c2 = c.squaredNorm(); + // The algebra has been verified (MR). The usual approach has been followed: + // * use an orthogonal reference frame passing through the first point. + // * build the segments (chords) + // * build orthogonal lines through mid points + // * make a system and solve for X0 and Y0. + // * add the initial point + bool flip = abs(b.x()) < abs(b.y()); + auto bx = flip ? b.y() : b.x(); + auto by = flip ? b.x() : b.y(); + auto cx = flip ? c.y() : c.x(); + auto cy = flip ? c.x() : c.y(); + //!< in case b.x is 0 (2 hits with same x) + auto div = 2. * (cx * by - bx * cy); + // if aligned TO FIX + auto Y0 = (cx * b2 - bx * c2) / div; + auto X0 = (0.5 * b2 - Y0 * by) / bx; + result(0) = hits(0, 0) + (flip ? Y0 : X0); + result(1) = hits(1, 0) + (flip ?
X0 : Y0); + result(2) = sqrt(sqr(X0) + sqr(Y0)); + printIt(&result, "Fast_fit - result: "); + + // LINE FIT + const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + printIt(&e, "Fast_fit - e: "); + printIt(&d, "Fast_fit - d: "); + // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) + auto dr = result(2) * atan2(cross2D(d, e), d.dot(e)); + // Simple difference in Z between last and first hit + auto dz = hits(2, n - 1) - hits(2, 0); + + result(3) = (dr / dz); + +#ifdef RFIT_DEBUG + printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); +#endif + } + + /*! + \brief Fit a generic number of 2D points with a circle using Riemann-Chernov + algorithm. Covariance matrix of fitted parameter is optionally computed. + Multiple scattering (currently only in barrel layer) is optionally handled. + \param hits2D 2D points to be fitted. + \param hits_cov2D covariance matrix of 2D points. + \param fast_fit pre-fit result in this form: (X0,Y0,R,tan(theta)). + (tan(theta) is not used). + \param B magnetic field + \param error flag for error computation. + \param scattering flag for multiple scattering + \return circle circle_fit: + -par parameter of the fitted circle in this form (X0,Y0,R); \n + -cov covariance matrix of the fitted parameter (not initialized if + error = false); \n + -q charge of the particle; \n + -chi2. + \warning hits must be passed ordered from inner to outer layer (double hits + on the same layer must be ordered too) so that multiple scattering is + treated properly. + \warning Multiple scattering for barrel is still not tested. + \warning Multiple scattering for endcap hits is not handled (yet). Do not + fit endcap hits with scattering = true ! + \bug for small pt (<0.3 Gev/c) chi2 could be slightly underestimated. + \bug further investigation needed for error propagation with multiple + scattering. +*/ + template + __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, + const Matrix2Nd& hits_cov2D, + const V4& fast_fit, + const VectorNd& rad, + const double B, + const bool error) { +#ifdef RFIT_DEBUG + printf("circle_fit - enter\n"); +#endif + // INITIALIZATION + Matrix2Nd V = hits_cov2D; + constexpr u_int n = N; + printIt(&hits2D, "circle_fit - hits2D:"); + printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); + +#ifdef RFIT_DEBUG + printf("circle_fit - WEIGHT COMPUTATION\n"); +#endif + // WEIGHT COMPUTATION + VectorNd weight; + MatrixNd G; + double renorm; + { + MatrixNd cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad).asDiagonal(); + MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); + printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); + printIt(&hits2D, "circle_fit - hits2D bis:"); +#ifdef RFIT_DEBUG + printf("Address of hits2D: a) %p\n", &hits2D); +#endif + V += cov_radtocart(hits2D, scatter_cov_rad, rad); + printIt(&V, "circle_fit - V:"); + cov_rad += scatter_cov_rad; + printIt(&cov_rad, "circle_fit - cov_rad:"); + math::cholesky::invert(cov_rad, G); + // G = cov_rad.inverse(); + renorm = G.sum(); + G *= 1. 
/ renorm; + weight = Weight_circle(G); + } + printIt(&weight, "circle_fit - weight:"); + + // SPACE TRANSFORMATION +#ifdef RFIT_DEBUG + printf("circle_fit - SPACE TRANSFORMATION\n"); +#endif + + // center +#ifdef RFIT_DEBUG + printf("Address of hits2D: b) %p\n", &hits2D); +#endif + const Vector2d h_ = hits2D.rowwise().mean(); // centroid + printIt(&h_, "circle_fit - h_:"); + Matrix3xNd p3D; + p3D.block(0, 0, 2, n) = hits2D.colwise() - h_; + printIt(&p3D, "circle_fit - p3D: a)"); + Vector2Nd mc; // centered hits, used in error computation + mc << p3D.row(0).transpose(), p3D.row(1).transpose(); + printIt(&mc, "circle_fit - mc(centered hits):"); + + // scale + const double q = mc.squaredNorm(); + const double s = sqrt(n * 1. / q); // scaling factor + p3D *= s; + + // project on paraboloid + p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); + printIt(&p3D, "circle_fit - p3D: b)"); + +#ifdef RFIT_DEBUG + printf("circle_fit - COST FUNCTION\n"); +#endif + // COST FUNCTION + + // compute + Vector3d r0; + r0.noalias() = p3D * weight; // center of gravity + const Matrix3xNd X = p3D.colwise() - r0; + Matrix3d A = X * G * X.transpose(); + printIt(&A, "circle_fit - A:"); + +#ifdef RFIT_DEBUG + printf("circle_fit - MINIMIZE\n"); +#endif + // minimize + double chi2; + Vector3d v = min_eigen3D(A, chi2); +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN\n"); +#endif + printIt(&v, "v BEFORE INVERSION"); + v *= (v(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 + printIt(&v, "v AFTER INVERSION"); + // This hack to be able to run on GPU where the automatic assignment to a + // double from the vector multiplication is not working. +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 1\n"); +#endif + Eigen::Matrix cm; +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 2\n"); +#endif + cm = -v.transpose() * r0; +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 3\n"); +#endif + const double c = cm(0, 0); + // const double c = -v.transpose() * r0; + +#ifdef RFIT_DEBUG + printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); +#endif + // COMPUTE CIRCLE PARAMETER + + // auxiliary quantities + const double h = sqrt(1. - sqr(v(2)) - 4. * c * v(2)); + const double v2x2_inv = 1. / (2. * v(2)); + const double s_inv = 1. / s; + Vector3d par_uvr_; // used in error propagation + par_uvr_ << -v(0) * v2x2_inv, -v(1) * v2x2_inv, h * v2x2_inv; + + circle_fit circle; + circle.par << par_uvr_(0) * s_inv + h_(0), par_uvr_(1) * s_inv + h_(1), par_uvr_(2) * s_inv; + circle.q = Charge(hits2D, circle.par); + circle.chi2 = abs(chi2) * renorm * 1. / sqr(2 * v(2) * par_uvr_(2) * s); + printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); + printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); +#ifdef RFIT_DEBUG + printf("circle_fit - CIRCLE CHARGE: %d\n", circle.q); +#endif + +#ifdef RFIT_DEBUG + printf("circle_fit - ERROR PROPAGATION\n"); +#endif + // ERROR PROPAGATION + if (error) { +#ifdef RFIT_DEBUG + printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); +#endif + ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points + MatrixNd C[3][3]; // cov matrix of 3D transformed points +#ifdef RFIT_DEBUG + printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); +#endif + { + Eigen::Matrix cm; + Eigen::Matrix cm2; + cm = mc.transpose() * V * mc; + const double c = cm(0, 0); + Matrix2Nd Vcs; + Vcs.template triangularView() = + (sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * + (2. * V.squaredNorm() + 4. 
* c) * // mc.transpose() * V * mc) * + (mc * mc.transpose())); + + printIt(&Vcs, "circle_fit - Vcs:"); + C[0][0] = Vcs.block(0, 0, n, n).template selfadjointView(); + Vcs_[0][1] = Vcs.block(0, n, n, n); + C[1][1] = Vcs.block(n, n, n, n).template selfadjointView(); + Vcs_[1][0] = Vcs_[0][1].transpose(); + printIt(&Vcs, "circle_fit - Vcs:"); + } + + { + const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); + const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); + const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); + const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); + const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); + const ArrayNd t10 = t01.transpose(); + Vcs_[0][0] = C[0][0]; + ; + C[0][1] = Vcs_[0][1]; + C[0][2] = 2. * (Vcs_[0][0] * t0 + Vcs_[0][1] * t1); + Vcs_[1][1] = C[1][1]; + C[1][2] = 2. * (Vcs_[1][0] * t0 + Vcs_[1][1] * t1); + MatrixNd tmp; + tmp.template triangularView() = + (2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + + Vcs_[1][1] * Vcs_[1][1]) + + 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11)) + .matrix(); + C[2][2] = tmp.template selfadjointView(); + } + printIt(&C[0][0], "circle_fit - C[0][0]:"); + + Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) + for (u_int i = 0; i < 3; ++i) { + for (u_int j = i; j < 3; ++j) { + Eigen::Matrix tmp; + tmp = weight.transpose() * C[i][j] * weight; + const double c = tmp(0, 0); + C0(i, j) = c; //weight.transpose() * C[i][j] * weight; + C0(j, i) = C0(i, j); + } + } + printIt(&C0, "circle_fit - C0:"); + + const MatrixNd W = weight * weight.transpose(); + const MatrixNd H = MatrixNd::Identity().rowwise() - weight.transpose(); + const MatrixNx3d s_v = H * p3D.transpose(); + printIt(&W, "circle_fit - W:"); + printIt(&H, "circle_fit - H:"); + printIt(&s_v, "circle_fit - s_v:"); + + MatrixNd D_[3][3]; // cov(s_v) + { + D_[0][0] = (H * C[0][0] * H.transpose()).cwiseProduct(W); + D_[0][1] = (H * C[0][1] * H.transpose()).cwiseProduct(W); + D_[0][2] = (H * C[0][2] * H.transpose()).cwiseProduct(W); + D_[1][1] = (H * C[1][1] * H.transpose()).cwiseProduct(W); + D_[1][2] = (H * C[1][2] * H.transpose()).cwiseProduct(W); + D_[2][2] = (H * C[2][2] * H.transpose()).cwiseProduct(W); + D_[1][0] = D_[0][1].transpose(); + D_[2][0] = D_[0][2].transpose(); + D_[2][1] = D_[1][2].transpose(); + } + printIt(&D_[0][0], "circle_fit - D_[0][0]:"); + + constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; + + Matrix6d E; // cov matrix of the 6 independent elements of A + for (u_int a = 0; a < 6; ++a) { + const u_int i = nu[a][0], j = nu[a][1]; + for (u_int b = a; b < 6; ++b) { + const u_int k = nu[b][0], l = nu[b][1]; + VectorNd t0(n); + VectorNd t1(n); + if (l == k) { + t0 = 2. * D_[j][l] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = 2. * D_[i][l] * s_v.col(l); + } else { + t0 = D_[j][l] * s_v.col(k) + D_[j][k] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = D_[i][l] * s_v.col(k) + D_[i][k] * s_v.col(l); + } + + if (i == j) { + Eigen::Matrix cm; + cm = s_v.col(i).transpose() * (t0 + t1); + const double c = cm(0, 0); + E(a, b) = 0. + c; + } else { + Eigen::Matrix cm; + cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + const double c = cm(0, 0); + E(a, b) = 0. 
+ c; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + } + if (b != a) + E(b, a) = E(a, b); + } + } + printIt(&E, "circle_fit - E:"); + + Eigen::Matrix J2; // Jacobian of min_eigen() (numerically computed) + for (u_int a = 0; a < 6; ++a) { + const u_int i = nu[a][0], j = nu[a][1]; + Matrix3d Delta = Matrix3d::Zero(); + Delta(i, j) = Delta(j, i) = abs(A(i, j) * d); + J2.col(a) = min_eigen3D_fast(A + Delta); + const int sign = (J2.col(a)(2) > 0) ? 1 : -1; + J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); + } + printIt(&J2, "circle_fit - J2:"); + + Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) + { + Matrix3d t0 = J2 * E * J2.transpose(); + Vector3d t1 = -t0 * r0; + Cvc.block(0, 0, 3, 3) = t0; + Cvc.block(0, 3, 3, 1) = t1; + Cvc.block(3, 0, 1, 3) = t1.transpose(); + Eigen::Matrix cm1; + Eigen::Matrix cm3; + cm1 = (v.transpose() * C0 * v); + // cm2 = (C0.cwiseProduct(t0)).sum(); + cm3 = (r0.transpose() * t0 * r0); + const double c = cm1(0, 0) + (C0.cwiseProduct(t0)).sum() + cm3(0, 0); + Cvc(3, 3) = c; + // (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + } + printIt(&Cvc, "circle_fit - Cvc:"); + + Eigen::Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + { + const double t = 1. / h; + J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, + v(0) * v2x2_inv * t, v(1) * v2x2_inv * t, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; + } + printIt(&J3, "circle_fit - J3:"); + + const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) + printIt(&Jq, "circle_fit - Jq:"); + + Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) + + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); + + circle.cov = cov_uvr; + } + + printIt(&circle.cov, "Circle cov:"); +#ifdef RFIT_DEBUG + printf("circle_fit - exit\n"); +#endif + return circle; + } + + /*! \brief Perform an ordinary least square fit in the s-z plane to compute + * the parameters cotTheta and Zip. + * + * The fit is performed in the rotated S3D-Z' plane, following the formalism of + * Frodesen, Chapter 10, p. 259. + * + * The system has been rotated to both try to use the combined errors in s-z + * along Z', as errors in the Y direction and to avoid the patological case of + * degenerate lines with angular coefficient m = +/- inf. + * + * The rotation is using the information on the theta angle computed in the + * fast fit. The rotation is such that the S3D axis will be the X-direction, + * while the rotated Z-axis will be the Y-direction. This pretty much follows + * what is done in the same fit in the Broken Line approach. + */ + + template + __host__ __device__ inline line_fit Line_fit(const M3xN& hits, + const M6xN& hits_ge, + const circle_fit& circle, + const V4& fast_fit, + const double B, + const bool error) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; + double theta = -circle.q * atan(fast_fit(3)); + theta = theta < 0. ? 
theta + M_PI : theta; + + // Prepare the Rotation Matrix to rotate the points + Eigen::Matrix rot; + rot << sin(theta), cos(theta), -cos(theta), sin(theta); + + // PROJECTION ON THE CYLINDER + // + // p2D will be: + // [s1, s2, s3, ..., sn] + // [z1, z2, z3, ..., zn] + // s values will be ordinary x-values + // z values will be ordinary y-values + + Matrix2xNd p2D = Matrix2xNd::Zero(); + Eigen::Matrix Jx; + +#ifdef RFIT_DEBUG + printf("Line_fit - B: %g\n", B); + printIt(&hits, "Line_fit points: "); + printIt(&hits_ge, "Line_fit covs: "); + printIt(&rot, "Line_fit rot: "); +#endif + // x & associated Jacobian + // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf + // Slide 11 + // a ==> -o i.e. the origin of the circle in XY plane, negative + // b ==> p i.e. distances of the points wrt the origin of the circle. + const Vector2d o(circle.par(0), circle.par(1)); + + // associated Jacobian, used in weights and errors computation + Matrix6d Cov = Matrix6d::Zero(); + Matrix2d cov_sz[N]; + for (u_int i = 0; i < n; ++i) { + Vector2d p = hits.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + // atan2(cross, dot) gives back the angle in the transverse plane so that the + // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) + const double atan2_ = -circle.q * atan2(cross, dot); + // p2D.coeffRef(1, i) = atan2_ * circle.par(2); + p2D(0, i) = atan2_ * circle.par(2); + + // associated Jacobian, used in weights and errors computation + const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); + double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta + if (error) { + d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); + d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); + d_R = atan2_; + } + const double d_x = temp0 * (o(1) * dot + o(0) * cross); + const double d_y = temp0 * (-o(0) * dot + o(1) * cross); + Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; + + Cov.block(0, 0, 3, 3) = circle.cov; + Cov(3, 3) = hits_ge.col(i)[0]; // x errors + Cov(4, 4) = hits_ge.col(i)[2]; // y errors + Cov(5, 5) = hits_ge.col(i)[5]; // z errors + Cov(3, 4) = Cov(4, 3) = hits_ge.col(i)[1]; // cov_xy + Cov(3, 5) = Cov(5, 3) = hits_ge.col(i)[3]; // cov_xz + Cov(4, 5) = Cov(5, 4) = hits_ge.col(i)[4]; // cov_yz + Matrix2d tmp = Jx * Cov * Jx.transpose(); + cov_sz[i].noalias() = rot * tmp * rot.transpose(); + } + // Math of d_{X0,Y0,R,x,y} all verified by hand + p2D.row(1) = hits.row(2); + + // The following matrix will contain errors orthogonal to the rotated S + // component only, with the Multiple Scattering properly treated!!
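+    // Scatter_cov_line() combines the rotated per-hit covariances computed
+    // above (cov_sz) with the correlated multiple-scattering contributions and
+    // fills cov_with_ms with the N x N covariance along the rotated Z' axis,
+    // i.e. the errors orthogonal to the rotated S3D axis that the line fit
+    // below actually uses.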
+ MatrixNd cov_with_ms; + Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B, cov_with_ms); +#ifdef RFIT_DEBUG + printIt(cov_sz, "line_fit - cov_sz:"); + printIt(&cov_with_ms, "line_fit - cov_with_ms: "); +#endif + + // Rotate Points with the shape [2, n] + Matrix2xNd p2D_rot = rot * p2D; + +#ifdef RFIT_DEBUG + printf("Fast fit Tan(theta): %g\n", fast_fit(3)); + printf("Rotation angle: %g\n", theta); + printIt(&rot, "Rotation Matrix:"); + printIt(&p2D, "Original Hits(s,z):"); + printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); + printIt(&rot, "Rotation Matrix:"); +#endif + + // Build the A Matrix + Matrix2xNd A; + A << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values + +#ifdef RFIT_DEBUG + printIt(&A, "A Matrix:"); +#endif + + // Build A^T V-1 A, where V-1 is the covariance of only the Y components. + MatrixNd Vy_inv; + math::cholesky::invert(cov_with_ms, Vy_inv); + // MatrixNd Vy_inv = cov_with_ms.inverse(); + Eigen::Matrix Cov_params = A * Vy_inv * A.transpose(); + // Compute the Covariance Matrix of the fit parameters + math::cholesky::invert(Cov_params, Cov_params); + + // Now Compute the Parameters in the form [2,1] + // The first component is q. + // The second component is m. + Eigen::Matrix sol = Cov_params * A * Vy_inv * p2D_rot.row(1).transpose(); + +#ifdef RFIT_DEBUG + printIt(&sol, "Rotated solutions:"); +#endif + + // We need now to transfer back the results in the original s-z plane + auto common_factor = 1. / (sin(theta) - sol(1, 0) * cos(theta)); + Eigen::Matrix J; + J << 0., common_factor * common_factor, common_factor, sol(0, 0) * cos(theta) * common_factor * common_factor; + + double m = common_factor * (sol(1, 0) * sin(theta) + cos(theta)); + double q = common_factor * sol(0, 0); + auto cov_mq = J * Cov_params * J.transpose(); + + VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; + double chi2 = res.transpose() * Vy_inv * res; + + line_fit line; + line.par << m, q; + line.cov << cov_mq; + line.chi2 = chi2; + +#ifdef RFIT_DEBUG + printf("Common_factor: %g\n", common_factor); + printIt(&J, "Jacobian:"); + printIt(&sol, "Rotated solutions:"); + printIt(&Cov_params, "Cov_params:"); + printIt(&cov_mq, "Rotated Covariance Matrix:"); + printIt(&(line.par), "Real Parameters:"); + printIt(&(line.cov), "Real Covariance Matrix:"); + printf("Chi2: %g\n", chi2); +#endif + + return line; + } + + /*! + \brief Helix fit by three step: + -fast pre-fit (see Fast_fit() for further info); \n + -circle fit of hits projected in the transverse plane by Riemann-Chernov + algorithm (see Circle_fit() for further info); \n + -line fit of hits projected on cylinder surface by orthogonal distance + regression (see Line_fit for further info). \n + Points must be passed ordered (from inner to outer layer). + \param hits Matrix3xNd hits coordinates in this form: \n + |x0|x1|x2|...|xn| \n + |y0|y1|y2|...|yn| \n + |z0|z1|z2|...|zn| + \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n + |(x0,x0)|(x1,x0)|(x2,x0)|.|(y0,x0)|(y1,x0)|(y2,x0)|.|(z0,x0)|(z1,x0)|(z2,x0)| \n + |(x0,x1)|(x1,x1)|(x2,x1)|.|(y0,x1)|(y1,x1)|(y2,x1)|.|(z0,x1)|(z1,x1)|(z2,x1)| \n + |(x0,x2)|(x1,x2)|(x2,x2)|.|(y0,x2)|(y1,x2)|(y2,x2)|.|(z0,x2)|(z1,x2)|(z2,x2)| \n + . . . . . . . . . . . \n + |(x0,y0)|(x1,y0)|(x2,y0)|.|(y0,y0)|(y1,y0)|(y2,x0)|.|(z0,y0)|(z1,y0)|(z2,y0)| \n + |(x0,y1)|(x1,y1)|(x2,y1)|.|(y0,y1)|(y1,y1)|(y2,x1)|.|(z0,y1)|(z1,y1)|(z2,y1)| \n + |(x0,y2)|(x1,y2)|(x2,y2)|.|(y0,y2)|(y1,y2)|(y2,x2)|.|(z0,y2)|(z1,y2)|(z2,y2)| \n + . . . . . . . . . . . 
\n + |(x0,z0)|(x1,z0)|(x2,z0)|.|(y0,z0)|(y1,z0)|(y2,z0)|.|(z0,z0)|(z1,z0)|(z2,z0)| \n + |(x0,z1)|(x1,z1)|(x2,z1)|.|(y0,z1)|(y1,z1)|(y2,z1)|.|(z0,z1)|(z1,z1)|(z2,z1)| \n + |(x0,z2)|(x1,z2)|(x2,z2)|.|(y0,z2)|(y1,z2)|(y2,z2)|.|(z0,z2)|(z1,z2)|(z2,z2)| + \param B magnetic field in the center of the detector in Gev/cm/c + unit, in order to perform pt calculation. + \param error flag for error computation. + \param scattering flag for multiple scattering treatment. + (see Circle_fit() documentation for further info). + \warning see Circle_fit(), Line_fit() and Fast_fit() warnings. + \bug see Circle_fit(), Line_fit() and Fast_fit() bugs. +*/ + + template + inline helix_fit Helix_fit(const Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double B, + const bool error) { + constexpr u_int n = N; + VectorNd<4> rad = (hits.block(0, 0, 2, n).colwise().norm()); + + // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. + Vector4d fast_fit; + Fast_fit(hits, fast_fit); + Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::loadCovariance2D(hits_ge, hits_cov); + circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), hits_cov, fast_fit, rad, B, error); + line_fit line = Line_fit(hits, hits_ge, circle, fast_fit, B, error); + + par_uvrtopak(circle, B, error); + + helix_fit helix; + helix.par << circle.par, line.par; + if (error) { + helix.cov = MatrixXd::Zero(5, 5); + helix.cov.block(0, 0, 3, 3) = circle.cov; + helix.cov.block(3, 3, 2, 2) = line.cov; + } + helix.q = circle.q; + helix.chi2_circle = circle.chi2; + helix.chi2_line = line.chi2; + + return helix; + } + +} // namespace Rfit + +#endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h diff --git a/src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.cc b/src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.cc new file mode 100644 index 000000000..347636286 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.cc @@ -0,0 +1,110 @@ +#include "RiemannFitOnGPU.h" + +void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { + assert(tuples_d); + + // Fit internals + auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto fast_fit_resultsGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + auto circle_fit_resultsGPU_holder = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); + Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets + kernelFastFit<3>( + tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + + kernelCircleFit<3>(tupleMultiplicity_d, + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + kernelLineFit<3>(tupleMultiplicity_d, + 3, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + // quads + kernelFastFit<4>( + tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + + kernelCircleFit<4>(tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + 
fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + kernelLineFit<4>(tupleMultiplicity_d, + 4, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + if (fit5as4_) { + // penta + kernelFastFit<4>( + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + + kernelCircleFit<4>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + kernelLineFit<4>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + } else { + // penta all 5 + kernelFastFit<5>( + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + + kernelCircleFit<5>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + kernelLineFit<5>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + } + } +} diff --git a/src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.cu b/src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.cu new file mode 100644 index 000000000..fe27153ac --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.cu @@ -0,0 +1,131 @@ +#include "RiemannFitOnGPU.h" +#include "CUDACore/device_unique_ptr.h" + +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto circle_fit_resultsGPU_holder = + cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); + Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets + kernelFastFit<3><<>>( + tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFit<3><<>>(tupleMultiplicity_d, + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernelLineFit<3><<>>(tupleMultiplicity_d, + 3, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + // quads + kernelFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFit<4><<>>(tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), 
+ circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernelLineFit<4><<>>(tupleMultiplicity_d, + 4, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // penta + kernelFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFit<4><<>>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernelLineFit<4><<>>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } else { + // penta all 5 + kernelFastFit<5><<>>( + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFit<5><<>>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernelLineFit<5><<>>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } + } +} diff --git a/src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.h b/src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.h new file mode 100644 index 000000000..02766b557 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/RiemannFitOnGPU.h @@ -0,0 +1,187 @@ +// +// Author: Felice Pantaleo, CERN +// + +#include + +#include + +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/cuda_assert.h" +#include "CondFormats/pixelCPEforGPU.h" + +#include "RiemannFit.h" +#include "HelixFitOnGPU.h" + +using HitsOnGPU = TrackingRecHit2DSOAView; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; + +template +__global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, + CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t offset) { + constexpr uint32_t hitsInFit = N; + + assert(hitsInFit <= nHits); + + assert(pfast_fit); + assert(foundNtuplets); + assert(tupleMultiplicity); + + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + +#ifdef RIEMANN_DEBUG + if (0 == local_start) + printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); +#endif + + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + assert(tkid < foundNtuplets->nbins()); + + assert(foundNtuplets->size(tkid) == nHits); + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + + // Prepare data 
structure + auto const *hitId = foundNtuplets->begin(tkid); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); + float ge[6]; + hhp->cpeParams() + .detParams(hhp->detectorIndex(hit)) + .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + Rfit::Fast_fit(hits, fast_fit); + + // no NaN here.... + assert(fast_fit(0) == fast_fit(0)); + assert(fast_fit(1) == fast_fit(1)); + assert(fast_fit(2) == fast_fit(2)); + assert(fast_fit(3) == fast_fit(3)); + } +} + +template +__global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double B, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + Rfit::circle_fit *circle_fit, + uint32_t offset) { + assert(circle_fit); + assert(N <= nHits); + + // same as above... + + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit_input + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + + Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); + Rfit::loadCovariance2D(hits_ge, hits_cov); + + circle_fit[local_idx] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); + +#ifdef RIEMANN_DEBUG +// auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); +// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid, +// circle_fit[local_idx].par(0), circle_fit[local_idx].par(1), circle_fit[local_idx].par(2)); +#endif + } +} + +template +__global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double B, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + Rfit::circle_fit *__restrict__ circle_fit, + uint32_t offset) { + assert(results); + assert(circle_fit); + assert(N <= nHits); + + // same as above... 
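+
+  // Grid-stride loop over the fits handled by this launch: each thread starts
+  // from its global thread index and advances by the total number of threads
+  // in the grid. local_idx addresses the pre-allocated scratch buffers, while
+  // tuple_idx = local_idx + offset walks the bin of tuples with exactly nHits
+  // hits; the loop stops once that bin is exhausted.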
+ + // look in bin for this hit multiplicity + auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it for the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit_input + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + + auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_idx], fast_fit, B, true); + + Rfit::fromCircleToPerigee(circle_fit[local_idx]); + + results->stateAtBS.copyFromCircle( + circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(B), tkid); + results->pt(tkid) = B / std::abs(circle_fit[local_idx].par(2)); + results->eta(tkid) = asinhf(line_fit.par(0)); + results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); + +#ifdef RIEMANN_DEBUG + printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle_fit[local_idx].par(0), + circle_fit[local_idx].par(1), + circle_fit[local_idx].par(2)); + printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); + printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle_fit[local_idx].chi2, + line_fit.chi2, + circle_fit[local_idx].cov(0, 0), + circle_fit[local_idx].cov(1, 1), + circle_fit[local_idx].cov(2, 2), + line_fit.cov(0, 0), + line_fit.cov(1, 1)); +#endif + } +} diff --git a/src/cudacompat/plugin-PixelTriplets/choleskyInversion.h b/src/cudacompat/plugin-PixelTriplets/choleskyInversion.h new file mode 100644 index 000000000..2cb4105f8 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/choleskyInversion.h @@ -0,0 +1,349 @@ +#ifndef DataFormat_Math_choleskyInversion_h +#define DataFormat_Math_choleskyInversion_h + +#include + +#include + +/** + * fully inlined specialized code to perform the inversion of a + * positive defined matrix of rank up to 6. 
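+ *
+ * Example usage (an illustrative sketch; the matrix values are arbitrary):
+ * invert() dispatches on the compile-time matrix size to the matching
+ * invertNN() + symmetrizeNN() pair defined below.
+ *
+ *   Eigen::Matrix3d m;
+ *   m << 4.0, 1.0, 0.0,   // symmetric positive-definite input
+ *        1.0, 3.0, 0.5,
+ *        0.0, 0.5, 2.0;
+ *   Eigen::Matrix3d m_inv;
+ *   math::cholesky::invert(m, m_inv);  // here: invert33() followed by symmetrize33()
+ *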
+ * + * adapted from ROOT::Math::CholeskyDecomp + * originally by + * @author Manuel Schiller + * @date Aug 29 2008 + * + * + */ +namespace math { + namespace cholesky { + + template + inline constexpr void invert11(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + dst(0, 0) = F(1.0) / src(0, 0); + } + + template + inline constexpr void invert22(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0) * src(1, 0) * luc0; + auto luc2 = F(1.0) / (src(1, 1) - luc1); + + auto li21 = luc1 * luc0 * luc2; + + dst(0, 0) = li21 + luc0; + dst(1, 0) = -src(1, 0) * luc0 * luc2; + dst(1, 1) = luc2; + } + + template + inline constexpr void invert33(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); + luc5 = F(1.0) / luc5; + + auto li21 = -luc0 * luc1; + auto li32 = -(luc2 * luc4); + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + + dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; + dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; + dst(1, 1) = luc5 * li32 * li32 + luc2; + dst(2, 0) = luc5 * li31; + dst(2, 1) = luc5 * li32; + dst(2, 2) = luc5; + } + + template + inline constexpr void invert44(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + + dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc9 * li43 * li43 + luc5; + dst(3, 0) = luc9 * li41; + dst(3, 1) = luc9 * li42; + dst(3, 2) = luc9 * li43; + dst(3, 3) = luc9; + } + + template + inline constexpr void invert55(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * 
luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc14 * li54 * li54 + luc9; + dst(4, 0) = luc14 * li51; + dst(4, 1) = luc14 * li52; + dst(4, 2) = luc14 * li53; + dst(4, 3) = luc14 * li54; + dst(4, 4) = luc14; + } + + template + inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + auto luc15 = src(5, 0); + auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); + auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); + auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); + auto luc19 = + (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); + auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + + luc9 * luc18 * luc18 + luc14 * luc19 * luc19); + luc20 = F(1.0) / luc20; + + auto li21 = -luc1 * luc0; + auto li32 
= -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + auto li65 = -luc19 * luc14; + auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; + auto li63 = + (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; + auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - + luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + + luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * + luc2; + auto li61 = + (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + + luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + + luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - + luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - + luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - + luc19 * luc13 * luc6 * luc9 * luc14 + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + + luc19 * luc10 * luc14 - luc15) * + luc0; + + dst(0, 0) = luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; + dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; + dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; + dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; + dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; + dst(4, 4) = luc20 * li65 * li65 + luc14; + dst(5, 0) = luc20 * li61; + dst(5, 1) = luc20 * li62; + dst(5, 2) = luc20 * li63; + dst(5, 3) = luc20 * li64; + dst(5, 4) = luc20 * li65; + dst(5, 5) = luc20; + } + + template + inline constexpr void symmetrize11(M& dst) {} + + template + inline constexpr void symmetrize22(M& dst) { + dst(0, 1) = dst(1, 0); + } + + template + inline constexpr void symmetrize33(M& dst) { + symmetrize22(dst); + dst(0, 2) = dst(2, 0); + dst(1, 2) = dst(2, 1); + } + + template + inline constexpr void symmetrize44(M& dst) { + symmetrize33(dst); + dst(0, 3) = dst(3, 0); + dst(1, 3) = dst(3, 1); + 
dst(2, 3) = dst(3, 2); + } + + template + inline constexpr void symmetrize55(M& dst) { + symmetrize44(dst); + dst(0, 4) = dst(4, 0); + dst(1, 4) = dst(4, 1); + dst(2, 4) = dst(4, 2); + dst(3, 4) = dst(4, 3); + } + + template + inline constexpr void symmetrize66(M& dst) { + symmetrize55(dst); + dst(0, 5) = dst(5, 0); + dst(1, 5) = dst(5, 1); + dst(2, 5) = dst(5, 2); + dst(3, 5) = dst(5, 3); + dst(4, 5) = dst(5, 4); + } + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert22(src, dst); + symmetrize22(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert33(src, dst); + symmetrize33(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert44(src, dst); + symmetrize44(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert55(src, dst); + symmetrize55(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert66(src, dst); + symmetrize66(dst); + } + }; + + // Eigen interface + template + inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) { + using M1 = Eigen::DenseBase; + using M2 = Eigen::DenseBase; + Inverter::eval(src, dst); + } + + } // namespace cholesky +} // namespace math + +#endif // DataFormat_Math_choleskyInversion_h diff --git a/src/cudacompat/plugin-PixelTriplets/gpuFishbone.h b/src/cudacompat/plugin-PixelTriplets/gpuFishbone.h new file mode 100644 index 000000000..2e2446ea3 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/gpuFishbone.h @@ -0,0 +1,93 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h +#define RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h + +#include +#include +#include +#include +#include + +#include "DataFormats/approx_atan2.h" +#include "Geometry/phase1PixelTopology.h" +#include "CUDACore/VecArray.h" +#include "CUDACore/cuda_assert.h" + +#include "GPUCACell.h" + +namespace gpuPixelDoublets { + + // __device__ + // __forceinline__ + __global__ void fishbone(GPUCACell::Hits const* __restrict__ hhp, + GPUCACell* cells, + uint32_t const* __restrict__ nCells, + GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell, + uint32_t nHits, + bool checkTrack) { + constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; + + auto const& hh = *hhp; + // auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id); }; + + // x run faster... + auto firstY = threadIdx.y + blockIdx.y * blockDim.y; + auto firstX = threadIdx.x; + + float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit]; + uint16_t d[maxCellsPerHit]; // uint8_t l[maxCellsPerHit]; + uint32_t cc[maxCellsPerHit]; + + for (int idy = firstY, nt = nHits; idy < nt; idy += gridDim.y * blockDim.y) { + auto const& vc = isOuterHitOfCell[idy]; + auto s = vc.size(); + if (s < 2) + continue; + // if alligned kill one of the two. + // in principle one could try to relax the cut (only in r-z?) 
for jumping-doublets + auto const& c0 = cells[vc[0]]; + auto xo = c0.get_outer_x(hh); + auto yo = c0.get_outer_y(hh); + auto zo = c0.get_outer_z(hh); + auto sg = 0; + for (int32_t ic = 0; ic < s; ++ic) { + auto& ci = cells[vc[ic]]; + if (0 == ci.theUsed) + continue; // for triplets equivalent to next + if (checkTrack && ci.tracks().empty()) + continue; + cc[sg] = vc[ic]; + d[sg] = ci.get_inner_detIndex(hh); + // l[sg] = layer(d[sg]); + x[sg] = ci.get_inner_x(hh) - xo; + y[sg] = ci.get_inner_y(hh) - yo; + z[sg] = ci.get_inner_z(hh) - zo; + n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg]; + ++sg; + } + if (sg < 2) + continue; + // here we parallelize + for (int32_t ic = firstX; ic < sg - 1; ic += blockDim.x) { + auto& ci = cells[cc[ic]]; + for (auto jc = ic + 1; jc < sg; ++jc) { + auto& cj = cells[cc[jc]]; + // must be different detectors (in the same layer) + // if (d[ic]==d[jc]) continue; + // || l[ic]!=l[jc]) continue; + auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc]; + if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) { + // alligned: kill farthest (prefer consecutive layers) + if (n[ic] > n[jc]) { + ci.theDoubletId = -1; + break; + } else { + cj.theDoubletId = -1; + } + } + } //cj + } // ci + } // hits + } +} // namespace gpuPixelDoublets + +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h diff --git a/src/cudacompat/plugin-PixelTriplets/gpuPixelDoublets.h b/src/cudacompat/plugin-PixelTriplets/gpuPixelDoublets.h new file mode 100644 index 000000000..e906f85f1 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/gpuPixelDoublets.h @@ -0,0 +1,130 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h +#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h + +#include "gpuPixelDoubletsAlgos.h" + +#define CONSTANT_VAR __constant__ + +namespace gpuPixelDoublets { + + constexpr int nPairs = 13 + 2 + 4; + static_assert(nPairs <= CAConstants::maxNumberOfLayerPairs()); + + // start constants + // clang-format off + + CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = { + 0, 1, 0, 4, 0, 7, // BPIX1 (3) + 1, 2, 1, 4, 1, 7, // BPIX2 (5) + 4, 5, 7, 8, // FPIX1 (8) + 2, 3, 2, 4, 2, 7, 5, 6, 8, 9, // BPIX3 & FPIX2 (13) + 0, 2, 1, 3, // Jumping Barrel (15) + 0, 5, 0, 8, // Jumping Forward (BPIX1,FPIX2) + 4, 6, 7, 9 // Jumping Forward (19) + }; + + constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); + constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); + constexpr int16_t phi0p07 = 730; // round(730.12648...) 
= phi2short(0.07); + + CONSTANT_VAR const int16_t phicuts[nPairs]{phi0p05, + phi0p07, + phi0p07, + phi0p05, + phi0p06, + phi0p06, + phi0p05, + phi0p05, + phi0p06, + phi0p06, + phi0p06, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05}; + // phi0p07, phi0p07, phi0p06,phi0p06, phi0p06,phi0p06}; // relaxed cuts + + CONSTANT_VAR float const minz[nPairs] = { + -20., 0., -30., -22., 10., -30., -70., -70., -22., 15., -30, -70., -70., -20., -22., 0, -30., -70., -70.}; + CONSTANT_VAR float const maxz[nPairs] = { + 20., 30., 0., 22., 30., -10., 70., 70., 22., 30., -15., 70., 70., 20., 22., 30., 0., 70., 70.}; + CONSTANT_VAR float const maxr[nPairs] = { + 20., 9., 9., 20., 7., 7., 5., 5., 20., 6., 6., 5., 5., 20., 20., 9., 9., 9., 9.}; + + // end constants + // clang-format on + + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; + + __global__ void initDoublets(GPUCACell::OuterHitOfCell* isOuterHitOfCell, + int nHits, + CellNeighborsVector* cellNeighbors, + CellNeighbors* cellNeighborsContainer, + CellTracksVector* cellTracks, + CellTracks* cellTracksContainer) { + assert(isOuterHitOfCell); + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = first; i < nHits; i += gridDim.x * blockDim.x) + isOuterHitOfCell[i].reset(); + + if (0 == first) { + cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer); + cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer); + auto i = cellNeighbors->extend(); + assert(0 == i); + (*cellNeighbors)[0].reset(); + i = cellTracks->extend(); + assert(0 == i); + (*cellTracks)[0].reset(); + } + } + + constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y + constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; + + __global__ +#ifdef __CUDACC__ + __launch_bounds__(getDoubletsFromHistoMaxBlockSize, getDoubletsFromHistoMinBlocksPerMP) +#endif + void getDoubletsFromHisto(GPUCACell* cells, + uint32_t* nCells, + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + TrackingRecHit2DSOAView const* __restrict__ hhp, + GPUCACell::OuterHitOfCell* isOuterHitOfCell, + int nActualPairs, + bool ideal_cond, + bool doClusterCut, + bool doZ0Cut, + bool doPtCut, + uint32_t maxNumOfDoublets) { + auto const& __restrict__ hh = *hhp; + doubletsFromHisto(layerPairs, + nActualPairs, + cells, + nCells, + cellNeighbors, + cellTracks, + hh, + isOuterHitOfCell, + phicuts, + minz, + maxz, + maxr, + ideal_cond, + doClusterCut, + doZ0Cut, + doPtCut, + maxNumOfDoublets); + } + +} // namespace gpuPixelDoublets + +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h diff --git a/src/cudacompat/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h b/src/cudacompat/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h new file mode 100644 index 000000000..6d6a62c88 --- /dev/null +++ b/src/cudacompat/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h @@ -0,0 +1,244 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoubletsAlgos_h +#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoubletsAlgos_h + +#include +#include +#include +#include +#include + +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "DataFormats/approx_atan2.h" +#include "CUDACore/VecArray.h" +#include "CUDACore/cuda_assert.h" + +#include "CAConstants.h" +#include "GPUCACell.h" + +namespace gpuPixelDoublets { + + using 
CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; + + __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs, + uint32_t nPairs, + GPUCACell* cells, + uint32_t* nCells, + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + TrackingRecHit2DSOAView const& __restrict__ hh, + GPUCACell::OuterHitOfCell* isOuterHitOfCell, + int16_t const* __restrict__ phicuts, + float const* __restrict__ minz, + float const* __restrict__ maxz, + float const* __restrict__ maxr, + bool ideal_cond, + bool doClusterCut, + bool doZ0Cut, + bool doPtCut, + uint32_t maxNumOfDoublets) { + // ysize cuts (z in the barrel) times 8 + // these are used if doClusterCut is true + constexpr int minYsizeB1 = 36; + constexpr int minYsizeB2 = 28; + constexpr int maxDYsize12 = 28; + constexpr int maxDYsize = 20; + constexpr int maxDYPred = 20; + constexpr float dzdrFact = 8 * 0.0285 / 0.015; // from dz/dr to "DY" + + bool isOuterLadder = ideal_cond; + + using Hist = TrackingRecHit2DSOAView::Hist; + + auto const& __restrict__ hist = hh.phiBinner(); + uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); + assert(offsets); + + auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; }; + + // nPairsMax to be optimized later (originally was 64). + // If it should be much bigger, consider using a block-wide parallel prefix scan, + // e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html + const int nPairsMax = CAConstants::maxNumberOfLayerPairs(); + assert(nPairs <= nPairsMax); + __shared__ uint32_t innerLayerCumulativeSize[nPairsMax]; + __shared__ uint32_t ntot; + if (threadIdx.y == 0 && threadIdx.x == 0) { + innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); + for (uint32_t i = 1; i < nPairs; ++i) { + innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(layerPairs[2 * i]); + } + ntot = innerLayerCumulativeSize[nPairs - 1]; + } + __syncthreads(); + + // x runs faster + auto idy = blockIdx.y * blockDim.y + threadIdx.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + + uint32_t pairLayerId = 0; // cannot go backward + for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y) { + while (j >= innerLayerCumulativeSize[pairLayerId++]) + ; + --pairLayerId; // move to lower_bound ?? + + assert(pairLayerId < nPairs); + assert(j < innerLayerCumulativeSize[pairLayerId]); + assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]); + + uint8_t inner = layerPairs[2 * pairLayerId]; + uint8_t outer = layerPairs[2 * pairLayerId + 1]; + assert(outer > inner); + + auto hoff = Hist::histOff(outer); + + auto i = (0 == pairLayerId) ? 
j : j - innerLayerCumulativeSize[pairLayerId - 1]; + i += offsets[inner]; + + // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); + + assert(i >= offsets[inner]); + assert(i < offsets[inner + 1]); + + // found hit corresponding to our cuda thread, now do the job + auto mi = hh.detectorIndex(i); + if (mi > 2000) + continue; // invalid + + /* maybe clever, not effective when zoCut is on + auto bpos = (mi%8)/4; // if barrel is 1 for z>0 + auto fpos = (outer>3) & (outer<7); + if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue; + */ + + auto mez = hh.zGlobal(i); + + if (mez < minz[pairLayerId] || mez > maxz[pairLayerId]) + continue; + + int16_t mes = -1; // make compiler happy + if (doClusterCut) { + // if ideal treat inner ladder as outer + if (inner == 0) + assert(mi < 96); + isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... + + // in any case we always test mes>0 ... + mes = inner > 0 || isOuterLadder ? hh.clusterSizeY(i) : -1; + + if (inner == 0 && outer > 3) // B1 and F1 + if (mes > 0 && mes < minYsizeB1) + continue; // only long cluster (5*8) + if (inner == 1 && outer > 3) // B2 and F1 + if (mes > 0 && mes < minYsizeB2) + continue; + } + auto mep = hh.iphi(i); + auto mer = hh.rGlobal(i); + + // all cuts: true if fails + constexpr float z0cut = 12.f; // cm + constexpr float hardPtCut = 0.5f; // GeV + constexpr float minRadius = + hardPtCut * 87.78f; // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) + constexpr float minRadius2T4 = 4.f * minRadius * minRadius; + auto ptcut = [&](int j, int16_t idphi) { + auto r2t4 = minRadius2T4; + auto ri = mer; + auto ro = hh.rGlobal(j); + auto dphi = short2phi(idphi); + return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); + }; + auto z0cutoff = [&](int j) { + auto zo = hh.zGlobal(j); + auto ro = hh.rGlobal(j); + auto dr = ro - mer; + return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; + }; + + auto zsizeCut = [&](int j) { + auto onlyBarrel = outer < 4; + auto so = hh.clusterSizeY(j); + auto dy = inner == 0 ? maxDYsize12 : maxDYsize; + // in the barrel cut on difference in size + // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well) + // FIXME move pred cut to z0cutoff to optmize loading of and computaiton ... + auto zo = hh.zGlobal(j); + auto ro = hh.rGlobal(j); + return onlyBarrel ? 
mes > 0 && so > 0 && std::abs(so - mes) > dy + : (inner < 4) && mes > 0 && + std::abs(mes - int(std::abs((mez - zo) / (mer - ro)) * dzdrFact + 0.5f)) > maxDYPred; + }; + + auto iphicut = phicuts[pairLayerId]; + + auto kl = Hist::bin(int16_t(mep - iphicut)); + auto kh = Hist::bin(int16_t(mep + iphicut)); + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + // bool piWrap = std::abs(kh-kl) > Hist::nbins()/2; + +#ifdef GPU_DEBUG + int tot = 0; + int nmin = 0; + int tooMany = 0; +#endif + + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { +#ifdef GPU_DEBUG + if (kk != kl && kk != kh) + nmin += hist.size(kk + hoff); +#endif + auto const* __restrict__ p = hist.begin(kk + hoff); + auto const* __restrict__ e = hist.end(kk + hoff); + p += first; + for (; p < e; p += stride) { + auto oi = __ldg(p); + assert(oi >= offsets[outer]); + assert(oi < offsets[outer + 1]); + auto mo = hh.detectorIndex(oi); + if (mo > 2000) + continue; // invalid + + if (doZ0Cut && z0cutoff(oi)) + continue; + + auto mop = hh.iphi(oi); + uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); + if (idphi > iphicut) + continue; + + if (doClusterCut && zsizeCut(oi)) + continue; + if (doPtCut && ptcut(oi, idphi)) + continue; + + auto ind = atomicAdd(nCells, 1); + if (ind >= maxNumOfDoublets) { + atomicSub(nCells, 1); + break; + } // move to SimpleVector?? + // int layerPairId, int doubletId, int innerHitId, int outerHitId) + cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, ind, i, oi); + isOuterHitOfCell[oi].push_back(ind); +#ifdef GPU_DEBUG + if (isOuterHitOfCell[oi].full()) + ++tooMany; + ++tot; +#endif + } + } +#ifdef GPU_DEBUG + if (tooMany > 0) + printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); +#endif + } // loop in block... 
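// Illustrative sketch (not part of this commit): the z0cutoff lambda above
// rejects a doublet when the straight line through the two hits, extrapolated
// back to r = 0, misses the beam line by more than z0cut (12 cm).  For hits
// (ri, zi) and (ro, zo) the intercept is z0 = (zi*ro - zo*ri) / (ro - ri),
// and the kernel keeps the comparison in multiplied-out form,
// |zi*ro - zo*ri| > z0cut * (ro - ri), which avoids the division.  The real
// lambda additionally caps dr = ro - ri at the layer-pair dependent maxr;
// the standalone sketch below (names are illustrative only) keeps just the
// intercept part.

#include <cassert>
#include <cmath>

// true if the hit pair fails the beam-line intercept cut
bool failsZ0Cut(float ri, float zi, float ro, float zo, float z0cut = 12.f) {
  float dr = ro - ri;          // outer hit must sit at larger radius
  if (dr <= 0.f)
    return true;
  return std::abs(zi * ro - zo * ri) > z0cut * dr;
}

int main() {
  // both hits on the line z = 0.5 * r, so z0 = 0: the pair passes
  assert(!failsZ0Cut(3.f, 1.5f, 7.f, 3.5f));
  // same outer hit, inner hit shifted far in z: z0 ~ +25 cm, well outside 12 cm
  assert(failsZ0Cut(3.f, 16.f, 7.f, 3.5f));
  return 0;
}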
+ } + +} // namespace gpuPixelDoublets + +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h diff --git a/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc new file mode 100644 index 000000000..15e3c486e --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc @@ -0,0 +1,90 @@ +#include + +#include "CUDACore/Product.h" +#include "Framework/EventSetup.h" +#include "Framework/Event.h" +#include "Framework/PluginFactory.h" +#include "Framework/EDProducer.h" +#include "Framework/RunningAverage.h" +#include "CUDACore/ScopedContext.h" + +#include "gpuVertexFinder.h" + +class PixelVertexProducerCUDA : public edm::EDProducer { +public: + explicit PixelVertexProducerCUDA(edm::ProductRegistry& reg); + ~PixelVertexProducerCUDA() override = default; + +private: + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + bool m_OnGPU; + + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDPutTokenT tokenGPUVertex_; + edm::EDGetTokenT tokenCPUTrack_; + edm::EDPutTokenT tokenCPUVertex_; + + const gpuVertexFinder::Producer m_gpuAlgo; + + // Tracking cuts before sending tracks to vertex algo + const float m_ptMin; +}; + +PixelVertexProducerCUDA::PixelVertexProducerCUDA(edm::ProductRegistry& reg) + : m_OnGPU(true), + m_gpuAlgo(true, // oneKernel + true, // useDensity + false, // useDBSCAN + false, // useIterative + 2, // minT + 0.07, // eps + 0.01, // errmax + 9 // chi2max + ), + m_ptMin(0.5) // 0.5 GeV +{ + if (m_OnGPU) { + tokenGPUTrack_ = reg.consumes>(); + tokenGPUVertex_ = reg.produces(); + } else { + tokenCPUTrack_ = reg.consumes(); + tokenCPUVertex_ = reg.produces(); + } +} + +void PixelVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + if (m_OnGPU) { + auto const& ptracks = iEvent.get(tokenGPUTrack_); + + cms::cuda::ScopedContextProduce ctx{ptracks}; + auto const* tracks = ctx.get(ptracks).get(); + + assert(tracks); + + ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin)); + + } else { + auto const* tracks = iEvent.get(tokenCPUTrack_).get(); + assert(tracks); + + /* + auto const & tsoa = *tracks; + auto maxTracks = tsoa.stride(); + std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + assert(nHits==int(tsoa.hitIndices.size(it))); + if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... 
+ nt++; + } + std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; + */ + + iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin)); + } +} + +DEFINE_FWK_MODULE(PixelVertexProducerCUDA); diff --git a/src/cudacompat/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc b/src/cudacompat/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc new file mode 100644 index 000000000..d709f0c5e --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc @@ -0,0 +1,49 @@ +#include + +#include "CUDACore/Product.h" +#include "CUDACore/HostProduct.h" +#include "CUDADataFormats/ZVertexHeterogeneous.h" +#include "Framework/EventSetup.h" +#include "Framework/Event.h" +#include "Framework/PluginFactory.h" +#include "Framework/EDProducer.h" +#include "Framework/RunningAverage.h" +#include "CUDACore/ScopedContext.h" + +class PixelVertexSoAFromCUDA : public edm::EDProducerExternalWork { +public: + explicit PixelVertexSoAFromCUDA(edm::ProductRegistry& reg); + ~PixelVertexSoAFromCUDA() override = default; + +private: + void acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; + + cms::cuda::host::unique_ptr m_soa; +}; + +PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(edm::ProductRegistry& reg) + : tokenCUDA_(reg.consumes>()), + tokenSOA_(reg.produces()) {} + +void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + auto const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + m_soa = inputData.toHostAsync(ctx.stream()); +} + +void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // No copies.... + iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); +} + +DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h b/src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h new file mode 100644 index 000000000..201971770 --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h @@ -0,0 +1,234 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h + +#include +#include +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + // this algo does not really scale as it works in a single block... 
+ // enough for <10K tracks we have + // + // based on Rodrighez&Laio algo + // + __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + using namespace gpuVertexFinder; + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + if (verbose && 0 == threadIdx.x) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + + auto er2mx = errmax * errmax; + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + uint8_t* __restrict__ izt = ws.izt; + int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter hws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + if (verbose && 0 == threadIdx.x) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + + assert(nt <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + assert(i < ZVertices::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only + iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + izt[i] = iz - INT8_MIN; + assert(iz - INT8_MIN >= 0); + assert(iz - INT8_MIN < 256); + hist.count(izt[i]); + iv[i] = i; + nn[i] = 0; + } + __syncthreads(); + if (threadIdx.x < 32) + hws[threadIdx.x] = 0; // used by prefix scan... + __syncthreads(); + hist.finalize(hws); + __syncthreads(); + assert(hist.size() == nt); + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i], uint16_t(i)); + } + __syncthreads(); + + // count neighbours + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }; + + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + + // find closest above me .... (we ignore the possibility of two j at same distance from i) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // (break natural order???) + mdist = dist; + iv[i] = j; // assign to cluster (better be unique??) 
+ }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + +#ifdef GPU_DEBUG + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } + __syncthreads(); +#endif + + // consolidate graph (percolate index of seed) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + +#ifdef GPU_DEBUG + __syncthreads(); + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } +#endif + +#ifdef GPU_DEBUG + // and verify that we did not spit any cluster... + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto minJ = i; + auto mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + mdist = dist; + minJ = j; + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + // should belong to the same cluster... + assert(iv[i] == iv[minJ]); + assert(nn[i] <= nn[iv[i]]); + } + __syncthreads(); +#endif + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold; + // mark these tracks with a negative id. + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = atomicInc(&foundClusters, 0xffffffff); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; + } + } + } + __syncthreads(); + + assert(foundClusters < ZVertices::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a positive value starting from 0 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto vertices\n", foundClusters); + } + + __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h b/src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h new file mode 100644 index 000000000..504ee4cf2 --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h @@ -0,0 +1,242 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h + +#include +#include +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + // this algo does not really scale as it works in a single block... + // enough for <10K tracks we have + __global__ void clusterTracksDBSCAN(ZVertices* pdata, + WorkSpace* pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + if (verbose && 0 == threadIdx.x) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + + auto er2mx = errmax * errmax; + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + uint8_t* __restrict__ izt = ws.izt; + int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter hws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + if (verbose && 0 == threadIdx.x) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + + assert(nt <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + assert(i < ZVertices::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only + iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + izt[i] = iz - INT8_MIN; + assert(iz - INT8_MIN >= 0); + assert(iz - INT8_MIN < 256); + hist.count(izt[i]); + iv[i] = i; + nn[i] = 0; + } + __syncthreads(); + if (threadIdx.x < 32) + hws[threadIdx.x] = 0; // used by 
prefix scan... + __syncthreads(); + hist.finalize(hws); + __syncthreads(); + assert(hist.size() == nt); + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i], uint16_t(i)); + } + __syncthreads(); + + // count neighbours + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + nn[i]++; + }; + + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + + // find NN with smaller z... + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (nn[i] < minT) + continue; // DBSCAN core rule + float mz = zt[i]; + auto loop = [&](uint32_t j) { + if (zt[j] >= mz) + return; + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + mz = zt[j]; + iv[i] = j; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + +#ifdef GPU_DEBUG + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } + __syncthreads(); +#endif + + // consolidate graph (percolate index of seed) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + + __syncthreads(); + +#ifdef GPU_DEBUG + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } + __syncthreads(); +#endif + +#ifdef GPU_DEBUG + // and verify that we did not spit any cluster... + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (nn[i] < minT) + continue; // DBSCAN core rule + assert(zt[iv[i]] <= zt[i]); + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + // they should belong to the same cluster, isn't it? + if (iv[i] != iv[j]) { + printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]); + printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]); + ; + } + assert(iv[i] == iv[j]); + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + __syncthreads(); +#endif + + // collect edges (assign to closest cluster of closest point??? here to closest point) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a tracks with clus[i] == i; + // mark these tracks with a negative id. 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = atomicInc(&foundClusters, 0xffffffff); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; + } + } + } + __syncthreads(); + + assert(foundClusters < ZVertices::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a positive value starting from 0 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto vertices\n", foundClusters); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksIterative.h b/src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksIterative.h new file mode 100644 index 000000000..6e7da0efd --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuClusterTracksIterative.h @@ -0,0 +1,213 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h + +#include +#include +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + // this algo does not really scale as it works in a single block... + // enough for <10K tracks we have + __global__ void clusterTracksIterative(ZVertices* pdata, + WorkSpace* pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + if (verbose && 0 == threadIdx.x) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + + auto er2mx = errmax * errmax; + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + uint8_t* __restrict__ izt = ws.izt; + int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter hws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + if (verbose && 0 == threadIdx.x) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + + assert(nt <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + assert(i < ZVertices::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only + iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + izt[i] = iz - INT8_MIN; + assert(iz - INT8_MIN >= 0); + assert(iz - INT8_MIN < 256); + hist.count(izt[i]); + iv[i] = i; + nn[i] = 0; + } + __syncthreads(); + if (threadIdx.x < 32) + hws[threadIdx.x] = 0; // used by prefix scan... 
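// Illustrative sketch (not part of this commit): all three clustering kernels
// quantize the track z position the same way before histogramming.  z (in cm)
// is scaled by 10, so one bin is 1 mm (hence the "valid if eps<=0.1" comment),
// clamped to the int8 range, and shifted by -INT8_MIN so it fits the uint8_t
// izt array used to fill the shared histogram.  A standalone version of that
// binning (the helper name is illustrative only):

#include <algorithm>
#include <cassert>
#include <cstdint>

uint8_t zBin(float z_cm) {
  int iz = int(z_cm * 10.f);                        // 1 mm bins
  iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);  // clamp to [-128, 127]
  return uint8_t(iz - INT8_MIN);                    // shift to [0, 255]
}

int main() {
  assert(zBin(0.f) == 128);     // z = 0 lands in the middle of the range
  assert(zBin(-20.f) == 0);     // anything below -12.8 cm saturates low
  assert(zBin(20.f) == 255);    // anything above +12.7 cm saturates high
  assert(zBin(0.25f) == 130);   // 2.5 mm -> two bins above the centre
  return 0;
}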
+ __syncthreads(); + hist.finalize(hws); + __syncthreads(); + assert(hist.size() == nt); + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i], uint16_t(i)); + } + __syncthreads(); + + // count neighbours + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }; + + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __shared__ int nloops; + nloops = 0; + + __syncthreads(); + + // cluster seeds only + bool more = true; + while (__syncthreads_or(more)) { + if (1 == nloops % 2) { + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + } else { + more = false; + for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) { + auto p = hist.begin() + k; + auto i = (*p); + auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1)); + if (nn[i] < minT) + continue; // DBSCAN core rule + auto loop = [&](uint32_t j) { + assert(i != j); + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + auto old = atomicMin(&iv[j], iv[i]); + if (old != iv[i]) { + // end the loop only if no changes were applied + more = true; + } + atomicMin(&iv[i], old); + }; + ++p; + for (; p < hist.end(be); ++p) + loop(*p); + } // for i + } + if (threadIdx.x == 0) + ++nloops; + } // while + + // collect edges (assign to closest cluster of closest point??? here to closest point) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + auto loop = [&](int j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a tracks with clus[i] == i; + // mark these tracks with a negative id. + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = atomicInc(&foundClusters, 0xffffffff); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; + } + } + } + __syncthreads(); + + assert(foundClusters < ZVertices::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a positive value starting from 0 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto vertices\n", foundClusters); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuFitVertices.h b/src/cudacompat/plugin-PixelVertexFinding/gpuFitVertices.h new file mode 100644 index 000000000..3840a3f99 --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuFitVertices.h @@ -0,0 +1,113 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h + +#include +#include +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + __device__ __forceinline__ void fitVertices(ZVertices* pdata, + WorkSpace* pws, + float chi2Max // for outlier rejection + ) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + float* __restrict__ zv = data.zv; + float* __restrict__ wv = data.wv; + float* __restrict__ chi2 = data.chi2; + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + assert(nvFinal <= nvIntermediate); + nvFinal = nvIntermediate; + auto foundClusters = nvFinal; + + // zero + for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) { + zv[i] = 0; + wv[i] = 0; + chi2[i] = 0; + } + + // only for test + __shared__ int noise; + if (verbose && 0 == threadIdx.x) + noise = 0; + + __syncthreads(); + + // compute cluster location + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] > 9990) { + if (verbose) + atomicAdd(&noise, 1); + continue; + } + assert(iv[i] >= 0); + assert(iv[i] < int(foundClusters)); + auto w = 1.f / ezt2[i]; + atomicAdd(&zv[iv[i]], zt[i] * w); + atomicAdd(&wv[iv[i]], w); + } + + __syncthreads(); + // reuse nn + for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) { + assert(wv[i] > 0.f); + zv[i] /= wv[i]; + nn[i] = -1; // ndof + } + __syncthreads(); + + // compute chi2 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] > 9990) + continue; + + auto c2 = zv[iv[i]] - zt[i]; + c2 *= c2 / ezt2[i]; + if (c2 > chi2Max) { + iv[i] = 9999; + continue; + } + atomicAdd(&chi2[iv[i]], c2); + atomicAdd(&nn[iv[i]], 1); + } + __syncthreads(); + for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) + if (nn[i] > 0) + wv[i] *= float(nn[i]) / chi2[i]; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto clusters ", foundClusters); + if (verbose && 0 == threadIdx.x) + printf("and %d noise\n", noise); + } + + __global__ void fitVerticesKernel(ZVertices* pdata, + WorkSpace* pws, + float chi2Max // for outlier rejection + ) { + fitVertices(pdata, pws, chi2Max); + } + +} // namespace gpuVertexFinder + +#endif // 
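// Illustrative sketch (not part of this commit): fitVertices above computes,
// per vertex, the inverse-variance weighted mean of the associated track z
// positions, zv = sum_i(z_i / sigma_i^2) / sum_i(1 / sigma_i^2), flags tracks
// whose individual contribution (zv - z_i)^2 / sigma_i^2 exceeds chi2Max, and
// finally rescales the accumulated weight wv by ndof/chi2.  A serial sketch
// of the weighted mean and chi2 for one vertex (names are illustrative only):

#include <cassert>
#include <cmath>
#include <vector>

struct Track {
  float z;     // z at the beam spot, cm
  float ezt2;  // squared error on z
};

// weighted mean and total chi2 of the tracks assigned to one vertex
void fitOneVertex(std::vector<Track> const& trks, float& zv, float& chi2) {
  float sumzw = 0.f, sumw = 0.f;
  for (auto const& t : trks) {
    float w = 1.f / t.ezt2;    // inverse-variance weight
    sumzw += t.z * w;
    sumw += w;
  }
  zv = sumzw / sumw;
  chi2 = 0.f;
  for (auto const& t : trks)
    chi2 += (zv - t.z) * (zv - t.z) / t.ezt2;
}

int main() {
  std::vector<Track> trks = {{1.0f, 0.01f}, {1.1f, 0.01f}, {0.9f, 0.04f}};
  float zv = 0.f, chi2 = 0.f;
  fitOneVertex(trks, zv, chi2);
  assert(std::abs(zv - 1.0333333f) < 1e-4f);  // (100 + 110 + 22.5) / (100 + 100 + 25)
  assert(chi2 > 0.f);
  return 0;
}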
RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuSortByPt2.h b/src/cudacompat/plugin-PixelVertexFinding/gpuSortByPt2.h new file mode 100644 index 000000000..9fa98f9e4 --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuSortByPt2.h @@ -0,0 +1,73 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h + +#include +#include +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cuda_assert.h" +#ifdef __CUDA_ARCH__ +#include "CUDACore/radixSort.h" +#endif + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ ptt2 = ws.ptt2; + uint32_t const& nvFinal = data.nvFinal; + + int32_t const* __restrict__ iv = ws.iv; + float* __restrict__ ptv2 = data.ptv2; + uint16_t* __restrict__ sortInd = data.sortInd; + + // if (threadIdx.x == 0) + // printf("sorting %d vertices\n",nvFinal); + + if (nvFinal < 1) + return; + + // fill indexing + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + data.idv[ws.itrk[i]] = iv[i]; + } + + // can be done asynchronoisly at the end of previous event + for (auto i = threadIdx.x; i < nvFinal; i += blockDim.x) { + ptv2[i] = 0; + } + __syncthreads(); + + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] > 9990) + continue; + atomicAdd(&ptv2[iv[i]], ptt2[i]); + } + __syncthreads(); + + if (1 == nvFinal) { + if (threadIdx.x == 0) + sortInd[0] = 0; + return; + } +#ifdef __CUDA_ARCH__ + __shared__ uint16_t sws[1024]; + // sort using only 16 bits + radixSort(ptv2, sortInd, sws, nvFinal); +#else + for (uint16_t i = 0; i < nvFinal; ++i) + sortInd[i] = i; + std::sort(sortInd, sortInd + nvFinal, [&](auto i, auto j) { return ptv2[i] < ptv2[j]; }); +#endif + } + + __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuSplitVertices.h b/src/cudacompat/plugin-PixelVertexFinding/gpuSplitVertices.h new file mode 100644 index 000000000..7c779b75b --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuSplitVertices.h @@ -0,0 +1,139 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h + +#include +#include +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + float* __restrict__ zv = data.zv; + float* __restrict__ wv = data.wv; + float const* __restrict__ chi2 = data.chi2; + uint32_t& nvFinal = data.nvFinal; + + int32_t const* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + // one vertex per block + for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) { + if (nn[kv] < 4) + continue; + if (chi2[kv] 
< maxChi2 * float(nn[kv])) + continue; + + constexpr int MAXTK = 512; + assert(nn[kv] < MAXTK); + if (nn[kv] >= MAXTK) + continue; // too bad FIXME + __shared__ uint32_t it[MAXTK]; // track index + __shared__ float zz[MAXTK]; // z pos + __shared__ uint8_t newV[MAXTK]; // 0 or 1 + __shared__ float ww[MAXTK]; // z weight + + __shared__ uint32_t nq; // number of track for this vertex + nq = 0; + __syncthreads(); + + // copy to local + for (auto k = threadIdx.x; k < nt; k += blockDim.x) { + if (iv[k] == int(kv)) { + auto old = atomicInc(&nq, MAXTK); + zz[old] = zt[k] - zv[kv]; + newV[old] = zz[old] < 0 ? 0 : 1; + ww[old] = 1.f / ezt2[k]; + it[old] = k; + } + } + + __shared__ float znew[2], wnew[2]; // the new vertices + + __syncthreads(); + assert(int(nq) == nn[kv] + 1); + + int maxiter = 20; + // kt-min.... + bool more = true; + while (__syncthreads_or(more)) { + more = false; + if (0 == threadIdx.x) { + znew[0] = 0; + znew[1] = 0; + wnew[0] = 0; + wnew[1] = 0; + } + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + auto i = newV[k]; + atomicAdd(&znew[i], zz[k] * ww[k]); + atomicAdd(&wnew[i], ww[k]); + } + __syncthreads(); + if (0 == threadIdx.x) { + znew[0] /= wnew[0]; + znew[1] /= wnew[1]; + } + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + auto d0 = fabs(zz[k] - znew[0]); + auto d1 = fabs(zz[k] - znew[1]); + auto newer = d0 < d1 ? 0 : 1; + more |= newer != newV[k]; + newV[k] = newer; + } + --maxiter; + if (maxiter <= 0) + more = false; + } + + // avoid empty vertices + if (0 == wnew[0] || 0 == wnew[1]) + continue; + + // quality cut + auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]); + + auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]); + + if (verbose && 0 == threadIdx.x) + printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]); + + if (chi2Dist < 4) + continue; + + // get a new global vertex + __shared__ uint32_t igv; + if (0 == threadIdx.x) + igv = atomicAdd(&ws.nvIntermediate, 1); + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + if (1 == newV[k]) + iv[it[k]] = igv; + } + + } // loop on vertices + } + + __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + splitVertices(pdata, pws, maxChi2); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.cc b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.cc new file mode 100644 index 000000000..084763385 --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.cc @@ -0,0 +1 @@ +#include "gpuVertexFinderImpl.h" diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.cu b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.cu new file mode 100644 index 000000000..084763385 --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.cu @@ -0,0 +1 @@ +#include "gpuVertexFinderImpl.h" diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.h b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.h new file mode 100644 index 000000000..d42a5d93a --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinder.h @@ -0,0 +1,83 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h + +#include +#include + +#include "CUDADataFormats/ZVertexHeterogeneous.h" + +namespace gpuVertexFinder { + + using ZVertices = 
ZVertexSoA; + using TkSoA = pixelTrack::TrackSoA; + + // workspace used in the vertex reco algos + struct WorkSpace { + static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; + static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; + + uint32_t ntrks; // number of "selected tracks" + uint16_t itrk[MAXTRACKS]; // index of original track + float zt[MAXTRACKS]; // input track z at bs + float ezt2[MAXTRACKS]; // input error^2 on the above + float ptt2[MAXTRACKS]; // input pt^2 on the above + uint8_t izt[MAXTRACKS]; // interized z-position of input tracks + int32_t iv[MAXTRACKS]; // vertex index for each associated track + + uint32_t nvIntermediate; // the number of vertices after splitting pruning etc. + + __host__ __device__ void init() { + ntrks = 0; + nvIntermediate = 0; + } + }; + + __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) { + pdata->init(); + pws->init(); + } + + class Producer { + public: + using ZVertices = ZVertexSoA; + using WorkSpace = gpuVertexFinder::WorkSpace; + using TkSoA = pixelTrack::TrackSoA; + + Producer(bool oneKernel, + bool useDensity, + bool useDBSCAN, + bool useIterative, + int iminT, // min number of neighbours to be "core" + float ieps, // max absolute distance to cluster + float ierrmax, // max error to be "seed" + float ichi2max // max normalized distance to cluster + ) + : oneKernel_(oneKernel && !(useDBSCAN || useIterative)), + useDensity_(useDensity), + useDBSCAN_(useDBSCAN), + useIterative_(useIterative), + minT(iminT), + eps(ieps), + errmax(ierrmax), + chi2max(ichi2max) {} + + ~Producer() = default; + + ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const; + + private: + const bool oneKernel_; + const bool useDensity_; + const bool useDBSCAN_; + const bool useIterative_; + + int minT; // min number of neighbours to be "core" + float eps; // max absolute distance to cluster + float errmax; // max error to be "seed" + float chi2max; // max normalized distance to cluster + }; + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinderImpl.h b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinderImpl.h new file mode 100644 index 000000000..f3260cad7 --- /dev/null +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinderImpl.h @@ -0,0 +1,173 @@ +#include "CUDACore/cudaCheck.h" + +#include "gpuClusterTracksByDensity.h" +#include "gpuClusterTracksDBSCAN.h" +#include "gpuClusterTracksIterative.h" +#include "gpuFitVertices.h" +#include "gpuSortByPt2.h" +#include "gpuSplitVertices.h" + +namespace gpuVertexFinder { + + __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) { + assert(ptracks); + assert(soa); + auto const& tracks = *ptracks; + auto const& fit = tracks.stateAtBS; + auto const* quality = tracks.qualityData(); + + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = tracks.nHits(idx); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... + + // initialize soa... 
+ soa->idv[idx] = -1; + + if (nHits < 4) + continue; // no triplets + if (quality[idx] != trackQuality::loose) + continue; + + auto pt = tracks.pt(idx); + + if (pt < ptMin) + continue; + + auto& data = *pws; + auto it = atomicAdd(&data.ntrks, 1); + data.itrk[it] = idx; + data.zt[it] = tracks.zip(idx); + data.ezt2[it] = fit.covariance(idx)(14); + data.ptt2[it] = pt * pt; + } + } + +// #define THREE_KERNELS +#ifndef THREE_KERNELS + __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, 50.); + __syncthreads(); + splitVertices(pdata, pws, 9.f); + __syncthreads(); + fitVertices(pdata, pws, 5000.); + __syncthreads(); + sortByPt2(pdata, pws); + } +#else + __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, 50.); + } + + __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + fitVertices(pdata, pws, 5000.); + __syncthreads(); + sortByPt2(pdata, pws); + } +#endif + +#ifdef __CUDACC__ + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { + // std::cout << "producing Vertices on GPU" << std::endl; + ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); +#else + ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { + // std::cout << "producing Vertices on CPU" << std::endl; + ZVertexHeterogeneous vertices(std::make_unique()); +#endif + assert(tksoa); + auto* soa = vertices.get(); + assert(soa); + +#ifdef __CUDACC__ + auto ws_d = cms::cuda::make_device_unique(stream); +#else + auto ws_d = std::make_unique(); +#endif + +#ifdef __CUDACC__ + init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + auto blockSize = 128; + auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; + loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); + cudaCheck(cudaGetLastError()); +#else + cms::cudacompat::resetGrid(); + init(soa, ws_d.get()); + loadTracks(tksoa, soa, ws_d.get(), ptMin); +#endif + +#ifdef __CUDACC__ + if (oneKernel_) { + // implemented only for density clustesrs +#ifndef THREE_KERNELS + vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); +#else + vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudaCheck(cudaGetLastError()); + // one block per vertex... 
+ splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); + cudaCheck(cudaGetLastError()); + vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); +#endif + } else { // five kernels + if (useDensity_) { + clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useDBSCAN_) { + clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useIterative_) { + clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } + cudaCheck(cudaGetLastError()); + fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.); + cudaCheck(cudaGetLastError()); + // one block per vertex... + splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); + cudaCheck(cudaGetLastError()); + fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.); + cudaCheck(cudaGetLastError()); + sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); + } + cudaCheck(cudaGetLastError()); +#else // __CUDACC__ + if (useDensity_) { + clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useDBSCAN_) { + clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useIterative_) { + clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + } + // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; + fitVertices(soa, ws_d.get(), 50.); + // one block per vertex! + blockIdx.x = 0; + gridDim.x = 1; + splitVertices(soa, ws_d.get(), 9.f); + resetGrid(); + fitVertices(soa, ws_d.get(), 5000.); + sortByPt2(soa, ws_d.get()); +#endif + + return vertices; + } + +} // namespace gpuVertexFinder + +#undef FROM diff --git a/src/cudacompat/plugin-SiPixelClusterizer/ErrorChecker.cc b/src/cudacompat/plugin-SiPixelClusterizer/ErrorChecker.cc new file mode 100644 index 000000000..26e03d3b4 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/ErrorChecker.cc @@ -0,0 +1,91 @@ +#include "ErrorChecker.h" + +#include "DataFormats/FEDHeader.h" +#include "DataFormats/FEDTrailer.h" + +#include +#include +#include + +namespace { + constexpr int CRC_bits = 1; + constexpr int LINK_bits = 6; + constexpr int ROC_bits = 5; + constexpr int DCOL_bits = 5; + constexpr int PXID_bits = 8; + constexpr int ADC_bits = 8; + constexpr int OMIT_ERR_bits = 1; + + constexpr int CRC_shift = 2; + constexpr int ADC_shift = 0; + constexpr int PXID_shift = ADC_shift + ADC_bits; + constexpr int DCOL_shift = PXID_shift + PXID_bits; + constexpr int ROC_shift = DCOL_shift + DCOL_bits; + constexpr int LINK_shift = ROC_shift + ROC_bits; + constexpr int OMIT_ERR_shift = 20; + + constexpr uint32_t dummyDetId = 0xffffffff; + + constexpr ErrorChecker::Word64 CRC_mask = ~(~ErrorChecker::Word64(0) << CRC_bits); + constexpr ErrorChecker::Word32 ERROR_mask = ~(~ErrorChecker::Word32(0) << ROC_bits); + constexpr ErrorChecker::Word32 LINK_mask = ~(~ErrorChecker::Word32(0) << LINK_bits); + constexpr ErrorChecker::Word32 ROC_mask = ~(~ErrorChecker::Word32(0) << ROC_bits); + constexpr ErrorChecker::Word32 OMIT_ERR_mask = ~(~ErrorChecker::Word32(0) << OMIT_ERR_bits); +} // namespace + +ErrorChecker::ErrorChecker() { includeErrors = false; } + +bool ErrorChecker::checkCRC(bool& errorsInEvent, int fedId, const Word64* trailer, Errors& errors) { + int CRC_BIT = (*trailer >> CRC_shift) & CRC_mask; + if (CRC_BIT == 0) + return true; + errorsInEvent = true; + if (includeErrors) { + int errorType = 39; + 
SiPixelRawDataError error(*trailer, errorType, fedId); + errors[dummyDetId].push_back(error); + } + return false; +} + +bool ErrorChecker::checkHeader(bool& errorsInEvent, int fedId, const Word64* header, Errors& errors) { + FEDHeader fedHeader(reinterpret_cast(header)); + if (!fedHeader.check()) + return false; // throw exception? + if (fedHeader.sourceID() != fedId) { + std::cout << "PixelDataFormatter::interpretRawData, fedHeader.sourceID() != fedId" + << ", sourceID = " << fedHeader.sourceID() << ", fedId = " << fedId << ", errorType = 32" << std::endl; + errorsInEvent = true; + if (includeErrors) { + int errorType = 32; + SiPixelRawDataError error(*header, errorType, fedId); + errors[dummyDetId].push_back(error); + } + } + return fedHeader.moreHeaders(); +} + +bool ErrorChecker::checkTrailer( + bool& errorsInEvent, int fedId, unsigned int nWords, const Word64* trailer, Errors& errors) { + FEDTrailer fedTrailer(reinterpret_cast(trailer)); + if (!fedTrailer.check()) { + if (includeErrors) { + int errorType = 33; + SiPixelRawDataError error(*trailer, errorType, fedId); + errors[dummyDetId].push_back(error); + } + errorsInEvent = true; + std::cout << "fedTrailer.check failed, Fed: " << fedId << ", errorType = 33" << std::endl; + return false; + } + if (fedTrailer.fragmentLength() != nWords) { + std::cout << "fedTrailer.fragmentLength()!= nWords !! Fed: " << fedId << ", errorType = 34" << std::endl; + errorsInEvent = true; + if (includeErrors) { + int errorType = 34; + SiPixelRawDataError error(*trailer, errorType, fedId); + errors[dummyDetId].push_back(error); + } + } + return fedTrailer.moreTrailers(); +} diff --git a/src/cudacompat/plugin-SiPixelClusterizer/ErrorChecker.h b/src/cudacompat/plugin-SiPixelClusterizer/ErrorChecker.h new file mode 100644 index 000000000..ff0aeb6d6 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/ErrorChecker.h @@ -0,0 +1,33 @@ +#ifndef ErrorChecker_H +#define ErrorChecker_H +/** \class ErrorChecker + * + * + */ + +#include +#include + +#include "DataFormats/SiPixelRawDataError.h" + +class ErrorChecker { +public: + typedef uint32_t Word32; + typedef uint64_t Word64; + + typedef std::vector DetErrors; + typedef std::map Errors; + + ErrorChecker(); + + bool checkCRC(bool& errorsInEvent, int fedId, const Word64* trailer, Errors& errors); + + bool checkHeader(bool& errorsInEvent, int fedId, const Word64* header, Errors& errors); + + bool checkTrailer(bool& errorsInEvent, int fedId, unsigned int nWords, const Word64* trailer, Errors& errors); + +private: + bool includeErrors; +}; + +#endif diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelFedCablingMapGPUWrapperESProducer.cc b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelFedCablingMapGPUWrapperESProducer.cc new file mode 100644 index 000000000..263e6d066 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelFedCablingMapGPUWrapperESProducer.cc @@ -0,0 +1,43 @@ +#include "CondFormats/SiPixelFedIds.h" +#include "CondFormats/SiPixelFedCablingMapGPU.h" +#include "CondFormats/SiPixelFedCablingMapGPUWrapper.h" +#include "Framework/ESProducer.h" +#include "Framework/EventSetup.h" +#include "Framework/ESPluginFactory.h" + +#include +#include + +class SiPixelFedCablingMapGPUWrapperESProducer : public edm::ESProducer { +public: + explicit SiPixelFedCablingMapGPUWrapperESProducer(std::filesystem::path const& datadir) : data_(datadir) {} + void produce(edm::EventSetup& eventSetup); + +private: + std::filesystem::path data_; +}; + +void 
SiPixelFedCablingMapGPUWrapperESProducer::produce(edm::EventSetup& eventSetup) { + { + std::ifstream in(data_ / "fedIds.bin", std::ios::binary); + in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + unsigned int nfeds; + in.read(reinterpret_cast(&nfeds), sizeof(unsigned)); + std::vector fedIds(nfeds); + in.read(reinterpret_cast(fedIds.data()), sizeof(unsigned int) * nfeds); + eventSetup.put(std::make_unique(std::move(fedIds))); + } + { + std::ifstream in(data_ / "cablingMap.bin", std::ios::binary); + in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + SiPixelFedCablingMapGPU obj; + in.read(reinterpret_cast(&obj), sizeof(SiPixelFedCablingMapGPU)); + unsigned int modToUnpDefSize; + in.read(reinterpret_cast(&modToUnpDefSize), sizeof(unsigned int)); + std::vector modToUnpDefault(modToUnpDefSize); + in.read(reinterpret_cast(modToUnpDefault.data()), modToUnpDefSize); + eventSetup.put(std::make_unique(obj, std::move(modToUnpDefault))); + } +} + +DEFINE_FWK_EVENTSETUP_MODULE(SiPixelFedCablingMapGPUWrapperESProducer); diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelGainCalibrationForHLTGPUESProducer.cc b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelGainCalibrationForHLTGPUESProducer.cc new file mode 100644 index 000000000..4a28c25e2 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelGainCalibrationForHLTGPUESProducer.cc @@ -0,0 +1,32 @@ +#include "CondFormats/SiPixelGainCalibrationForHLTGPU.h" +#include "CondFormats/SiPixelGainForHLTonGPU.h" +#include "Framework/ESProducer.h" +#include "Framework/EventSetup.h" +#include "Framework/ESPluginFactory.h" + +#include +#include +#include + +class SiPixelGainCalibrationForHLTGPUESProducer : public edm::ESProducer { +public: + explicit SiPixelGainCalibrationForHLTGPUESProducer(std::filesystem::path const& datadir) : data_(datadir) {} + void produce(edm::EventSetup& eventSetup); + +private: + std::filesystem::path data_; +}; + +void SiPixelGainCalibrationForHLTGPUESProducer::produce(edm::EventSetup& eventSetup) { + std::ifstream in(data_ / "gain.bin", std::ios::binary); + in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + SiPixelGainForHLTonGPU gain; + in.read(reinterpret_cast(&gain), sizeof(SiPixelGainForHLTonGPU)); + unsigned int nbytes; + in.read(reinterpret_cast(&nbytes), sizeof(unsigned int)); + std::vector gainData(nbytes); + in.read(gainData.data(), nbytes); + eventSetup.put(std::make_unique(gain, std::move(gainData))); +} + +DEFINE_FWK_EVENTSETUP_MODULE(SiPixelGainCalibrationForHLTGPUESProducer); diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc new file mode 100644 index 000000000..06624744e --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -0,0 +1,176 @@ +#include "CUDACore/Product.h" +#include "CUDADataFormats/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" +#include "CondFormats/SiPixelGainCalibrationForHLTGPU.h" +#include "CondFormats/SiPixelFedCablingMapGPUWrapper.h" +#include "CondFormats/SiPixelFedIds.h" +#include "DataFormats/PixelErrors.h" +#include "DataFormats/FEDNumbering.h" +#include "DataFormats/FEDRawData.h" +#include "DataFormats/FEDRawDataCollection.h" +#include "Framework/EventSetup.h" +#include "Framework/Event.h" +#include "Framework/PluginFactory.h" +#include 
"Framework/EDProducer.h" +#include "CUDACore/ScopedContext.h" + +#include "ErrorChecker.h" +#include "SiPixelRawToClusterGPUKernel.h" + +#include +#include +#include + +class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { +public: + explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg); + ~SiPixelRawToClusterCUDA() override = default; + +private: + void acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + cms::cuda::ContextState ctxState_; + + edm::EDGetTokenT rawGetToken_; + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT> digiErrorPutToken_; + edm::EDPutTokenT> clusterPutToken_; + + pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; + std::unique_ptr wordFedAppender_; + PixelFormatterErrors errors_; + + const bool isRun2_; + const bool includeErrors_; + const bool useQuality_; +}; + +SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg) + : rawGetToken_(reg.consumes()), + digiPutToken_(reg.produces>()), + clusterPutToken_(reg.produces>()), + isRun2_(true), + includeErrors_(true), + useQuality_(true) { + if (includeErrors_) { + digiErrorPutToken_ = reg.produces>(); + } + + wordFedAppender_ = std::make_unique(); +} + +void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; + + auto const& hgpuMap = iSetup.get(); + if (hgpuMap.hasQuality() != useQuality_) { + throw std::runtime_error("UseQuality of the module (" + std::to_string(useQuality_) + + ") differs the one from SiPixelFedCablingMapGPUWrapper. 
Please fix your configuration."); + } + // get the GPU product already here so that the async transfer can begin + const auto* gpuMap = hgpuMap.getGPUProductAsync(ctx.stream()); + const unsigned char* gpuModulesToUnpack = hgpuMap.getModToUnpAllAsync(ctx.stream()); + + auto const& hgains = iSetup.get(); + // get the GPU product already here so that the async transfer can begin + const auto* gpuGains = hgains.getGPUProductAsync(ctx.stream()); + + auto const& fedIds_ = iSetup.get().fedIds(); + + const auto& buffers = iEvent.get(rawGetToken_); + + errors_.clear(); + + // GPU specific: Data extraction for RawToDigi GPU + unsigned int wordCounterGPU = 0; + unsigned int fedCounter = 0; + bool errorsInEvent = false; + + // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() + ErrorChecker errorcheck; + for (int fedId : fedIds_) { + if (fedId == 40) + continue; // skip pilot blade data + + // for GPU + // first 150 index stores the fedId and next 150 will store the + // start index of word in that fed + assert(fedId >= 1200); + fedCounter++; + + // get event data for this fed + const FEDRawData& rawData = buffers.FEDData(fedId); + + // GPU specific + int nWords = rawData.size() / sizeof(uint64_t); + if (nWords == 0) { + continue; + } + + // check CRC bit + const uint64_t* trailer = reinterpret_cast(rawData.data()) + (nWords - 1); + if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) { + continue; + } + + // check headers + const uint64_t* header = reinterpret_cast(rawData.data()); + header--; + bool moreHeaders = true; + while (moreHeaders) { + header++; + bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_); + moreHeaders = headerStatus; + } + + // check trailers + bool moreTrailers = true; + trailer++; + while (moreTrailers) { + trailer--; + bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_); + moreTrailers = trailerStatus; + } + + const uint32_t* bw = (const uint32_t*)(header + 1); + const uint32_t* ew = (const uint32_t*)(trailer); + + assert(0 == (ew - bw) % 2); + wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew - bw)); + wordCounterGPU += (ew - bw); + + } // end of for loop + + gpuAlgo_.makeClustersAsync(isRun2_, + gpuMap, + gpuModulesToUnpack, + gpuGains, + *wordFedAppender_, + std::move(errors_), + wordCounterGPU, + fedCounter, + useQuality_, + includeErrors_, + false, // debug + ctx.stream()); +} + +void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + cms::cuda::ScopedContextProduce ctx{ctxState_}; + + auto tmp = gpuAlgo_.getResults(); + ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); + ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); + if (includeErrors_) { + ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); + } +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelRawToClusterCUDA); diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu new file mode 100644 index 000000000..f5070130a --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu @@ -0,0 +1,679 @@ +/* Sushil Dubey, Shashi Dugad, TIFR, July 2017 + * + * File Name: RawToClusterGPU.cu + * Description: It converts Raw data into Digi Format on GPU + * Finaly the Output of RawToDigi data is given to pixelClusterizer + * +**/ + +// C++ includes +#include +#include +#include +#include 
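The acquire() method above frames each FED buffer before handing the 32-bit payload words to the GPU: it checks the CRC on the last 64-bit word, then walks any extra header words forward and any extra trailer words backward until the real payload boundaries are found, and finally appends the words in between via the WordFedAppender. A condensed, host-only C++ sketch of that framing walk is shown below; isHeader and isTrailer are hypothetical predicates standing in for the moreHeaders/moreTrailers results of ErrorChecker::checkHeader and ErrorChecker::checkTrailer.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <utility>

// Sketch only: find the [begin, end) range of 32-bit payload words of one FED fragment,
// assuming the CRC of the last 64-bit word has already been checked (as checkCRC does above).
// isHeader/isTrailer return true while more header/trailer words follow.
std::pair<const uint32_t*, const uint32_t*> framePayload(
    const uint64_t* data,
    std::size_t nWords,
    const std::function<bool(const uint64_t*)>& isHeader,
    const std::function<bool(const uint64_t*)>& isTrailer) {
  const uint64_t* header = data;                // walk extra headers forward
  while (isHeader(header))
    ++header;
  const uint64_t* trailer = data + nWords - 1;  // walk extra trailers backward
  while (isTrailer(trailer))
    --trailer;
  auto* bw = reinterpret_cast<const uint32_t*>(header + 1);  // first payload word
  auto* ew = reinterpret_cast<const uint32_t*>(trailer);     // one past the last payload word
  return {bw, ew};                              // (ew - bw) is even, as asserted in acquire()
}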
+#include +#include +#include +#include +#include + +// CUDA includes +#include +#include + +// CMSSW includes +#include "CUDADataFormats/gpuClusteringConstants.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/host_unique_ptr.h" +#include "CondFormats/SiPixelFedCablingMapGPU.h" + +// local includes +#include "SiPixelRawToClusterGPUKernel.h" +#include "gpuCalibPixel.h" +#include "gpuClusterChargeCut.h" +#include "gpuClustering.h" + +namespace pixelgpudetails { + + // number of words for all the FEDs + constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; + + SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() { + word_ = cms::cuda::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); + fedId_ = cms::cuda::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); + } + + void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, + unsigned int wordCounterGPU, + const uint32_t *src, + unsigned int length) { + std::memcpy(word_.get() + wordCounterGPU, src, sizeof(uint32_t) * length); + std::memset(fedId_.get() + wordCounterGPU / 2, fedId - 1200, length / 2); + } + + //////////////////// + + __device__ uint32_t getLink(uint32_t ww) { + return ((ww >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask); + } + + __device__ uint32_t getRoc(uint32_t ww) { return ((ww >> pixelgpudetails::ROC_shift) & pixelgpudetails::ROC_mask); } + + __device__ uint32_t getADC(uint32_t ww) { return ((ww >> pixelgpudetails::ADC_shift) & pixelgpudetails::ADC_mask); } + + __device__ bool isBarrel(uint32_t rawId) { return (1 == ((rawId >> 25) & 0x7)); } + + __device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelFedCablingMapGPU *cablingMap, + uint8_t fed, + uint32_t link, + uint32_t roc) { + uint32_t index = fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc; + pixelgpudetails::DetIdGPU detId = { + cablingMap->RawId[index], cablingMap->rocInDet[index], cablingMap->moduleId[index]}; + return detId; + } + + //reference http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/dd/d31/FrameConversion_8cc_source.html + //http://cmslxr.fnal.gov/source/CondFormats/SiPixelObjects/src/PixelROC.cc?v=CMSSW_9_2_0#0071 + // Convert local pixel to pixelgpudetails::global pixel + __device__ pixelgpudetails::Pixel frameConversion( + bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, pixelgpudetails::Pixel local) { + int slopeRow = 0, slopeCol = 0; + int rowOffset = 0, colOffset = 0; + + if (bpix) { + if (side == -1 && layer != 1) { // -Z side: 4 non-flipped modules oriented like 'dddd', except Layer 1 + if (rocIdInDetUnit < 8) { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1; + } else { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1; + colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc; + } // if roc + } else { // +Z side: 4 non-flipped modules oriented like 'pppp', but all 8 in layer1 + if (rocIdInDetUnit < 8) { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1; + colOffset = rocIdInDetUnit * pixelgpudetails::numColsInRoc; + } else { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (16 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1; + } + } + + } else { // fpix + if (side == -1) { // pannel 1 + if (rocIdInDetUnit < 8) { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset 
= (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1; + } else { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1; + colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc; + } + } else { // pannel 2 + if (rocIdInDetUnit < 8) { + slopeRow = 1; + slopeCol = -1; + rowOffset = 0; + colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1; + } else { + slopeRow = -1; + slopeCol = 1; + rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1; + colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc; + } + + } // side + } + + uint32_t gRow = rowOffset + slopeRow * local.row; + uint32_t gCol = colOffset + slopeCol * local.col; + //printf("Inside frameConversion row: %u, column: %u\n", gRow, gCol); + pixelgpudetails::Pixel global = {gRow, gCol}; + return global; + } + + __device__ uint8_t conversionError(uint8_t fedId, uint8_t status, bool debug = false) { + uint8_t errorType = 0; + + // debug = true; + + switch (status) { + case (1): { + if (debug) + printf("Error in Fed: %i, invalid channel Id (errorType = 35\n)", fedId); + errorType = 35; + break; + } + case (2): { + if (debug) + printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId); + errorType = 36; + break; + } + case (3): { + if (debug) + printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId); + errorType = 37; + break; + } + case (4): { + if (debug) + printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId); + errorType = 38; + break; + } + default: + if (debug) + printf("Cabling check returned unexpected result, status = %i\n", status); + }; + + return errorType; + } + + __device__ bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol) { + uint32_t numRowsInRoc = 80; + uint32_t numColsInRoc = 52; + + /// row and collumn in ROC representation + return ((rocRow < numRowsInRoc) & (rocCol < numColsInRoc)); + } + + __device__ bool dcolIsValid(uint32_t dcol, uint32_t pxid) { return ((dcol < 26) & (2 <= pxid) & (pxid < 162)); } + + __device__ uint8_t checkROC( + uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelFedCablingMapGPU *cablingMap, bool debug = false) { + uint8_t errorType = (errorWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ERROR_mask; + if (errorType < 25) + return 0; + bool errorFound = false; + + switch (errorType) { + case (25): { + errorFound = true; + uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + 1; + if (index > 1 && index <= cablingMap->size) { + if (!(link == cablingMap->link[index] && 1 == cablingMap->roc[index])) + errorFound = false; + } + if (debug and errorFound) + printf("Invalid ROC = 25 found (errorType = 25)\n"); + break; + } + case (26): { + if (debug) + printf("Gap word found (errorType = 26)\n"); + errorFound = true; + break; + } + case (27): { + if (debug) + printf("Dummy word found (errorType = 27)\n"); + errorFound = true; + break; + } + case (28): { + if (debug) + printf("Error fifo nearly full (errorType = 28)\n"); + errorFound = true; + break; + } + case (29): { + if (debug) + printf("Timeout on a channel (errorType = 29)\n"); + if ((errorWord >> pixelgpudetails::OMIT_ERR_shift) & pixelgpudetails::OMIT_ERR_mask) { + if (debug) + printf("...first errorType=29 error, this gets masked out\n"); + } + errorFound = true; + break; + } + case (30): { + if (debug) + printf("TBM error trailer (errorType = 30)\n"); + int StateMatch_bits = 4; + int StateMatch_shift = 8; + uint32_t StateMatch_mask = ~(~uint32_t(0) << StateMatch_bits); + int 
StateMatch = (errorWord >> StateMatch_shift) & StateMatch_mask; + if (StateMatch != 1 && StateMatch != 8) { + if (debug) + printf("FED error 30 with unexpected State Bits (errorType = 30)\n"); + } + if (StateMatch == 1) + errorType = 40; // 1=Overflow -> 40, 8=number of ROCs -> 30 + errorFound = true; + break; + } + case (31): { + if (debug) + printf("Event number error (errorType = 31)\n"); + errorFound = true; + break; + } + default: + errorFound = false; + }; + + return errorFound ? errorType : 0; + } + + __device__ uint32_t getErrRawID(uint8_t fedId, + uint32_t errWord, + uint32_t errorType, + const SiPixelFedCablingMapGPU *cablingMap, + bool debug = false) { + uint32_t rID = 0xffffffff; + + switch (errorType) { + case 25: + case 30: + case 31: + case 36: + case 40: { + //set dummy values for cabling just to get detId from link + //cabling.dcol = 0; + //cabling.pxid = 2; + uint32_t roc = 1; + uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask; + uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + if (rID_temp != 9999) + rID = rID_temp; + break; + } + case 29: { + int chanNmbr = 0; + const int DB0_shift = 0; + const int DB1_shift = DB0_shift + 1; + const int DB2_shift = DB1_shift + 1; + const int DB3_shift = DB2_shift + 1; + const int DB4_shift = DB3_shift + 1; + const uint32_t DataBit_mask = ~(~uint32_t(0) << 1); + + int CH1 = (errWord >> DB0_shift) & DataBit_mask; + int CH2 = (errWord >> DB1_shift) & DataBit_mask; + int CH3 = (errWord >> DB2_shift) & DataBit_mask; + int CH4 = (errWord >> DB3_shift) & DataBit_mask; + int CH5 = (errWord >> DB4_shift) & DataBit_mask; + int BLOCK_bits = 3; + int BLOCK_shift = 8; + uint32_t BLOCK_mask = ~(~uint32_t(0) << BLOCK_bits); + int BLOCK = (errWord >> BLOCK_shift) & BLOCK_mask; + int localCH = 1 * CH1 + 2 * CH2 + 3 * CH3 + 4 * CH4 + 5 * CH5; + if (BLOCK % 2 == 0) + chanNmbr = (BLOCK / 2) * 9 + localCH; + else + chanNmbr = ((BLOCK - 1) / 2) * 9 + 4 + localCH; + if ((chanNmbr < 1) || (chanNmbr > 36)) + break; // signifies unexpected result + + // set dummy values for cabling just to get detId from link if in Barrel + //cabling.dcol = 0; + //cabling.pxid = 2; + uint32_t roc = 1; + uint32_t link = chanNmbr; + uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + if (rID_temp != 9999) + rID = rID_temp; + break; + } + case 37: + case 38: { + //cabling.dcol = 0; + //cabling.pxid = 2; + uint32_t roc = (errWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ROC_mask; + uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask; + uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; + if (rID_temp != 9999) + rID = rID_temp; + break; + } + default: + break; + }; + + return rID; + } + + // Kernel to perform Raw to Digi conversion + __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *cablingMap, + const unsigned char *modToUnp, + const uint32_t wordCounter, + const uint32_t *word, + const uint8_t *fedIds, + uint16_t *xx, + uint16_t *yy, + uint16_t *adc, + uint32_t *pdigi, + uint32_t *rawIdArr, + uint16_t *moduleId, + cms::cuda::SimpleVector *err, + bool useQualityInfo, + bool includeErrors, + bool debug) { + //if (threadIdx.x==0) printf("Event: %u blockIdx.x: %u start: %u end: %u\n", eventno, blockIdx.x, begin, end); + + int32_t first = threadIdx.x + blockIdx.x * blockDim.x; + for (int32_t iloop = first, nend = wordCounter; iloop < nend; iloop += blockDim.x * gridDim.x) { + auto gIndex = iloop; + xx[gIndex] = 0; + yy[gIndex] = 0; + adc[gIndex] 
= 0; + bool skipROC = false; + + uint8_t fedId = fedIds[gIndex / 2]; // +1200; + + // initialize (too many coninue below) + pdigi[gIndex] = 0; + rawIdArr[gIndex] = 0; + moduleId[gIndex] = 9999; + + uint32_t ww = word[gIndex]; // Array containing 32 bit raw data + if (ww == 0) { + // 0 is an indicator of a noise/dead channel, skip these pixels during clusterization + continue; + } + + uint32_t link = getLink(ww); // Extract link + uint32_t roc = getRoc(ww); // Extract Roc in link + pixelgpudetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc); + + uint8_t errorType = checkROC(ww, fedId, link, cablingMap, debug); + skipROC = (roc < pixelgpudetails::maxROCIndex) ? false : (errorType != 0); + if (includeErrors and skipROC) { + uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug); + err->push_back(PixelErrorCompact{rID, ww, errorType, fedId}); + continue; + } + + uint32_t rawId = detId.RawId; + uint32_t rocIdInDetUnit = detId.rocInDet; + bool barrel = isBarrel(rawId); + + uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc; + if (useQualityInfo) { + skipROC = cablingMap->badRocs[index]; + if (skipROC) + continue; + } + skipROC = modToUnp[index]; + if (skipROC) + continue; + + uint32_t layer = 0; //, ladder =0; + int side = 0, panel = 0, module = 0; //disk = 0, blade = 0 + + if (barrel) { + layer = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask; + module = (rawId >> pixelgpudetails::moduleStartBit) & pixelgpudetails::moduleMask; + side = (module < 5) ? -1 : 1; + } else { + // endcap ids + layer = 0; + panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask; + //disk = (rawId >> diskStartBit_) & diskMask_; + side = (panel == 1) ? -1 : 1; + //blade = (rawId >> bladeStartBit_) & bladeMask_; + } + + // ***special case of layer to 1 be handled here + pixelgpudetails::Pixel localPix; + if (layer == 1) { + uint32_t col = (ww >> pixelgpudetails::COL_shift) & pixelgpudetails::COL_mask; + uint32_t row = (ww >> pixelgpudetails::ROW_shift) & pixelgpudetails::ROW_mask; + localPix.row = row; + localPix.col = col; + if (includeErrors) { + if (not rocRowColIsValid(row, col)) { + uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays + err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); + if (debug) + printf("BPIX1 Error status: %i\n", error); + continue; + } + } + } else { + // ***conversion rules for dcol and pxid + uint32_t dcol = (ww >> pixelgpudetails::DCOL_shift) & pixelgpudetails::DCOL_mask; + uint32_t pxid = (ww >> pixelgpudetails::PXID_shift) & pixelgpudetails::PXID_mask; + uint32_t row = pixelgpudetails::numRowsInRoc - pxid / 2; + uint32_t col = dcol * 2 + pxid % 2; + localPix.row = row; + localPix.col = col; + if (includeErrors and not dcolIsValid(dcol, pxid)) { + uint8_t error = conversionError(fedId, 3, debug); + err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); + if (debug) + printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); + continue; + } + } + + pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix); + xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 + yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 + adc[gIndex] = getADC(ww); + pdigi[gIndex] = pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]); + moduleId[gIndex] = detId.moduleId; + rawIdArr[gIndex] = rawId; + } // end of loop (gIndex < end) + + } // end of Raw to Digi kernel + + __global__ void 
fillHitsModuleStart(uint32_t const *__restrict__ cluStart, uint32_t *__restrict__ moduleStart) { + assert(gpuClustering::MaxNumModules < 2048); // easy to extend at least till 32*1024 + assert(1 == gridDim.x); + assert(0 == blockIdx.x); + + int first = threadIdx.x; + + // limit to MaxHitsInModule; + for (int i = first, iend = gpuClustering::MaxNumModules; i < iend; i += blockDim.x) { + moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), cluStart[i]); + } + + __shared__ uint32_t ws[32]; + cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws); + cms::cuda::blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClustering::MaxNumModules - 1024, ws); + + for (int i = first + 1025, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) { + moduleStart[i] += moduleStart[1024]; + } + __syncthreads(); + +#ifdef GPU_DEBUG + assert(0 == moduleStart[0]); + auto c0 = std::min(gpuClustering::maxHitsInModule(), cluStart[0]); + assert(c0 == moduleStart[1]); + assert(moduleStart[1024] >= moduleStart[1023]); + assert(moduleStart[1025] >= moduleStart[1024]); + assert(moduleStart[gpuClustering::MaxNumModules] >= moduleStart[1025]); + + for (int i = first, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) { + if (0 != i) + assert(moduleStart[i] >= moduleStart[i - i]); + // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] + // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] + if (i == 96 || i == 1184 || i == 1744 || i == gpuClustering::MaxNumModules) + printf("moduleStart %d %d\n", i, moduleStart[i]); + } +#endif + + // avoid overflow + constexpr auto MAX_HITS = gpuClustering::MaxNumClusters; + for (int i = first, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) { + if (moduleStart[i] > MAX_HITS) + moduleStart[i] = MAX_HITS; + } + } + + // Interface to outside + void SiPixelRawToClusterGPUKernel::makeClustersAsync(bool isRun2, + const SiPixelFedCablingMapGPU *cablingMap, + const unsigned char *modToUnp, + const SiPixelGainForHLTonGPU *gains, + const WordFedAppender &wordFed, + PixelFormatterErrors &&errors, + const uint32_t wordCounter, + const uint32_t fedCounter, + bool useQualityInfo, + bool includeErrors, + bool debug, + cudaStream_t stream) { + nDigis = wordCounter; + +#ifdef GPU_DEBUG + std::cout << "decoding " << wordCounter << " digis. Max is " << pixelgpudetails::MAX_FED_WORDS << std::endl; +#endif + + digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, stream); + if (includeErrors) { + digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream); + } + clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream); + + nModules_Clusters_h = cms::cuda::make_host_unique(2, stream); + + if (wordCounter) // protect in case of empty event.... 
+ { + const int threadsPerBlock = 512; + const int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; // fill it all + + assert(0 == wordCounter % 2); + // wordCounter is the total no of words in each event to be trasfered on device + auto word_d = cms::cuda::make_device_unique(wordCounter, stream); + auto fedId_d = cms::cuda::make_device_unique(wordCounter, stream); + + cudaCheck( + cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync( + fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, stream)); + + // Launch rawToDigi kernel + RawToDigi_kernel<<>>( + cablingMap, + modToUnp, + wordCounter, + word_d.get(), + fedId_d.get(), + digis_d.xx(), + digis_d.yy(), + digis_d.adc(), + digis_d.pdigi(), + digis_d.rawIdArr(), + digis_d.moduleInd(), + digiErrors_d.error(), // returns nullptr if default-constructed + useQualityInfo, + includeErrors, + debug); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + if (includeErrors) { + digiErrors_d.copyErrorToHostAsync(stream); + } + } + // End of Raw2Digi and passing data for clustering + + { + // clusterizer ... + using namespace gpuClustering; + int threadsPerBlock = 256; + int blocks = + (std::max(int(wordCounter), int(gpuClustering::MaxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; + + gpuCalibPixel::calibDigis<<>>(isRun2, + digis_d.moduleInd(), + digis_d.c_xx(), + digis_d.c_yy(), + digis_d.adc(), + gains, + wordCounter, + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.clusModuleStart()); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +#ifdef GPU_DEBUG + std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock + << " threads\n"; +#endif + + countModules<<>>( + digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); + cudaCheck(cudaGetLastError()); + + // read the number of modules into a data member, used by getProduct()) + cudaCheck(cudaMemcpyAsync( + &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); + + threadsPerBlock = 256; + blocks = MaxNumModules; +#ifdef GPU_DEBUG + std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; +#endif + findClus<<>>(digis_d.c_moduleInd(), + digis_d.c_xx(), + digis_d.c_yy(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.clus(), + wordCounter); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // apply charge cut + clusterChargeCut<<>>(digis_d.moduleInd(), + digis_d.c_adc(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), + clusters_d.c_moduleId(), + digis_d.clus(), + wordCounter); + cudaCheck(cudaGetLastError()); + + // count the module start indices already here (instead of + // rechits) so that the number of clusters/hits can be made + // available in the rechit producer without additional points of + // synchronization/ExternalWork + + // MUST be ONE block + fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d.c_clusInModule(), clusters_d.clusModuleStart()); + + // last element holds the number of all clusters + cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), + clusters_d.clusModuleStart() + 
gpuClustering::MaxNumModules, + sizeof(uint32_t), + cudaMemcpyDefault, + stream)); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + } // end clusterizer scope + } +} // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h new file mode 100644 index 000000000..3cbce9e71 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h @@ -0,0 +1,223 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelRawToClusterGPUKernel_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelRawToClusterGPUKernel_h + +#include +#include + +#include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" +#include "CUDADataFormats/SiPixelClustersCUDA.h" +#include "CUDACore/SimpleVector.h" +#include "CUDACore/host_unique_ptr.h" +#include "CUDACore/host_noncached_unique_ptr.h" +#include "DataFormats/PixelErrors.h" + +struct SiPixelFedCablingMapGPU; +class SiPixelGainForHLTonGPU; + +namespace pixelgpudetails { + + // Phase 1 geometry constants + const uint32_t layerStartBit = 20; + const uint32_t ladderStartBit = 12; + const uint32_t moduleStartBit = 2; + + const uint32_t panelStartBit = 10; + const uint32_t diskStartBit = 18; + const uint32_t bladeStartBit = 12; + + const uint32_t layerMask = 0xF; + const uint32_t ladderMask = 0xFF; + const uint32_t moduleMask = 0x3FF; + const uint32_t panelMask = 0x3; + const uint32_t diskMask = 0xF; + const uint32_t bladeMask = 0x3F; + + const uint32_t LINK_bits = 6; + const uint32_t ROC_bits = 5; + const uint32_t DCOL_bits = 5; + const uint32_t PXID_bits = 8; + const uint32_t ADC_bits = 8; + + // special for layer 1 + const uint32_t LINK_bits_l1 = 6; + const uint32_t ROC_bits_l1 = 5; + const uint32_t COL_bits_l1 = 6; + const uint32_t ROW_bits_l1 = 7; + const uint32_t OMIT_ERR_bits = 1; + + const uint32_t maxROCIndex = 8; + const uint32_t numRowsInRoc = 80; + const uint32_t numColsInRoc = 52; + + const uint32_t MAX_WORD = 2000; + + const uint32_t ADC_shift = 0; + const uint32_t PXID_shift = ADC_shift + ADC_bits; + const uint32_t DCOL_shift = PXID_shift + PXID_bits; + const uint32_t ROC_shift = DCOL_shift + DCOL_bits; + const uint32_t LINK_shift = ROC_shift + ROC_bits_l1; + // special for layer 1 ROC + const uint32_t ROW_shift = ADC_shift + ADC_bits; + const uint32_t COL_shift = ROW_shift + ROW_bits_l1; + const uint32_t OMIT_ERR_shift = 20; + + const uint32_t LINK_mask = ~(~uint32_t(0) << LINK_bits_l1); + const uint32_t ROC_mask = ~(~uint32_t(0) << ROC_bits_l1); + const uint32_t COL_mask = ~(~uint32_t(0) << COL_bits_l1); + const uint32_t ROW_mask = ~(~uint32_t(0) << ROW_bits_l1); + const uint32_t DCOL_mask = ~(~uint32_t(0) << DCOL_bits); + const uint32_t PXID_mask = ~(~uint32_t(0) << PXID_bits); + const uint32_t ADC_mask = ~(~uint32_t(0) << ADC_bits); + const uint32_t ERROR_mask = ~(~uint32_t(0) << ROC_bits_l1); + const uint32_t OMIT_ERR_mask = ~(~uint32_t(0) << OMIT_ERR_bits); + + struct DetIdGPU { + uint32_t RawId; + uint32_t rocInDet; + uint32_t moduleId; + }; + + struct Pixel { + uint32_t row; + uint32_t col; + }; + + class Packing { + public: + using PackedDigiType = uint32_t; + + // Constructor: pre-computes masks and shifts from field widths + __host__ __device__ inline constexpr Packing(unsigned int row_w, + unsigned int column_w, + unsigned int time_w, + unsigned int adc_w) + : row_width(row_w), + 
column_width(column_w), + adc_width(adc_w), + row_shift(0), + column_shift(row_shift + row_w), + time_shift(column_shift + column_w), + adc_shift(time_shift + time_w), + row_mask(~(~0U << row_w)), + column_mask(~(~0U << column_w)), + time_mask(~(~0U << time_w)), + adc_mask(~(~0U << adc_w)), + rowcol_mask(~(~0U << (column_w + row_w))), + max_row(row_mask), + max_column(column_mask), + max_adc(adc_mask) {} + + uint32_t row_width; + uint32_t column_width; + uint32_t adc_width; + + uint32_t row_shift; + uint32_t column_shift; + uint32_t time_shift; + uint32_t adc_shift; + + PackedDigiType row_mask; + PackedDigiType column_mask; + PackedDigiType time_mask; + PackedDigiType adc_mask; + PackedDigiType rowcol_mask; + + uint32_t max_row; + uint32_t max_column; + uint32_t max_adc; + }; + + __host__ __device__ inline constexpr Packing packing() { return Packing(11, 11, 0, 10); } + + __host__ __device__ inline uint32_t pack(uint32_t row, uint32_t col, uint32_t adc) { + constexpr Packing thePacking = packing(); + adc = std::min(adc, thePacking.max_adc); + + return (row << thePacking.row_shift) | (col << thePacking.column_shift) | (adc << thePacking.adc_shift); + } + + constexpr uint32_t pixelToChannel(int row, int col) { + constexpr Packing thePacking = packing(); + return (row << thePacking.column_width) | col; + } + + class SiPixelRawToClusterGPUKernel { + public: + class WordFedAppender { + public: + WordFedAppender(); + ~WordFedAppender() = default; + + void initializeWordFed(int fedId, unsigned int wordCounterGPU, const uint32_t* src, unsigned int length); + + const unsigned int* word() const { return word_.get(); } + const unsigned char* fedId() const { return fedId_.get(); } + + private: + cms::cuda::host::noncached::unique_ptr word_; + cms::cuda::host::noncached::unique_ptr fedId_; + }; + + SiPixelRawToClusterGPUKernel() = default; + ~SiPixelRawToClusterGPUKernel() = default; + + SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete; + SiPixelRawToClusterGPUKernel(SiPixelRawToClusterGPUKernel&&) = delete; + SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete; + SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; + + void makeClustersAsync(bool isRun2, + const SiPixelFedCablingMapGPU* cablingMap, + const unsigned char* modToUnp, + const SiPixelGainForHLTonGPU* gains, + const WordFedAppender& wordFed, + PixelFormatterErrors&& errors, + const uint32_t wordCounter, + const uint32_t fedCounter, + bool useQualityInfo, + bool includeErrors, + bool debug, + cudaStream_t stream); + + std::pair getResults() { + digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis); + clusters_d.setNClusters(nModules_Clusters_h[1]); + // need to explicitly deallocate while the associated CUDA + // stream is still alive + // + // technically the statement above is not true anymore now that + // the CUDA streams are cached within the cms::cuda::StreamCache, but it is + // still better to release as early as possible + nModules_Clusters_h.reset(); + return std::make_pair(std::move(digis_d), std::move(clusters_d)); + } + + SiPixelDigiErrorsCUDA&& getErrors() { return std::move(digiErrors_d); } + + private: + uint32_t nDigis = 0; + + // Data to be put in the event + cms::cuda::host::unique_ptr nModules_Clusters_h; + SiPixelDigisCUDA digis_d; + SiPixelClustersCUDA clusters_d; + SiPixelDigiErrorsCUDA digiErrors_d; + }; + + // see RecoLocalTracker/SiPixelClusterizer + // all are runtime const, should be specified in python _cfg.py + struct 
ADCThreshold { + const int thePixelThreshold = 1000; // default Pixel threshold in electrons + const int theSeedThreshold = 1000; // seed thershold in electrons not used in our algo + const float theClusterThreshold = 4000; // cluster threshold in electron + const int ConversionFactor = 65; // adc to electron conversion factor + + const int theStackADC_ = 255; // the maximum adc count for stack layer + const int theFirstStack_ = 5; // the index of the fits stack layer + const double theElectronPerADCGain_ = 600; // ADC to electron conversion + }; + +} // namespace pixelgpudetails + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelRawToClusterGPUKernel_h diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h new file mode 100644 index 000000000..da36be6c4 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h @@ -0,0 +1,69 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuCalibPixel_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuCalibPixel_h + +#include +#include + +#include "CondFormats/SiPixelGainForHLTonGPU.h" +#include "CUDACore/cuda_assert.h" + +#include "gpuClusteringConstants.h" + +namespace gpuCalibPixel { + + constexpr uint16_t InvId = 9999; // must be > MaxNumModules + + // valid for run2 + constexpr float VCaltoElectronGain = 47; // L2-4: 47 +- 4.7 + constexpr float VCaltoElectronGain_L1 = 50; // L1: 49.6 +- 2.6 + constexpr float VCaltoElectronOffset = -60; // L2-4: -60 +- 130 + constexpr float VCaltoElectronOffset_L1 = -670; // L1: -670 +- 220 + + __global__ void calibDigis(bool isRun2, + uint16_t* id, + uint16_t const* __restrict__ x, + uint16_t const* __restrict__ y, + uint16_t* adc, + SiPixelGainForHLTonGPU const* __restrict__ ped, + int numElements, + uint32_t* __restrict__ moduleStart, // just to zero first + uint32_t* __restrict__ nClustersInModule, // just to zero them + uint32_t* __restrict__ clusModuleStart // just to zero first + ) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + + // zero for next kernels... + if (0 == first) + clusModuleStart[0] = moduleStart[0] = 0; + for (int i = first; i < gpuClustering::MaxNumModules; i += gridDim.x * blockDim.x) { + nClustersInModule[i] = 0; + } + + for (int i = first; i < numElements; i += gridDim.x * blockDim.x) { + if (InvId == id[i]) + continue; + + float conversionFactor = (isRun2) ? (id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f; + float offset = (isRun2) ? (id[i] < 96 ? 
VCaltoElectronOffset_L1 : VCaltoElectronOffset) : 0; + + bool isDeadColumn = false, isNoisyColumn = false; + + int row = x[i]; + int col = y[i]; + auto ret = ped->getPedAndGain(id[i], col, row, isDeadColumn, isNoisyColumn); + float pedestal = ret.first; + float gain = ret.second; + // float pedestal = 0; float gain = 1.; + if (isDeadColumn | isNoisyColumn) { + id[i] = InvId; + adc[i] = 0; + printf("bad pixel at %d in %d\n", i, id[i]); + } else { + float vcal = adc[i] * gain - pedestal * gain; + adc[i] = std::max(100, int(vcal * conversionFactor + offset)); + } + } + } +} // namespace gpuCalibPixel + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuCalibPixel_h diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h new file mode 100644 index 000000000..d0dd93044 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h @@ -0,0 +1,125 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h + +#include +#include + +#include "CUDACore/cuda_assert.h" +#include "CUDACore/prefixScan.h" + +#include "gpuClusteringConstants.h" + +namespace gpuClustering { + + __global__ void clusterChargeCut( + uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster) + uint16_t const* __restrict__ adc, // charge of each pixel + uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module + uint32_t* __restrict__ nClustersInModule, // modified: number of clusters found in each module + uint32_t const* __restrict__ moduleId, // module id of each module + int32_t* __restrict__ clusterId, // modified: cluster id of each pixel + uint32_t numElements) { + if (blockIdx.x >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx.x]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + assert(thisModuleId == moduleId[blockIdx.x]); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus == 0) + return; + + if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules) + printf("Warning too many clusters in module %d in block %d: %d > %d\n", + thisModuleId, + blockIdx.x, + nclus, + MaxNumClustersPerModules); + + auto first = firstPixel + threadIdx.x; + + if (nclus > MaxNumClustersPerModules) { + // remove excess FIXME find a way to cut charge first.... + for (auto i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + if (clusterId[i] >= MaxNumClustersPerModules) { + id[i] = InvId; + clusterId[i] = InvId; + } + } + nclus = MaxNumClustersPerModules; + } + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif + + __shared__ int32_t charge[MaxNumClustersPerModules]; + __shared__ uint8_t ok[MaxNumClustersPerModules]; + __shared__ uint16_t newclusId[MaxNumClustersPerModules]; + + assert(nclus <= MaxNumClustersPerModules); + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + charge[i] = 0; + } + __syncthreads(); + + for (auto i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + atomicAdd(&charge[clusterId[i]], adc[i]); + } + __syncthreads(); + + auto chargeCut = thisModuleId < 96 ? 
2000 : 4000; // move in constants (calib?) + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; + } + + __syncthreads(); + + // renumber + __shared__ uint16_t ws[32]; + cms::cuda::blockPrefixScan(newclusId, nclus, ws); + + assert(nclus >= newclusId[nclus - 1]); + + if (nclus == newclusId[nclus - 1]) + return; + + nClustersInModule[thisModuleId] = newclusId[nclus - 1]; + __syncthreads(); + + // mark bad cluster again + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + if (0 == ok[i]) + newclusId[i] = InvId + 1; + } + __syncthreads(); + + // reassign id + for (auto i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + clusterId[i] = newclusId[clusterId[i]] - 1; + if (clusterId[i] == InvId) + id[i] = InvId; + } + + //done + } + +} // namespace gpuClustering + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h new file mode 100644 index 000000000..84609bd10 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h @@ -0,0 +1,306 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h + +#include +#include + +#include "Geometry/phase1PixelTopology.h" +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cuda_assert.h" + +#include "gpuClusteringConstants.h" + +namespace gpuClustering { + +#ifdef GPU_DEBUG + __device__ uint32_t gMaxHit = 0; +#endif + + __global__ void countModules(uint16_t const* __restrict__ id, + uint32_t* __restrict__ moduleStart, + int32_t* __restrict__ clusterId, + int numElements) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < numElements; i += gridDim.x * blockDim.x) { + clusterId[i] = i; + if (InvId == id[i]) + continue; + auto j = i - 1; + while (j >= 0 and id[j] == InvId) + --j; + if (j < 0 or id[j] != id[i]) { + // boundary... 
+ auto loc = atomicInc(moduleStart, MaxNumModules); + moduleStart[loc + 1] = i; + } + } + } + + __global__ + // __launch_bounds__(256,4) + void + findClus(uint16_t const* __restrict__ id, // module id of each pixel + uint16_t const* __restrict__ x, // local coordinates of each pixel + uint16_t const* __restrict__ y, // + uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module + uint32_t* __restrict__ nClustersInModule, // output: number of clusters found in each module + uint32_t* __restrict__ moduleId, // output: module id of each module + int32_t* __restrict__ clusterId, // output: cluster id of each pixel + int numElements) { + if (blockIdx.x >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx.x]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif + + auto first = firstPixel + threadIdx.x; + + // find the index of the first pixel not belonging to this module (or invalid) + __shared__ int msize; + msize = numElements; + __syncthreads(); + + // skip threads not associated to an existing pixel + for (int i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (id[i] != thisModuleId) { // find the first pixel in a different module + atomicMin(&msize, i); + break; + } + } + + //init hist (ymax=416 < 512 : 9bits) + constexpr uint32_t maxPixInModule = 4000; + constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); + + // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) + if (0 == threadIdx.x) { + if (msize - firstPixel > maxPixInModule) { + printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); + msize = maxPixInModule + firstPixel; + } + } + + __syncthreads(); + assert(msize - firstPixel <= maxPixInModule); + +#ifdef GPU_DEBUG + __shared__ uint32_t totGood; + totGood = 0; + __syncthreads(); +#endif + + // fill histo + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + hist.count(y[i]); +#ifdef GPU_DEBUG + atomicAdd(&totGood, 1); +#endif + } + __syncthreads(); + if (threadIdx.x < 32) + ws[threadIdx.x] = 0; // used by prefix scan... 
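The per-module histogram used here follows the usual two-pass counting-sort scheme: every pixel first increments the counter of its y column (count), a prefix scan then turns the counters into bin offsets (finalize), and a second pass writes each pixel index into its bin (fill). A small host-side C++ sketch of that scheme is given below; the fixed-layout MiniHisto is a hypothetical simplification of cms::cuda::HistoContainer, single-threaded instead of using atomic increments and a shared-memory block prefix scan.

#include <cstdint>
#include <numeric>
#include <vector>

// Sketch only: two-pass bucket fill mirroring count()/finalize()/fill() as used in findClus.
struct MiniHisto {
  std::vector<uint32_t> off;     // after finalize(): off[b]..off[b+1] delimits bin b
  std::vector<uint32_t> cursor;  // running insertion point per bin
  std::vector<uint32_t> index;   // element indices grouped by bin

  explicit MiniHisto(uint32_t nbins) : off(nbins + 1, 0) {}

  void count(uint32_t bin) { ++off[bin + 1]; }

  void finalize() {  // prefix scan: counts -> offsets
    std::partial_sum(off.begin(), off.end(), off.begin());
    index.resize(off.back());
    cursor.assign(off.begin(), off.end() - 1);
  }

  void fill(uint32_t bin, uint32_t i) { index[cursor[bin]++] = i; }

  const uint32_t* begin(uint32_t bin) const { return index.data() + off[bin]; }
  const uint32_t* end(uint32_t bin) const { return index.data() + off[bin + 1]; }
};

// usage: for each pixel p { h.count(y[p]); }  h.finalize();  for each pixel p { h.fill(y[p], p); }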
+ __syncthreads(); + hist.finalize(ws); + __syncthreads(); +#ifdef GPU_DEBUG + assert(hist.size() == totGood); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("histo size %d\n", hist.size()); +#endif + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + hist.fill(y[i], i - firstPixel); + } + +#ifdef __CUDA_ARCH__ + // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations + constexpr int maxiter = 16; +#else + auto maxiter = hist.size(); +#endif + // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event + constexpr int maxNeighbours = 10; + assert((hist.size() / blockDim.x) <= maxiter); + // nearest neighbour + uint16_t nn[maxiter][maxNeighbours]; + uint8_t nnn[maxiter]; // number of nn + for (uint32_t k = 0; k < maxiter; ++k) + nnn[k] = 0; + + __syncthreads(); // for hit filling! + +#ifdef GPU_DEBUG + // look for anomalous high occupancy + __shared__ uint32_t n40, n60; + n40 = n60 = 0; + __syncthreads(); + for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) { + if (hist.size(j) > 60) + atomicAdd(&n60, 1); + if (hist.size(j) > 40) + atomicAdd(&n40, 1); + } + __syncthreads(); + if (0 == threadIdx.x) { + if (n60 > 0) + printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); + else if (n40 > 0) + printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); + } + __syncthreads(); +#endif + + // fill NN + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + assert(k < maxiter); + auto p = hist.begin() + j; + auto i = *p + firstPixel; + assert(id[i] != InvId); + assert(id[i] == thisModuleId); // same module + int be = Hist::bin(y[i] + 1); + auto e = hist.end(be); + ++p; + assert(0 == nnn[k]); + for (; p < e; ++p) { + auto m = (*p) + firstPixel; + assert(m != i); + assert(int(y[m]) - int(y[i]) >= 0); + assert(int(y[m]) - int(y[i]) <= 1); + if (std::abs(int(x[m]) - int(x[i])) > 1) + continue; + auto l = nnn[k]++; + assert(l < maxNeighbours); + nn[k][l] = *p; + } + } + + // for each pixel, look at all the pixels until the end of the module; + // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; + // after the loop, all the pixel in each cluster should have the id equeal to the lowest + // pixel in the cluster ( clus[i] == i ). 
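The comment above summarizes the clustering step: neighbouring pixels repeatedly exchange cluster ids, keeping the minimum, until nothing changes, so every pixel in a connected cluster ends up labelled by its lowest pixel index. Below is a compact single-threaded C++ sketch of that fixed-point iteration over a precomputed neighbour list; on the GPU the same two passes are done with atomicMin, and termination is detected collectively with __syncthreads_or(more).

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch only: iterative min-label propagation. Precondition: clusterId[i] == i for all i
// (as set in countModules); nn[i] lists the neighbours of pixel i (the +/-1 pairs built above).
void propagateMinLabels(std::vector<int32_t>& clusterId,
                        const std::vector<std::vector<uint32_t>>& nn) {
  bool more = true;
  while (more) {
    more = false;
    // path-compression pass: point every pixel directly at its current root
    for (std::size_t i = 0; i < clusterId.size(); ++i) {
      auto m = clusterId[i];
      while (m != clusterId[m])
        m = clusterId[m];
      clusterId[i] = m;
    }
    // relaxation pass: each neighbour pair adopts the smaller of the two labels
    for (std::size_t i = 0; i < clusterId.size(); ++i) {
      for (auto j : nn[i]) {
        auto lo = std::min(clusterId[i], clusterId[j]);
        if (clusterId[i] != lo || clusterId[j] != lo)
          more = true;                 // something changed, iterate again
        clusterId[i] = clusterId[j] = lo;
      }
    }
  }
}

Because labels only ever decrease and are bounded below by the smallest pixel index of the component, the loop converges; the number of iterations depends on the cluster shapes, which is what the GPU_DEBUG block right after this point reports via nloops.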
+ bool more = true; + int nloops = 0; + while (__syncthreads_or(more)) { + if (1 == nloops % 2) { + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + auto m = clusterId[i]; + while (m != clusterId[m]) + m = clusterId[m]; + clusterId[i] = m; + } + } else { + more = false; + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + for (int kk = 0; kk < nnn[k]; ++kk) { + auto l = nn[k][kk]; + auto m = l + firstPixel; + assert(m != i); + auto old = atomicMin(&clusterId[m], clusterId[i]); + if (old != clusterId[i]) { + // end the loop only if no changes were applied + more = true; + } + atomicMin(&clusterId[i], old); + } // nnloop + } // pixel loop + } + ++nloops; + } // end while + +#ifdef GPU_DEBUG + { + __shared__ int n0; + if (threadIdx.x == 0) + n0 = nloops; + __syncthreads(); + auto ok = n0 == nloops; + assert(__syncthreads_and(ok)); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("# loops %d\n", nloops); + } +#endif + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a pixels with clus[i] == i; + // mark these pixels with a negative id. + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (clusterId[i] == i) { + auto old = atomicInc(&foundClusters, 0xffffffff); + clusterId[i] = -(old + 1); + } + } + __syncthreads(); + + // propagate the negative id to all the pixels in the cluster. + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (clusterId[i] >= 0) { + // mark each pixel in a cluster with the same id as the first one + clusterId[i] = clusterId[clusterId[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a positive value starting from 0 + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) { // skip invalid pixels + clusterId[i] = -9999; + continue; + } + clusterId[i] = -clusterId[i] - 1; + } + __syncthreads(); + + if (threadIdx.x == 0) { + nClustersInModule[thisModuleId] = foundClusters; + moduleId[blockIdx.x] = thisModuleId; +#ifdef GPU_DEBUG + if (foundClusters > gMaxHit) { + gMaxHit = foundClusters; + if (foundClusters > 8) + printf("max hit %d in %d\n", foundClusters, thisModuleId); + } +#endif +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + printf("%d clusters in module %d\n", foundClusters, thisModuleId); +#endif + } + } + +} // namespace gpuClustering + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuClusteringConstants.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuClusteringConstants.h new file mode 100644 index 000000000..0bce634ee --- /dev/null +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuClusteringConstants.h @@ -0,0 +1,6 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h + +#include "CUDADataFormats/gpuClusteringConstants.h" + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h diff --git a/src/cudacompat/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudacompat/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc new file mode 100644 index 000000000..448f4b797 --- /dev/null +++ 
b/src/cudacompat/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc @@ -0,0 +1,72 @@ +#include "CUDACore/Product.h" +#include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "DataFormats/SiPixelDigisSoA.h" +#include "Framework/EventSetup.h" +#include "Framework/Event.h" +#include "Framework/EDProducer.h" +#include "Framework/PluginFactory.h" +#include "CUDACore/ScopedContext.h" +#include "CUDACore/host_unique_ptr.h" + +class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork { +public: + explicit SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg); + ~SiPixelDigisSoAFromCUDA() override = default; + +private: + void acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> digiGetToken_; + edm::EDPutTokenT digiPutToken_; + + cms::cuda::host::unique_ptr pdigi_; + cms::cuda::host::unique_ptr rawIdArr_; + cms::cuda::host::unique_ptr adc_; + cms::cuda::host::unique_ptr clus_; + + size_t nDigis_; +}; + +SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg) + : digiGetToken_(reg.consumes>()), + digiPutToken_(reg.produces()) {} + +void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Do the transfer in a CUDA stream parallel to the computation CUDA stream + cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + + nDigis_ = gpuDigis.nDigis(); + pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream()); + rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream()); + adc_ = gpuDigis.adcToHostAsync(ctx.stream()); + clus_ = gpuDigis.clusToHostAsync(ctx.stream()); +} + +void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // The following line copies the data from the pinned host memory to + // regular host memory. In principle that feels unnecessary (why not + // just use the pinned host memory?). There are a few arguments for + // doing it though + // - Now can release the pinned host memory back to the (caching) allocator + // * if we'd like to keep the pinned memory, we'd need to also + // keep the CUDA stream around as long as that, or allow pinned + // host memory to be allocated without a CUDA stream + // - What if a CPU algorithm would produce the same SoA? We can't + // use cudaMallocHost without a GPU... 
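A minimal sketch of the host-side pattern the comment above describes, using plain CUDA runtime calls in place of the caching allocator and of the framework's acquire/produce hand-off (names are hypothetical, error handling omitted, illustrative only):

    #include <cstdint>
    #include <vector>
    #include <cuda_runtime.h>

    // Stage device data into a pinned host buffer asynchronously ("acquire" phase),
    // then copy it into an ordinary std::vector once the stream has completed
    // ("produce" phase), so the pinned buffer can be released immediately instead of
    // being kept alive together with its CUDA stream.
    std::vector<uint32_t> stageAndCopy(const uint32_t* d_data, size_t n, cudaStream_t stream) {
      uint32_t* pinned = nullptr;
      cudaHostAlloc(reinterpret_cast<void**>(&pinned), n * sizeof(uint32_t), cudaHostAllocDefault);
      cudaMemcpyAsync(pinned, d_data, n * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);  // stand-in for the framework guaranteeing produce() runs after the copies finish
      std::vector<uint32_t> pageable(pinned, pinned + n);  // copy into regular host memory
      cudaFreeHost(pinned);                                // pinned memory goes back right away
      return pageable;
    }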
+ iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get()); + + pdigi_.reset(); + rawIdArr_.reset(); + adc_.reset(); + clus_.reset(); +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelDigisSoAFromCUDA); diff --git a/src/cudacompat/plugin-SiPixelRecHits/PixelCPEFastESProducer.cc b/src/cudacompat/plugin-SiPixelRecHits/PixelCPEFastESProducer.cc new file mode 100644 index 000000000..077491c3a --- /dev/null +++ b/src/cudacompat/plugin-SiPixelRecHits/PixelCPEFastESProducer.cc @@ -0,0 +1,23 @@ +#include "CondFormats/PixelCPEFast.h" +#include "Framework/ESProducer.h" +#include "Framework/EventSetup.h" +#include "Framework/ESPluginFactory.h" + +#include +#include +#include + +class PixelCPEFastESProducer : public edm::ESProducer { +public: + explicit PixelCPEFastESProducer(std::filesystem::path const& datadir) : data_(datadir) {} + void produce(edm::EventSetup& eventSetup); + +private: + std::filesystem::path data_; +}; + +void PixelCPEFastESProducer::produce(edm::EventSetup& eventSetup) { + eventSetup.put(std::make_unique(data_ / "cpefast.bin")); +} + +DEFINE_FWK_EVENTSETUP_MODULE(PixelCPEFastESProducer); diff --git a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu new file mode 100644 index 000000000..4cd3fc152 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu @@ -0,0 +1,78 @@ +// C++ headers +#include +#include + +// CUDA runtime +#include + +// CMSSW headers +#include "CUDACore/cudaCheck.h" +#include "CUDACore/device_unique_ptr.h" +#include "plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h" // ! +#include "plugin-SiPixelClusterizer/gpuClusteringConstants.h" // ! + +#include "PixelRecHits.h" +#include "gpuPixelRecHits.h" + +namespace { + __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t* hitsLayerStart) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + + assert(0 == hitsModuleStart[0]); + + if (i < 11) { + hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; +#ifdef GPU_DEBUG + printf("LayerStart %d %d: %d\n", i, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); +#endif + } + } +} // namespace + +namespace pixelgpudetails { + + TrackingRecHit2DCUDA PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + cudaStream_t stream) const { + auto nHits = clusters_d.nClusters(); + TrackingRecHit2DCUDA hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), stream); + + int threadsPerBlock = 128; + int blocks = digis_d.nModules(); // active modules (with digis) + +#ifdef GPU_DEBUG + std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; +#endif + if (blocks) // protect from empty events + gpuPixelRecHits::getHits<<>>( + cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // assuming full warp of threads is better than a smaller number... 
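The comment above refers to the setHitsLayerStart launch just below, which uses a single warp of 32 threads even though only 11 layer boundaries are written. A small illustration of that launch policy (hypothetical kernel, not part of the patch; error checking omitted):

    #include <cuda_runtime.h>

    // Launching a full warp and guarding with "if (i < n)" costs nothing extra, since
    // the hardware schedules whole warps anyway; the surplus threads simply idle.
    __global__ void writeFirstN(int const* __restrict__ src, int* __restrict__ dst, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)  // here threads n..31 do nothing
        dst[i] = src[i];
    }

    void launchWriteFirstN(int const* d_src, int* d_dst, int n, cudaStream_t stream) {
      writeFirstN<<<1, 32, 0, stream>>>(d_src, d_dst, n);  // one warp, as for setHitsLayerStart
    }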
+ if (nHits) { + setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + cudaCheck(cudaGetLastError()); + } + + if (nHits) { + cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); + cudaCheck(cudaGetLastError()); + } + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + return hits_d; + } + +} // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h new file mode 100644 index 000000000..8f5653fbd --- /dev/null +++ b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h @@ -0,0 +1,33 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h +#define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h + +#include + +#include + +#include "CUDADataFormats/BeamSpotCUDA.h" +#include "CUDADataFormats/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" + +namespace pixelgpudetails { + + class PixelRecHitGPUKernel { + public: + PixelRecHitGPUKernel() = default; + ~PixelRecHitGPUKernel() = default; + + PixelRecHitGPUKernel(const PixelRecHitGPUKernel&) = delete; + PixelRecHitGPUKernel(PixelRecHitGPUKernel&&) = delete; + PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; + PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; + + TrackingRecHit2DCUDA makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + cudaStream_t stream) const; + }; +} // namespace pixelgpudetails + +#endif // RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h diff --git a/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc new file mode 100644 index 000000000..a82e23eab --- /dev/null +++ b/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc @@ -0,0 +1,61 @@ +#include + +#include "CUDADataFormats/BeamSpotCUDA.h" +#include "CUDACore/Product.h" +#include "CUDADataFormats/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "Framework/EventSetup.h" +#include "Framework/Event.h" +#include "Framework/PluginFactory.h" +#include "Framework/EDProducer.h" +#include "CUDACore/ScopedContext.h" +#include "CondFormats/PixelCPEFast.h" + +#include "PixelRecHits.h" // TODO : spit product from kernel + +class SiPixelRecHitCUDA : public edm::EDProducer { +public: + explicit SiPixelRecHitCUDA(edm::ProductRegistry& reg); + ~SiPixelRecHitCUDA() override = default; + +private: + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + // The mess with inputs will be cleaned up when migrating to the new framework + edm::EDGetTokenT> tBeamSpot; + edm::EDGetTokenT> token_; + edm::EDGetTokenT> tokenDigi_; + + edm::EDPutTokenT> tokenHit_; + + pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; +}; + +SiPixelRecHitCUDA::SiPixelRecHitCUDA(edm::ProductRegistry& reg) + : tBeamSpot(reg.consumes>()), + token_(reg.consumes>()), + tokenDigi_(reg.consumes>()), + tokenHit_(reg.produces>()) {} + +void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { + PixelCPEFast const& fcpe = es.get(); + + auto const& pclusters = iEvent.get(token_); + cms::cuda::ScopedContextProduce ctx{pclusters}; + + auto const& clusters = 
ctx.get(pclusters); + auto const& digis = ctx.get(iEvent, tokenDigi_); + auto const& bs = ctx.get(iEvent, tBeamSpot); + + auto nHits = clusters.nClusters(); + if (nHits >= TrackingRecHit2DSOAView::maxHits()) { + std::cout << "Clusters/Hits Overflow " << nHits << " >= " << TrackingRecHit2DSOAView::maxHits() << std::endl; + } + + ctx.emplace(iEvent, + tokenHit_, + gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx.stream())); +} + +DEFINE_FWK_MODULE(SiPixelRecHitCUDA); diff --git a/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h new file mode 100644 index 000000000..433d3b012 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h @@ -0,0 +1,222 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h +#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h + +#include +#include +#include + +#include "CUDADataFormats/BeamSpotCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "DataFormats/approx_atan2.h" +#include "CUDACore/cuda_assert.h" +#include "CondFormats/pixelCPEforGPU.h" + +namespace gpuPixelRecHits { + + __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, + BeamSpotPOD const* __restrict__ bs, + SiPixelDigisCUDA::DeviceConstView const* __restrict__ pdigis, + int numElements, + SiPixelClustersCUDA::DeviceConstView const* __restrict__ pclusters, + TrackingRecHit2DSOAView* phits) { + // FIXME + // the compiler seems NOT to optimize loads from views (even in a simple test case) + // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature + // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain) + + assert(phits); + assert(cpeParams); + + auto& hits = *phits; + + auto const digis = *pdigis; // the copy is intentional! + auto const& clusters = *pclusters; + + // copy average geometry corrected by beamspot . FIXME (move it somewhere else???) + if (0 == blockIdx.x) { + auto& agc = hits.averageGeometry(); + auto const& ag = cpeParams->averageGeometry(); + for (int il = threadIdx.x, nl = TrackingRecHit2DSOAView::AverageGeometry::numberOfLaddersInBarrel; il < nl; + il += blockDim.x) { + agc.ladderZ[il] = ag.ladderZ[il] - bs->z; + agc.ladderX[il] = ag.ladderX[il] - bs->x; + agc.ladderY[il] = ag.ladderY[il] - bs->y; + agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]); + agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z; + agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z; + } + if (0 == threadIdx.x) { + agc.endCapZ[0] = ag.endCapZ[0] - bs->z; + agc.endCapZ[1] = ag.endCapZ[1] - bs->z; + // printf("endcapZ %f %f\n",agc.endCapZ[0],agc.endCapZ[1]); + } + } + + // to be moved in common namespace... 
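The loop above shifts the cached average ladder geometry into the beamspot frame; a compact restatement with hypothetical standalone types (illustrative only):

    #include <cmath>

    struct BeamSpot { float x, y, z; };
    struct Ladder { float x, y, z, r, minZ, maxZ; };

    // Subtract the beamspot position from the stored ladder coordinates and recompute
    // the transverse radius, so later per-hit code works directly in beamspot-centred
    // coordinates (this mirrors what the agc/ag loop above does per ladder).
    Ladder correctForBeamSpot(Ladder g, BeamSpot const& bs) {
      g.x -= bs.x;
      g.y -= bs.y;
      g.z -= bs.z;
      g.minZ -= bs.z;
      g.maxZ -= bs.z;
      g.r = std::sqrt(g.x * g.x + g.y * g.y);
      return g;
    }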
+ constexpr uint16_t InvId = 9999; // must be > MaxNumModules + constexpr int32_t MaxHitsInIter = pixelCPEforGPU::MaxHitsInIter; + + using ClusParams = pixelCPEforGPU::ClusParams; + + // as usual one block per module + __shared__ ClusParams clusParams; + + auto me = clusters.moduleId(blockIdx.x); + int nclus = clusters.clusInModule(me); + + if (0 == nclus) + return; + +#ifdef GPU_DEBUG + if (threadIdx.x == 0) { + auto k = clusters.moduleStart(1 + blockIdx.x); + while (digis.moduleInd(k) == InvId) + ++k; + assert(digis.moduleInd(k) == me); + } +#endif + +#ifdef GPU_DEBUG + if (me % 100 == 1) + if (threadIdx.x == 0) + printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); +#endif + + for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { + auto first = clusters.moduleStart(1 + blockIdx.x); + + int nClusInIter = std::min(MaxHitsInIter, endClus - startClus); + int lastClus = startClus + nClusInIter; + assert(nClusInIter <= nclus); + assert(nClusInIter > 0); + assert(lastClus <= nclus); + + assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus)); + + // init + for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { + clusParams.minRow[ic] = std::numeric_limits::max(); + clusParams.maxRow[ic] = 0; + clusParams.minCol[ic] = std::numeric_limits::max(); + clusParams.maxCol[ic] = 0; + clusParams.charge[ic] = 0; + clusParams.Q_f_X[ic] = 0; + clusParams.Q_l_X[ic] = 0; + clusParams.Q_f_Y[ic] = 0; + clusParams.Q_l_Y[ic] = 0; + } + + first += threadIdx.x; + + __syncthreads(); + + // one thead per "digi" + + for (int i = first; i < numElements; i += blockDim.x) { + auto id = digis.moduleInd(i); + if (id == InvId) + continue; // not valid + if (id != me) + break; // end of module + auto cl = digis.clus(i); + if (cl < startClus || cl >= lastClus) + continue; + auto x = digis.xx(i); + auto y = digis.yy(i); + cl -= startClus; + assert(cl >= 0); + assert(cl < MaxHitsInIter); + atomicMin(&clusParams.minRow[cl], x); + atomicMax(&clusParams.maxRow[cl], x); + atomicMin(&clusParams.minCol[cl], y); + atomicMax(&clusParams.maxCol[cl], y); + } + + __syncthreads(); + + // pixmx is not available in the binary dumps + //auto pixmx = cpeParams->detParams(me).pixmx; + auto pixmx = std::numeric_limits::max(); + for (int i = first; i < numElements; i += blockDim.x) { + auto id = digis.moduleInd(i); + if (id == InvId) + continue; // not valid + if (id != me) + break; // end of module + auto cl = digis.clus(i); + if (cl < startClus || cl >= lastClus) + continue; + cl -= startClus; + assert(cl >= 0); + assert(cl < MaxHitsInIter); + auto x = digis.xx(i); + auto y = digis.yy(i); + auto ch = std::min(digis.adc(i), pixmx); + atomicAdd(&clusParams.charge[cl], ch); + if (clusParams.minRow[cl] == x) + atomicAdd(&clusParams.Q_f_X[cl], ch); + if (clusParams.maxRow[cl] == x) + atomicAdd(&clusParams.Q_l_X[cl], ch); + if (clusParams.minCol[cl] == y) + atomicAdd(&clusParams.Q_f_Y[cl], ch); + if (clusParams.maxCol[cl] == y) + atomicAdd(&clusParams.Q_l_Y[cl], ch); + } + + __syncthreads(); + + // next one cluster per thread... + + first = clusters.clusModuleStart(me) + startClus; + + for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { + auto h = first + ic; // output index in global memory + + // this cannot happen anymore + if (h >= TrackingRecHit2DSOAView::maxHits()) + break; // overflow... 
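The two passes above (bounding box via atomicMin/atomicMax, then charge sums gated on the stored extrema) reduce each cluster to the quantities the CPE needs. A serial analogue with hypothetical types (illustrative only; the kernel needs atomics because many threads handle digis of the same cluster concurrently):

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <vector>

    struct Digi { uint16_t x, y, adc; };

    struct ClusterSums {
      uint16_t minRow = std::numeric_limits<uint16_t>::max(), maxRow = 0;
      uint16_t minCol = std::numeric_limits<uint16_t>::max(), maxCol = 0;
      int32_t charge = 0, qFirstRow = 0, qLastRow = 0, qFirstCol = 0, qLastCol = 0;
    };

    // First pass: bounding box in row/column. Second pass: total charge plus the
    // charge collected in the first/last row and column, used for the position estimate.
    ClusterSums summarize(std::vector<Digi> const& clusterDigis) {
      ClusterSums s;
      for (auto const& d : clusterDigis) {
        s.minRow = std::min(s.minRow, d.x);
        s.maxRow = std::max(s.maxRow, d.x);
        s.minCol = std::min(s.minCol, d.y);
        s.maxCol = std::max(s.maxCol, d.y);
      }
      for (auto const& d : clusterDigis) {
        s.charge += d.adc;
        if (d.x == s.minRow) s.qFirstRow += d.adc;
        if (d.x == s.maxRow) s.qLastRow += d.adc;
        if (d.y == s.minCol) s.qFirstCol += d.adc;
        if (d.y == s.maxCol) s.qLastCol += d.adc;
      }
      return s;
    }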
+ assert(h < hits.nHits()); + assert(h < clusters.clusModuleStart(me + 1)); + + pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); + pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); + + // store it + + hits.charge(h) = clusParams.charge[ic]; + + hits.detectorIndex(h) = me; + + float xl, yl; + hits.xLocal(h) = xl = clusParams.xpos[ic]; + hits.yLocal(h) = yl = clusParams.ypos[ic]; + + hits.clusterSizeX(h) = clusParams.xsize[ic]; + hits.clusterSizeY(h) = clusParams.ysize[ic]; + + hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic]; + hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic]; + + // keep it local for computations + float xg, yg, zg; + // to global and compute phi... + cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg); + // here correct for the beamspot... + xg -= bs->x; + yg -= bs->y; + zg -= bs->z; + + hits.xGlobal(h) = xg; + hits.yGlobal(h) = yg; + hits.zGlobal(h) = zg; + + hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); + hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + } + __syncthreads(); + } // end loop on batches + } + +} // namespace gpuPixelRecHits + +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h diff --git a/src/cudacompat/plugin-Validation/CountValidator.cc b/src/cudacompat/plugin-Validation/CountValidator.cc new file mode 100644 index 000000000..23352f5ba --- /dev/null +++ b/src/cudacompat/plugin-Validation/CountValidator.cc @@ -0,0 +1,149 @@ +#include "CUDACore/Product.h" +#include "CUDACore/ScopedContext.h" +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "CUDADataFormats/ZVertexHeterogeneous.h" +#include "DataFormats/DigiClusterCount.h" +#include "DataFormats/TrackCount.h" +#include "DataFormats/VertexCount.h" +#include "Framework/EventSetup.h" +#include "Framework/Event.h" +#include "Framework/PluginFactory.h" +#include "Framework/EDProducer.h" + +#include +#include +#include +#include + +namespace { + std::atomic allEvents = 0; + std::atomic goodEvents = 0; + std::atomic sumVertexDifference = 0; + + std::mutex sumTrackDifferenceMutex; + float sumTrackDifference = 0; +} // namespace + +class CountValidator : public edm::EDProducer { +public: + explicit CountValidator(edm::ProductRegistry& reg); + +private: + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void endJob() override; + + edm::EDGetTokenT digiClusterCountToken_; + edm::EDGetTokenT trackCountToken_; + edm::EDGetTokenT vertexCountToken_; + + edm::EDGetTokenT> digiToken_; + edm::EDGetTokenT> clusterToken_; + edm::EDGetTokenT trackToken_; + edm::EDGetTokenT vertexToken_; +}; + +CountValidator::CountValidator(edm::ProductRegistry& reg) + : digiClusterCountToken_(reg.consumes()), + trackCountToken_(reg.consumes()), + vertexCountToken_(reg.consumes()), + digiToken_(reg.consumes>()), + clusterToken_(reg.consumes>()), + trackToken_(reg.consumes()), + vertexToken_(reg.consumes()) {} + +void CountValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + constexpr float trackTolerance = 0.012f; // in 200 runs of 1k events all events are withing this tolerance + constexpr int vertexTolerance = 1; + + std::stringstream ss; + bool ok = true; + + ss << "Event " << iEvent.eventID() << " "; + + { + auto const& pdigis = iEvent.get(digiToken_); + cms::cuda::ScopedContextProduce ctx{pdigis}; + auto const& count = 
iEvent.get(digiClusterCountToken_); + auto const& digis = ctx.get(iEvent, digiToken_); + auto const& clusters = ctx.get(iEvent, clusterToken_); + + if (digis.nModules() != count.nModules()) { + ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules(); + ok = false; + } + if (digis.nDigis() != count.nDigis()) { + ss << "\n N(digis) is " << digis.nDigis() << " expected " << count.nDigis(); + ok = false; + } + if (clusters.nClusters() != count.nClusters()) { + ss << "\n N(clusters) is " << clusters.nClusters() << " expected " << count.nClusters(); + ok = false; + } + } + + { + auto const& count = iEvent.get(trackCountToken_); + auto const& tracks = iEvent.get(trackToken_); + + int nTracks = 0; + for (int i = 0; i < tracks->stride(); ++i) { + if (tracks->nHits(i) > 0) { + ++nTracks; + } + } + + auto rel = std::abs(float(nTracks - int(count.nTracks())) / count.nTracks()); + if (static_cast(nTracks) != count.nTracks()) { + std::lock_guard guard(sumTrackDifferenceMutex); + sumTrackDifference += rel; + } + if (rel >= trackTolerance) { + ss << "\n N(tracks) is " << nTracks << " expected " << count.nTracks() << ", relative difference " << rel + << " is outside tolerance " << trackTolerance; + ok = false; + } + } + + { + auto const& count = iEvent.get(vertexCountToken_); + auto const& vertices = iEvent.get(vertexToken_); + + auto diff = std::abs(int(vertices->nvFinal) - int(count.nVertices())); + if (diff != 0) { + sumVertexDifference += diff; + } + if (diff > vertexTolerance) { + ss << "\n N(vertices) is " << vertices->nvFinal << " expected " << count.nVertices() << ", difference " << diff + << " is outside tolerance " << vertexTolerance; + ok = false; + } + } + + ++allEvents; + if (ok) { + ++goodEvents; + } else { + std::cout << ss.str() << std::endl; + } +} + +void CountValidator::endJob() { + if (allEvents == goodEvents) { + std::cout << "CountValidator: all " << allEvents << " events passed validation\n"; + if (sumTrackDifference != 0.f) { + std::cout << " Average relative track difference " << sumTrackDifference / allEvents.load() + << " (all within tolerance)\n"; + } + if (sumVertexDifference != 0) { + std::cout << " Average absolute vertex difference " << float(sumVertexDifference.load()) / allEvents.load() + << " (all within tolerance)\n"; + } + } else { + std::cout << "CountValidator: " << (allEvents - goodEvents) << " events failed validation (see details above)\n"; + throw std::runtime_error("CountValidator failed"); + } +} + +DEFINE_FWK_MODULE(CountValidator); diff --git a/src/cudacompat/plugin-Validation/HistoValidator.cc b/src/cudacompat/plugin-Validation/HistoValidator.cc new file mode 100644 index 000000000..d7b11d4b2 --- /dev/null +++ b/src/cudacompat/plugin-Validation/HistoValidator.cc @@ -0,0 +1,192 @@ +#include "CUDACore/Product.h" +#include "CUDACore/ScopedContext.h" +#include "CUDADataFormats/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/ZVertexHeterogeneous.h" +#include "Framework/EventSetup.h" +#include "Framework/Event.h" +#include "Framework/PluginFactory.h" +#include "Framework/EDProducer.h" + +#include "SimpleAtomicHisto.h" + +#include +#include + +class HistoValidator : public edm::EDProducerExternalWork { +public: + explicit HistoValidator(edm::ProductRegistry& reg); + +private: + void acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder 
waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void endJob() override; + + edm::EDGetTokenT> digiToken_; + edm::EDGetTokenT> clusterToken_; + edm::EDGetTokenT> hitToken_; + edm::EDGetTokenT trackToken_; + edm::EDGetTokenT vertexToken_; + + uint32_t nDigis; + uint32_t nModules; + uint32_t nClusters; + uint32_t nHits; + cms::cuda::host::unique_ptr h_adc; + cms::cuda::host::unique_ptr h_clusInModule; + cms::cuda::host::unique_ptr h_localCoord; + cms::cuda::host::unique_ptr h_globalCoord; + cms::cuda::host::unique_ptr h_charge; + cms::cuda::host::unique_ptr h_size; + + static std::map histos; +}; + +std::map HistoValidator::histos = { + {"digi_n", SimpleAtomicHisto(100, 0, 1e5)}, + {"digi_adc", SimpleAtomicHisto(250, 0, 5e4)}, + {"module_n", SimpleAtomicHisto(100, 1500, 2000)}, + {"cluster_n", SimpleAtomicHisto(200, 5000, 25000)}, + {"cluster_per_module_n", SimpleAtomicHisto(110, 0, 110)}, + {"hit_n", SimpleAtomicHisto(200, 5000, 25000)}, + {"hit_lx", SimpleAtomicHisto(200, -1, 1)}, + {"hit_ly", SimpleAtomicHisto(800, -4, 4)}, + {"hit_lex", SimpleAtomicHisto(100, 0, 5e-5)}, + {"hit_ley", SimpleAtomicHisto(100, 0, 1e-4)}, + {"hit_gx", SimpleAtomicHisto(200, -20, 20)}, + {"hit_gy", SimpleAtomicHisto(200, -20, 20)}, + {"hit_gz", SimpleAtomicHisto(600, -60, 60)}, + {"hit_gr", SimpleAtomicHisto(200, 0, 20)}, + {"hit_charge", SimpleAtomicHisto(400, 0, 4e6)}, + {"hit_sizex", SimpleAtomicHisto(800, 0, 800)}, + {"hit_sizey", SimpleAtomicHisto(800, 0, 800)}, + {"track_n", SimpleAtomicHisto(150, 0, 15000)}, + {"track_nhits", SimpleAtomicHisto(3, 3, 6)}, + {"track_chi2", SimpleAtomicHisto(100, 0, 40)}, + {"track_pt", SimpleAtomicHisto(400, 0, 400)}, + {"track_eta", SimpleAtomicHisto(100, -3, 3)}, + {"track_phi", SimpleAtomicHisto(100, -3.15, 3.15)}, + {"track_tip", SimpleAtomicHisto(100, -1, 1)}, + {"track_tip_zoom", SimpleAtomicHisto(100, -0.05, 0.05)}, + {"track_zip", SimpleAtomicHisto(100, -15, 15)}, + {"track_zip_zoom", SimpleAtomicHisto(100, -0.1, 0.1)}, + {"track_quality", SimpleAtomicHisto(6, 0, 6)}, + {"vertex_n", SimpleAtomicHisto(60, 0, 60)}, + {"vertex_z", SimpleAtomicHisto(100, -15, 15)}, + {"vertex_chi2", SimpleAtomicHisto(100, 0, 40)}, + {"vertex_ndof", SimpleAtomicHisto(170, 0, 170)}, + {"vertex_pt2", SimpleAtomicHisto(100, 0, 4000)}}; + +HistoValidator::HistoValidator(edm::ProductRegistry& reg) + : digiToken_(reg.consumes>()), + clusterToken_(reg.consumes>()), + hitToken_(reg.consumes>()), + trackToken_(reg.consumes()), + vertexToken_(reg.consumes()) {} + +void HistoValidator::acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + auto const& pdigis = iEvent.get(digiToken_); + cms::cuda::ScopedContextAcquire ctx{pdigis, std::move(waitingTaskHolder)}; + auto const& digis = ctx.get(iEvent, digiToken_); + auto const& clusters = ctx.get(iEvent, clusterToken_); + auto const& hits = ctx.get(iEvent, hitToken_); + + nDigis = digis.nDigis(); + nModules = digis.nModules(); + h_adc = digis.adcToHostAsync(ctx.stream()); + + nClusters = clusters.nClusters(); + h_clusInModule = cms::cuda::make_host_unique(nModules, ctx.stream()); + cudaCheck(cudaMemcpyAsync( + h_clusInModule.get(), clusters.clusInModule(), sizeof(uint32_t) * nModules, cudaMemcpyDefault, ctx.stream())); + + nHits = hits.nHits(); + h_localCoord = hits.localCoordToHostAsync(ctx.stream()); + h_globalCoord = hits.globalCoordToHostAsync(ctx.stream()); + h_charge = hits.chargeToHostAsync(ctx.stream()); + h_size = 
hits.sizeToHostAsync(ctx.stream()); +} + +void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + histos["digi_n"].fill(nDigis); + for (uint32_t i = 0; i < nDigis; ++i) { + histos["digi_adc"].fill(h_adc[i]); + } + h_adc.reset(); + histos["module_n"].fill(nModules); + + histos["cluster_n"].fill(nClusters); + for (uint32_t i = 0; i < nModules; ++i) { + histos["cluster_per_module_n"].fill(h_clusInModule[i]); + } + h_clusInModule.reset(); + + histos["hit_n"].fill(nHits); + for (uint32_t i = 0; i < nHits; ++i) { + histos["hit_lx"].fill(h_localCoord[i]); + histos["hit_ly"].fill(h_localCoord[i + nHits]); + histos["hit_lex"].fill(h_localCoord[i + 2 * nHits]); + histos["hit_ley"].fill(h_localCoord[i + 3 * nHits]); + histos["hit_gx"].fill(h_globalCoord[i]); + histos["hit_gy"].fill(h_globalCoord[i + nHits]); + histos["hit_gz"].fill(h_globalCoord[i + 2 * nHits]); + histos["hit_gr"].fill(h_globalCoord[i + 3 * nHits]); + histos["hit_charge"].fill(h_charge[i]); + histos["hit_sizex"].fill(h_size[i]); + histos["hit_sizey"].fill(h_size[i + nHits]); + } + h_localCoord.reset(); + h_globalCoord.reset(); + h_charge.reset(); + h_size.reset(); + + { + auto const& tracks = iEvent.get(trackToken_); + + int nTracks = 0; + for (int i = 0; i < tracks->stride(); ++i) { + if (tracks->nHits(i) > 0 and tracks->quality(i) >= trackQuality::loose) { + ++nTracks; + histos["track_nhits"].fill(tracks->nHits(i)); + histos["track_chi2"].fill(tracks->chi2(i)); + histos["track_pt"].fill(tracks->pt(i)); + histos["track_eta"].fill(tracks->eta(i)); + histos["track_phi"].fill(tracks->phi(i)); + histos["track_tip"].fill(tracks->tip(i)); + histos["track_tip_zoom"].fill(tracks->tip(i)); + histos["track_zip"].fill(tracks->zip(i)); + histos["track_zip_zoom"].fill(tracks->zip(i)); + histos["track_quality"].fill(tracks->quality(i)); + } + } + + histos["track_n"].fill(nTracks); + } + + { + auto const& vertices = iEvent.get(vertexToken_); + + histos["vertex_n"].fill(vertices->nvFinal); + for (uint32_t i = 0; i < vertices->nvFinal; ++i) { + histos["vertex_z"].fill(vertices->zv[i]); + histos["vertex_chi2"].fill(vertices->chi2[i]); + histos["vertex_ndof"].fill(vertices->ndof[i]); + histos["vertex_pt2"].fill(vertices->ptv2[i]); + } + } +} + +void HistoValidator::endJob() { + std::ofstream out("histograms_cuda.txt"); + for (auto const& elem : histos) { + out << elem.first << " " << elem.second << "\n"; + } +} + +DEFINE_FWK_MODULE(HistoValidator); diff --git a/src/cudacompat/plugin-Validation/SimpleAtomicHisto.h b/src/cudacompat/plugin-Validation/SimpleAtomicHisto.h new file mode 100644 index 000000000..5d05f3272 --- /dev/null +++ b/src/cudacompat/plugin-Validation/SimpleAtomicHisto.h @@ -0,0 +1,60 @@ +#ifndef SimpleAtomicHisto_h +#define SimpleAtomicHisto_h + +#include +#include +#include +#include +#include + +class SimpleAtomicHisto { +public: + SimpleAtomicHisto() = default; + explicit SimpleAtomicHisto(int nbins, float min, float max) : data_(nbins + 2), min_(min), max_(max) {} + + // dirty + SimpleAtomicHisto(SimpleAtomicHisto&& o) : data_(o.data_.size()), min_(o.min_), max_(o.max_) {} + SimpleAtomicHisto(SimpleAtomicHisto const& o) : data_(o.data_.size()), min_(o.min_), max_(o.max_) {} + + // thread safe + void fill(float value) { + int i; + if (value < min_) { + i = 0; + } else if (value >= max_) { + i = data_.size() - 1; + } else { + i = (value - min_) / (max_ - min_) * (data_.size() - 2); + // handle rounding near maximum + if (static_cast(i) == data_.size() - 2) { + i = data_.size() - 3; + } + if (not(i >= 0 and 
static_cast(i) < data_.size() - 2)) { + throw std::runtime_error("SimpleAtomicHisto::fill(" + std::to_string(value) + "): i " + std::to_string(i) + + " min " + std::to_string(min_) + " max " + std::to_string(max_) + " nbins " + + std::to_string(data_.size() - 2)); + } + ++i; + } + assert(i >= 0 and static_cast(i) < data_.size()); + data_[i] += 1; + } + + void dump(std::ostream& os) const { + os << data_.size() << " " << min_ << " " << max_; + for (auto const& item : data_) { + os << " " << item; + } + }; + +private: + std::vector> data_; + float min_, max_; +}; + +inline std::ostream& operator<<(std::ostream& os, SimpleAtomicHisto const& h) { + h.dump(os); + return os; +} + +#endif diff --git a/src/cudacompat/plugins.txt b/src/cudacompat/plugins.txt new file mode 100644 index 000000000..52dfe3102 --- /dev/null +++ b/src/cudacompat/plugins.txt @@ -0,0 +1,14 @@ +BeamSpotESProducer pluginBeamSpotProducer.so +BeamSpotToCUDA pluginBeamSpotProducer.so +CAHitNtupletCUDA pluginPixelTriplets.so +CountValidator pluginValidation.so +HistoValidator pluginValidation.so +SiPixelFedCablingMapGPUWrapperESProducer pluginSiPixelClusterizer.so +SiPixelGainCalibrationForHLTGPUESProducer pluginSiPixelClusterizer.so +SiPixelRawToClusterCUDA pluginSiPixelClusterizer.so +SiPixelDigisSoAFromCUDA pluginSiPixelRawToDigi.so +PixelCPEFastESProducer pluginSiPixelRecHits.so +PixelTrackSoAFromCUDA pluginPixelTrackFitting.so +PixelVertexProducerCUDA pluginPixelVertexFinding.so +PixelVertexSoAFromCUDA pluginPixelVertexFinding.so +SiPixelRecHitCUDA pluginSiPixelRecHits.so diff --git a/src/cudacompat/test/AtomicPairCounter_t.cu b/src/cudacompat/test/AtomicPairCounter_t.cu new file mode 100644 index 000000000..8e737e9cb --- /dev/null +++ b/src/cudacompat/test/AtomicPairCounter_t.cu @@ -0,0 +1,67 @@ +#include + +#include + +#include "CUDACore/AtomicPairCounter.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/cuda_assert.h" + +__global__ void update(cms::cuda::AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto m = i % 11; + m = m % 6 + 1; // max 6, no 0 + auto c = dc->add(m); + assert(c.m < n); + ind[c.m] = c.n; + for (int j = c.n; j < c.n + m; ++j) + cont[j] = i; +}; + +__global__ void finalize(cms::cuda::AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { + assert(dc->get().m == n); + ind[n] = dc->get().n; +} + +__global__ void verify(cms::cuda::AtomicPairCounter const *dc, uint32_t const *ind, uint32_t const *cont, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + assert(0 == ind[0]); + assert(dc->get().m == n); + assert(ind[n] == dc->get().n); + auto ib = ind[i]; + auto ie = ind[i + 1]; + auto k = cont[ib++]; + assert(k < n); + for (; ib < ie; ++ib) + assert(cont[ib] == k); +} + +int main() { + cms::cuda::AtomicPairCounter *dc_d; + cudaCheck(cudaMalloc(&dc_d, sizeof(cms::cuda::AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(cms::cuda::AtomicPairCounter))); + + std::cout << "size " << sizeof(cms::cuda::AtomicPairCounter) << std::endl; + + constexpr uint32_t N = 20000; + constexpr uint32_t M = N * 6; + uint32_t *n_d, *m_d; + cudaCheck(cudaMalloc(&n_d, N * sizeof(int))); + // cudaMemset(n_d, 0, N*sizeof(int)); + cudaCheck(cudaMalloc(&m_d, M * sizeof(int))); + + update<<<2000, 512>>>(dc_d, n_d, m_d, 10000); + finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000); + verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000); + + cms::cuda::AtomicPairCounter dc; + 
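The update/finalize/verify kernels above rely on AtomicPairCounter::add(m) handing out, in one atomic operation, both the next entry index and a block of m slots in the content array. A host-side analogue of that contract (illustrative, assuming the two counters are packed into a single 64-bit word; not the actual cms::cuda implementation):

    #include <atomic>
    #include <cstdint>

    struct PairCounter {
      struct Counters { uint32_t n, m; };  // n: words of content used, m: number of entries
      std::atomic<uint64_t> packed{0};     // low 32 bits hold n, high 32 bits hold m

      // Reserve one entry slot and nWords of content; return the counters *before*
      // the increment so the caller knows where to write (fine while n stays < 2^32).
      Counters add(uint32_t nWords) {
        uint64_t old = packed.fetch_add((uint64_t(1) << 32) | nWords);
        return {uint32_t(old), uint32_t(old >> 32)};
      }
      Counters get() const {
        uint64_t v = packed.load();
        return {uint32_t(v), uint32_t(v >> 32)};
      }
    };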
cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(cms::cuda::AtomicPairCounter), cudaMemcpyDeviceToHost)); + + std::cout << dc.get().n << ' ' << dc.get().m << std::endl; + + return 0; +} diff --git a/src/cudacompat/test/HistoContainer_t.cu b/src/cudacompat/test/HistoContainer_t.cu new file mode 100644 index 000000000..15aafe0d3 --- /dev/null +++ b/src/cudacompat/test/HistoContainer_t.cu @@ -0,0 +1,155 @@ +#include +#include +#include +#include +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/requireDevices.h" + +using namespace cms::cuda; + +template +void go() { + std::mt19937 eng; + std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + + constexpr int N = 12000; + T v[N]; + auto v_d = make_device_unique(N, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + constexpr uint32_t nParts = 10; + constexpr uint32_t partSize = N / nParts; + uint32_t offsets[nParts + 1]; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::capacity() << ' ' << offsetof(Hist, bins) - offsetof(Hist, off) << ' ' + << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + + Hist h; + auto h_d = make_device_unique(1, nullptr); + + auto off_d = make_device_unique(nParts + 1, nullptr); + + for (int it = 0; it < 5; ++it) { + offsets[0] = 0; + for (uint32_t j = 1; j < nParts + 1; ++j) { + offsets[j] = offsets[j - 1] + partSize - 3 * j; + assert(offsets[j] <= N); + } + + if (it == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); + + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + + if (it == 2) { // big bin + for (long long j = 1000; j < 2000; j++) + v[j] = sizeof(T) == 1 ? 22 : 3456; + } + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); + assert(0 == h.off[0]); + assert(offsets[10] == h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + assert(t1 < N); + assert(t2 < N); + if (T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + + // make sure it spans 3 bins... 
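fillManyFromVector, exercised above, follows the usual count / prefix-scan / fill protocol. A minimal CPU analogue (illustrative; the real HistoContainer keeps everything in fixed-size arrays and folds the fill cursor into the offsets with atomic updates):

    #include <cstdint>
    #include <vector>

    struct TinyHisto {
      std::vector<uint32_t> off;      // off[b] = first slot of bin b; off[nbins] = total
      std::vector<uint32_t> cursor;   // running fill position per bin
      std::vector<uint32_t> content;  // stored indices, grouped by bin

      explicit TinyHisto(uint32_t nbins) : off(nbins + 1, 0), cursor(nbins, 0) {}

      void count(uint32_t b) { ++off[b + 1]; }
      void finalize() {  // turn per-bin counts into cumulative offsets
        for (size_t b = 1; b < off.size(); ++b)
          off[b] += off[b - 1];
        content.resize(off.back());
        for (size_t b = 0; b < cursor.size(); ++b)
          cursor[b] = off[b];
      }
      void fill(uint32_t b, uint32_t index) { content[cursor[b]++] = index; }
      uint32_t const* begin(uint32_t b) const { return content.data() + off[b]; }
      uint32_t const* end(uint32_t b) const { return content.data() + off[b + 1]; }
    };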
+ auto window = T(1300); + + for (uint32_t j = 0; j < nParts; ++j) { + auto off = Hist::histOff(j); + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + auto ii = i + off; + if (0 == h.size(ii)) + continue; + auto k = *h.begin(ii); + if (j % 2) + k = *(h.begin(ii) + (h.end(ii) - h.begin(ii)) / 2); + auto bk = h.bin(v[k]); + assert(bk == i); + assert(k < offsets[j + 1]); + auto kl = h.bin(v[k] - window); + auto kh = h.bin(v[k] + window); + assert(kl != i); + assert(kh != i); + // std::cout << kl << ' ' << kh << std::endl; + + auto me = v[k]; + auto tot = 0; + auto nm = 0; + bool l = true; + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nm += h.size(kk + off); + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) { + if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) { + } else { + ++tot; + } + } + if (kk == i) { + l = false; + continue; + } + if (l) + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, k, (*p)); + else + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, (*p), k); + } + if (!(tot >= nm)) { + std::cout << "too bad " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + } + if (l) + std::cout << "what? " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + assert(!l); + } + } + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + + return 0; +} diff --git a/src/cudacompat/test/HistoContainer_t_cpu.cc b/src/cudacompat/test/HistoContainer_t_cpu.cc new file mode 100644 index 000000000..ad1121ef1 --- /dev/null +++ b/src/cudacompat/test/HistoContainer_t_cpu.cc @@ -0,0 +1,146 @@ +#include +#include +#include +#include +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/requireDevices.h" + +using namespace cms::cuda; + +template +void go() { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + + constexpr int N = 12000; + T v[N]; + + using Hist = HistoContainer; + using Hist4 = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' + << Hist4::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + for (auto nh = 0; nh < 4; ++nh) + std::cout << "bins " << int(Hist4::bin(0)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin)) + Hist4::histOff(nh) + << ' ' << int(Hist::bin(rmax)) + Hist4::histOff(nh) << std::endl; + + Hist h; + Hist4 h4; + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + h.zero(); + h4.zero(); + assert(h.size() == 0); + assert(h4.size() == 0); + for (long long j = 0; j < N; j++) { + h.count(v[j]); + if (j < 2000) + h4.count(v[j], 2); + else + h4.count(v[j], j % 4); + } + assert(h.size() 
== 0); + assert(h4.size() == 0); + h.finalize(); + h4.finalize(); + assert(h.size() == N); + assert(h4.size() == N); + for (long long j = 0; j < N; j++) { + h.fill(v[j], j); + if (j < 2000) + h4.fill(v[j], j, 2); + else + h4.fill(v[j], j, j % 4); + } + assert(h.off[0] == 0); + assert(h4.off[0] == 0); + assert(h.size() == N); + assert(h4.size() == N); + + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { + assert((int32_t)t1 < N); + assert((int32_t)t2 < N); + if (i != j && T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + if (0 == h.size(i)) + continue; + auto k = *h.begin(i); + assert(k < N); + auto kl = NBINS != 128 ? h.bin(std::max(rmin, v[k] - DELTA)) : h.bin(v[k] - T(DELTA)); + auto kh = NBINS != 128 ? h.bin(std::min(rmax, v[k] + DELTA)) : h.bin(v[k] + T(DELTA)); + if (NBINS == 128) { + assert(kl != i); + assert(kh != i); + } + if (NBINS != 128) { + assert(kl <= i); + assert(kh >= i); + } + // std::cout << kl << ' ' << kh << std::endl; + for (auto j = h.begin(kl); j < h.end(kl); ++j) + verify(i, kl, k, k, (*j)); + for (auto j = h.begin(kh); j < h.end(kh); ++j) + verify(i, kh, k, (*j), k); + } + } + + for (long long j = 0; j < N; j++) { + auto b0 = h.bin(v[j]); + int w = 0; + int tot = 0; + auto ftest = [&](int k) { + assert(k >= 0 && k < N); + tot++; + }; + forEachInBins(h, v[j], w, ftest); + int rtot = h.end(b0) - h.begin(b0); + assert(tot == rtot); + w = 1; + tot = 0; + forEachInBins(h, v[j], w, ftest); + int bp = b0 + 1; + int bm = b0 - 1; + if (bp < int(h.nbins())) + rtot += h.end(bp) - h.begin(bp); + if (bm >= 0) + rtot += h.end(bm) - h.begin(bm); + assert(tot == rtot); + w = 2; + tot = 0; + forEachInBins(h, v[j], w, ftest); + bp++; + bm--; + if (bp < int(h.nbins())) + rtot += h.end(bp) - h.begin(bp); + if (bm >= 0) + rtot += h.end(bm) - h.begin(bm); + assert(tot == rtot); + } +} + +int main() { + go(); + go(); + go(); + + return 0; +} diff --git a/src/cudacompat/test/OneHistoContainer_t.cu b/src/cudacompat/test/OneHistoContainer_t.cu new file mode 100644 index 000000000..960f77eca --- /dev/null +++ b/src/cudacompat/test/OneHistoContainer_t.cu @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include + +#include "CUDACore/HistoContainer.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/launch.h" +#include "CUDACore/requireDevices.h" + +using namespace cms::cuda; + +template +__global__ void mykernel(T const* __restrict__ v, uint32_t N) { + assert(v); + assert(N == 12000); + + if (threadIdx.x == 0) + printf("start kernel for %d data\n", N); + + using Hist = HistoContainer; + + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[32]; + + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + for (auto j = threadIdx.x; j < N; j += blockDim.x) + hist.count(v[j]); + __syncthreads(); + + assert(0 == hist.size()); + __syncthreads(); + + hist.finalize(ws); + __syncthreads(); + + assert(N == hist.size()); + for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) + assert(hist.off[j] <= hist.off[j + 1]); + __syncthreads(); + + if (threadIdx.x < 32) + ws[threadIdx.x] = 0; // used by prefix scan... 
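The 32-entry shared workspace zeroed above is the per-warp staging area used by hist.finalize(ws) for its block-wide prefix scan. The warp-level building block looks roughly like this (hypothetical helper, not the actual cms::cuda implementation):

    #include <cstdint>

    // Inclusive scan within one warp: after the loop, lane i holds the sum of the
    // values originally held by lanes 0..i. Per-warp totals are then written to the
    // shared 32-entry workspace, scanned by the first warp, and added back.
    __device__ uint32_t warpInclusiveScan(uint32_t x) {
      const unsigned fullMask = 0xffffffffu;
      const int lane = threadIdx.x & 31;
      for (int offset = 1; offset < 32; offset <<= 1) {
        uint32_t y = __shfl_up_sync(fullMask, x, offset);
        if (lane >= offset)
          x += y;
      }
      return x;
    }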
+ __syncthreads(); + + for (auto j = threadIdx.x; j < N; j += blockDim.x) + hist.fill(v[j], j); + __syncthreads(); + assert(0 == hist.off[0]); + assert(N == hist.size()); + + for (auto j = threadIdx.x; j < hist.size() - 1; j += blockDim.x) { + auto p = hist.begin() + j; + assert((*p) < N); + auto k1 = Hist::bin(v[*p]); + auto k2 = Hist::bin(v[*(p + 1)]); + assert(k2 >= k1); + } + + for (auto i = threadIdx.x; i < hist.size(); i += blockDim.x) { + auto p = hist.begin() + i; + auto j = *p; + auto b0 = Hist::bin(v[j]); + int tot = 0; + auto ftest = [&](int k) { + assert(k >= 0 && k < N); + ++tot; + }; + forEachInWindow(hist, v[j], v[j], ftest); + int rtot = hist.size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + forEachInWindow(hist, vm, vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); + assert(tot == rtot); + } +} + +template +void go() { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + + constexpr int N = 12000; + T v[N]; + + auto v_d = make_device_unique(N, nullptr); + assert(v_d.get()); + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' + << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + + assert(v_d.get()); + assert(v); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + assert(v_d.get()); + launch(mykernel, {1, 256}, v_d.get(), N); + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + go(); + + return 0; +} diff --git a/src/cudacompat/test/OneToManyAssoc_cpu_t.cc b/src/cudacompat/test/OneToManyAssoc_cpu_t.cc new file mode 100644 index 000000000..3d452e851 --- /dev/null +++ b/src/cudacompat/test/OneToManyAssoc_cpu_t.cc @@ -0,0 +1 @@ +#include "OneToManyAssoc_t.h" diff --git a/src/cudacompat/test/OneToManyAssoc_t.cu b/src/cudacompat/test/OneToManyAssoc_t.cu new file mode 100644 index 000000000..3d452e851 --- /dev/null +++ b/src/cudacompat/test/OneToManyAssoc_t.cu @@ -0,0 +1 @@ +#include "OneToManyAssoc_t.h" diff --git a/src/cudacompat/test/OneToManyAssoc_t.h b/src/cudacompat/test/OneToManyAssoc_t.h new file mode 100644 index 000000000..69c3ade3d --- /dev/null +++ b/src/cudacompat/test/OneToManyAssoc_t.h @@ -0,0 +1,304 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/requireDevices.h" +#include "CUDACore/currentDevice.h" +#endif + +#include "CUDACore/HistoContainer.h" +using cms::cuda::AtomicPairCounter; + +constexpr uint32_t MaxElem = 64000; +constexpr uint32_t MaxTk = 8000; +constexpr uint32_t MaxAssocs = 4 * MaxTk; + +using Assoc = cms::cuda::OneToManyAssoc; +using SmallAssoc = cms::cuda::OneToManyAssoc; +using Multiplicity = 
cms::cuda::OneToManyAssoc; +using TK = std::array; + +__global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) { + __shared__ Multiplicity::CountersOnly local; + if (threadIdx.x == 0) + local.zero(); + __syncthreads(); + local.countDirect(2 + i % 4); + __syncthreads(); + if (threadIdx.x == 0) + assoc->add(local); + } +} + +__global__ void countMulti(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) + assoc->countDirect(2 + i % 4); +} + +__global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) { + auto first = blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < Multiplicity::totbins(); i += gridDim.x * blockDim.x) + assert(m1->off[i] == m2->off[i]); +} + +__global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->countDirect(tk[k][j]); + } +} + +__global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->fillDirect(tk[k][j], k); + } +} + +__global__ void verify(Assoc* __restrict__ assoc) { assert(assoc->size() < Assoc::capacity()); } + +template +__global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int k = first; k < n; k += gridDim.x * blockDim.x) { + auto m = tk[k][3] < MaxElem ? 
4 : 3; + assoc->bulkFill(*apc, &tk[k][0], m); + } +} + +template +__global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) { + if (apc->get().m >= Assoc::nbins()) + printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); + assert(assoc->size() < Assoc::capacity()); +} + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); + auto current_device = cms::cuda::currentDevice(); +#else + // make sure cuda emulation is working + std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl; + std::cout << "cuda y's " << threadIdx.y << ' ' << blockIdx.y << ' ' << blockDim.y << ' ' << gridDim.y << std::endl; + std::cout << "cuda z's " << threadIdx.z << ' ' << blockIdx.z << ' ' << blockDim.z << ' ' << gridDim.z << std::endl; + assert(threadIdx.x == 0); + assert(threadIdx.y == 0); + assert(threadIdx.z == 0); + assert(blockIdx.x == 0); + assert(blockIdx.y == 0); + assert(blockIdx.z == 0); + assert(blockDim.x == 1); + assert(blockDim.y == 1); + assert(blockDim.z == 1); + assert(gridDim.x == 1); + assert(gridDim.y == 1); + assert(gridDim.z == 1); +#endif + + std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl; + std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' ' + << SmallAssoc::capacity() << std::endl; + + std::mt19937 eng; + + std::geometric_distribution rdm(0.8); + + constexpr uint32_t N = 4000; + + std::vector> tr(N); + + // fill with "index" to element + long long ave = 0; + int imax = 0; + auto n = 0U; + auto z = 0U; + auto nz = 0U; + for (auto i = 0U; i < 4U; ++i) { + auto j = 0U; + while (j < N && n < MaxElem) { + if (z == 11) { + ++n; + z = 0; + ++nz; + continue; + } // a bit of not assoc + auto x = rdm(eng); + auto k = std::min(j + x + 1, N); + if (i == 3 && z == 3) { // some triplets time to time + for (; j < k; ++j) + tr[j][i] = MaxElem + 1; + } else { + ave += x + 1; + imax = std::max(imax, x); + for (; j < k; ++j) + tr[j][i] = n; + ++n; + } + ++z; + } + assert(n <= MaxElem); + assert(j <= N); + } + std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + +#ifdef __CUDACC__ + auto v_d = cms::cuda::make_device_unique[]>(N, nullptr); + assert(v_d.get()); + auto a_d = cms::cuda::make_device_unique(1, nullptr); + auto sa_d = cms::cuda::make_device_unique(1, nullptr); + cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); +#else + auto a_d = std::make_unique(); + auto sa_d = std::make_unique(); + auto v_d = tr.data(); +#endif + + launchZero(a_d.get(), 0); + +#ifdef __CUDACC__ + auto nThreads = 256; + auto nBlocks = (4 * N + nThreads - 1) / nThreads; + + count<<>>(v_d.get(), a_d.get(), N); + + launchFinalize(a_d.get(), 0); + verify<<<1, 1>>>(a_d.get()); + fill<<>>(v_d.get(), a_d.get(), N); +#else + count(v_d, a_d.get(), N); + launchFinalize(a_d.get()); + verify(a_d.get()); + fill(v_d, a_d.get(), N); +#endif + + Assoc la; + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); +#else + memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier +#endif + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + z = 0; + for (auto i = 0U; i < n; ++i) { + auto x = la.size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la.size(n)); + std::cout << "found with " << n << " elements " << 
double(ave) / n << ' ' << imax << ' ' << z << std::endl; + + // now the inverse map (actually this is the direct....) + AtomicPairCounter* dc_d; + AtomicPairCounter dc(0); + +#ifdef __CUDACC__ + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + nBlocks = (N + nThreads - 1) / nThreads; + fillBulk<<>>(dc_d, v_d.get(), a_d.get(), N); + finalizeBulk<<>>(dc_d, a_d.get()); + verifyBulk<<<1, 1>>>(a_d.get(), dc_d); + + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); + + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); + finalizeBulk<<>>(dc_d, sa_d.get()); + verifyBulk<<<1, 1>>>(sa_d.get(), dc_d); + +#else + dc_d = &dc; + fillBulk(dc_d, v_d, a_d.get(), N); + finalizeBulk(dc_d, a_d.get()); + verifyBulk(a_d.get(), dc_d); + memcpy(&la, a_d.get(), sizeof(Assoc)); + + AtomicPairCounter sdc(0); + fillBulk(&sdc, v_d, sa_d.get(), N); + finalizeBulk(&sdc, sa_d.get()); + verifyBulk(sa_d.get(), &sdc); + +#endif + + std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl; + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + for (auto i = 0U; i < N; ++i) { + auto x = la.size(i); + if (!(x == 4 || x == 3)) + std::cout << i << ' ' << x << std::endl; + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la.size(N)); + std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; + + // here verify use of block local counters +#ifdef __CUDACC__ + auto m1_d = cms::cuda::make_device_unique(1, nullptr); + auto m2_d = cms::cuda::make_device_unique(1, nullptr); +#else + auto m1_d = std::make_unique(); + auto m2_d = std::make_unique(); +#endif + launchZero(m1_d.get(), 0); + launchZero(m2_d.get(), 0); + +#ifdef __CUDACC__ + nBlocks = (4 * N + nThreads - 1) / nThreads; + countMulti<<>>(v_d.get(), m1_d.get(), N); + countMultiLocal<<>>(v_d.get(), m2_d.get(), N); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + launchFinalize(m1_d.get(), 0); + launchFinalize(m2_d.get(), 0); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); +#else + countMulti(v_d, m1_d.get(), N); + countMultiLocal(v_d, m2_d.get(), N); + verifyMulti(m1_d.get(), m2_d.get()); + + launchFinalize(m1_d.get()); + launchFinalize(m2_d.get()); + verifyMulti(m1_d.get(), m2_d.get()); +#endif + return 0; +} diff --git a/src/cudacompat/test/TrackingRecHit2DCUDA_t.cu b/src/cudacompat/test/TrackingRecHit2DCUDA_t.cu new file mode 100644 index 000000000..5f25e4c1a --- /dev/null +++ b/src/cudacompat/test/TrackingRecHit2DCUDA_t.cu @@ -0,0 +1,56 @@ +#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDACore/copyAsync.h" +#include "CUDACore/cudaCheck.h" + +namespace testTrackingRecHit2D { + + __global__ void fill(TrackingRecHit2DSOAView* phits) { + assert(phits); + auto& hits = *phits; + assert(hits.nHits() == 200); + + int i = threadIdx.x; + if (i > 200) + return; + } + + __global__ void verify(TrackingRecHit2DSOAView const* phits) { + assert(phits); + auto const& hits = *phits; + assert(hits.nHits() == 200); + + int i = threadIdx.x; + if (i > 200) + return; + } + + void runKernels(TrackingRecHit2DSOAView* hits) { + assert(hits); + fill<<<1, 1024>>>(hits); + verify<<<1, 1024>>>(hits); + } + +} // namespace 
testTrackingRecHit2D + +namespace testTrackingRecHit2D { + + void runKernels(TrackingRecHit2DSOAView* hits); + +} + +int main() { + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // inner scope to deallocate memory before destroying the stream + { + auto nHits = 200; + TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream); + + testTrackingRecHit2D::runKernels(tkhit.view()); + } + + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/src/cudacompat/test/TrajectoryStateSOA_cpu_t.cc b/src/cudacompat/test/TrajectoryStateSOA_cpu_t.cc new file mode 100644 index 000000000..d6ff539a6 --- /dev/null +++ b/src/cudacompat/test/TrajectoryStateSOA_cpu_t.cc @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git a/src/cudacompat/test/TrajectoryStateSOA_t.cu b/src/cudacompat/test/TrajectoryStateSOA_t.cu new file mode 100644 index 000000000..d6ff539a6 --- /dev/null +++ b/src/cudacompat/test/TrajectoryStateSOA_t.cu @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git a/src/cudacompat/test/TrajectoryStateSOA_t.h b/src/cudacompat/test/TrajectoryStateSOA_t.h new file mode 100644 index 000000000..2fcf9fc09 --- /dev/null +++ b/src/cudacompat/test/TrajectoryStateSOA_t.h @@ -0,0 +1,75 @@ +#include "CUDADataFormats/TrajectoryStateSoA.h" + +using Vector5d = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; + +__host__ __device__ Matrix5d loadCov(Vector5d const& e) { + Matrix5d cov; + for (int i = 0; i < 5; ++i) + cov(i, i) = e(i) * e(i); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j)); // this makes the matrix pos defined + cov(i, j) = (i + j) % 2 ? -0.4 * v : 0.1 * v; + cov(j, i) = cov(i, j); + } + } + return cov; +} + +using TS = TrajectoryStateSoA<128>; + +__global__ void testTSSoA(TS* pts, int n) { + assert(n <= 128); + + Vector5d par0; + par0 << 0.2, 0.1, 3.5, 0.8, 0.1; + Vector5d e0; + e0 << 0.01, 0.01, 0.035, -0.03, -0.01; + auto cov0 = loadCov(e0); + + TS& ts = *pts; + + int first = threadIdx.x + blockIdx.x * blockDim.x; + + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + ts.copyFromDense(par0, cov0, i); + Vector5d par1; + Matrix5d cov1; + ts.copyToDense(par1, cov1, i); + Vector5d delV = par1 - par0; + Matrix5d delM = cov1 - cov0; + for (int j = 0; j < 5; ++j) { + assert(std::abs(delV(j)) < 1.e-5); + for (auto k = j; k < 5; ++k) { + assert(cov0(k, j) == cov0(j, k)); + assert(cov1(k, j) == cov1(j, k)); + assert(std::abs(delM(k, j)) < 1.e-5); + } + } + } +} + +#ifdef __CUDACC__ +#include "CUDACore/requireDevices.h" +#include "CUDACore/cudaCheck.h" +#endif + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); +#endif + + TS ts; + +#ifdef __CUDACC__ + TS* ts_d; + cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); + testTSSoA<<<1, 64>>>(ts_d, 128); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); + cudaCheck(cudaDeviceSynchronize()); +#else + testTSSoA(&ts, 128); +#endif +} diff --git a/src/cudacompat/test/VertexFinder_t.h b/src/cudacompat/test/VertexFinder_t.h new file mode 100644 index 000000000..53f26d2de --- /dev/null +++ b/src/cudacompat/test/VertexFinder_t.h @@ -0,0 +1,350 @@ +#include +#include +#include +#include +#include + +#include "CUDACore/cudaCheck.h" +#include "CUDACore/requireDevices.h" +#include "CUDACore/launch.h" +#ifdef USE_DBSCAN +#include "plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h" +#define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN +#elif USE_ITERATIVE +#include 
"plugin-PixelVertexFinding/gpuClusterTracksIterative.h" +#define CLUSTERIZE gpuVertexFinder::clusterTracksIterative +#else +#include "plugin-PixelVertexFinding/gpuClusterTracksByDensity.h" +#define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel +#endif +#include "plugin-PixelVertexFinding/gpuFitVertices.h" +#include "plugin-PixelVertexFinding/gpuSortByPt2.h" +#include "plugin-PixelVertexFinding/gpuSplitVertices.h" + +#ifdef ONE_KERNEL +#ifdef __CUDACC__ +__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, +) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, 50.); + __syncthreads(); + splitVertices(pdata, pws, 9.f); + __syncthreads(); + fitVertices(pdata, pws, 5000.); + __syncthreads(); + sortByPt2(pdata, pws); +} +#endif +#endif + +struct Event { + std::vector zvert; + std::vector itrack; + std::vector ztrack; + std::vector eztrack; + std::vector pttrack; + std::vector ivert; +}; + +struct ClusterGenerator { + explicit ClusterGenerator(float nvert, float ntrack) + : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {} + + void operator()(Event& ev) { + int nclus = clusGen(reng); + ev.zvert.resize(nclus); + ev.itrack.resize(nclus); + for (auto& z : ev.zvert) { + z = 3.5f * gauss(reng); + } + + ev.ztrack.clear(); + ev.eztrack.clear(); + ev.ivert.clear(); + for (int iv = 0; iv < nclus; ++iv) { + auto nt = trackGen(reng); + ev.itrack[nclus] = nt; + for (int it = 0; it < nt; ++it) { + auto err = errgen(reng); // reality is not flat.... + ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng)); + ev.eztrack.push_back(err * err); + ev.ivert.push_back(iv); + ev.pttrack.push_back((iv == 5 ? 
1.f : 0.5f) + ptGen(reng)); + ev.pttrack.back() *= ev.pttrack.back(); + } + } + // add noise + auto nt = 2 * trackGen(reng); + for (int it = 0; it < nt; ++it) { + auto err = 0.03f; + ev.ztrack.push_back(rgen(reng)); + ev.eztrack.push_back(err * err); + ev.ivert.push_back(9999); + ev.pttrack.push_back(0.5f + ptGen(reng)); + ev.pttrack.back() *= ev.pttrack.back(); + } + } + + std::mt19937 reng; + std::uniform_real_distribution rgen; + std::uniform_real_distribution errgen; + std::poisson_distribution clusGen; + std::poisson_distribution trackGen; + std::normal_distribution gauss; + std::exponential_distribution ptGen; +}; + +// a macro SORRY +#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M)) +#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M)) + +__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) { + auto const& __restrict__ data = *pdata; + auto const& __restrict__ ws = *pws; + printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); +} + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); + + auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(1, nullptr); +#else + auto onGPU_d = std::make_unique(); + auto ws_d = std::make_unique(); +#endif + + Event ev; + + float eps = 0.1f; + std::array par{{eps, 0.01f, 9.0f}}; + for (int nav = 30; nav < 80; nav += 20) { + ClusterGenerator gen(nav, 10); + + for (int i = 8; i < 20; ++i) { + auto kk = i / 4; // M param + + gen(ev); + +#ifdef __CUDACC__ + init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); +#else + onGPU_d->init(); + ws_d->init(); +#endif + + std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; + auto nt = ev.ztrack.size(); +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); +#else + ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); + ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); + ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); +#endif + + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + + if ((i % 4) == 0) + par = {{eps, 0.02f, 12.0f}}; + if ((i % 4) == 1) + par = {{eps, 0.02f, 9.0f}}; + if ((i % 4) == 2) + par = {{eps, 0.01f, 9.0f}}; + if ((i % 4) == 3) + par = {{0.7f * eps, 0.01f, 9.0f}}; + + uint32_t nv = 0; +#ifdef __CUDACC__ + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); + +#ifdef ONE_KERNEL + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); +#else + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); +#endif + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); + + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaGetLastError()); + 
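+    // LOC_ONGPU/LOC_WS (defined above) compute the raw device address of a single
+    // struct member via offsetof, so the cudaMemcpy below moves just nvFinal back
+    // to the host instead of transferring the whole ZVertices object. Roughly
+    // equivalent to a (hypothetical) helper along these lines:
+    //   template <typename T>
+    //   void copyMemberD2H(void* devBase, size_t offset, T& dst) {
+    //     cudaCheck(cudaMemcpy(&dst, (char*)devBase + offset, sizeof(T), cudaMemcpyDeviceToHost));
+    //   }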
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + +#else + print(onGPU_d.get(), ws_d.get()); + CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + print(onGPU_d.get(), ws_d.get()); + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; +#endif + + if (nv == 0) { + std::cout << "NO VERTICES???" << std::endl; + continue; + } + + float* zv = nullptr; + float* wv = nullptr; + float* ptv2 = nullptr; + int32_t* nn = nullptr; + uint16_t* ind = nullptr; + + // keep chi2 separated... + float chi2[2 * nv]; // make space for splitting... + +#ifdef __CUDACC__ + float hzv[2 * nv]; + float hwv[2 * nv]; + float hptv2[2 * nv]; + int32_t hnn[2 * nv]; + uint16_t hind[2 * nv]; + + zv = hzv; + wv = hwv; + ptv2 = hptv2; + nn = hnn; + ind = hind; +#else + zv = onGPU_d->zv; + wv = onGPU_d->wv; + ptv2 = onGPU_d->ptv2; + nn = onGPU_d->ndof; + ind = onGPU_d->sortInd; +#endif + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); +#else + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); +#endif + + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } + +#ifdef __CUDACC__ + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); +#else + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); +#endif + + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } + +#ifdef __CUDACC__ + // one vertex per block!!! + cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); + cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); +#else + gridDim.x = 1; + assert(blockIdx.x == 0); + splitVertices(onGPU_d.get(), ws_d.get(), 9.f); + resetGrid(); + nv = ws_d->nvIntermediate; +#endif + std::cout << "after split " << nv << std::endl; + +#ifdef __CUDACC__ + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); + cudaCheck(cudaGetLastError()); + + cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); +#else + fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); + sortByPt2(onGPU_d.get(), ws_d.get()); + nv = onGPU_d->nvFinal; + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); +#endif + + if (nv == 0) { + std::cout << "NO VERTICES???" 
<< std::endl; + continue; + } + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); +#endif + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } + + { + auto mx = std::minmax_element(wv, wv + nv); + std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl; + } + + { + auto mx = std::minmax_element(ptv2, ptv2 + nv); + std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl; + std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' ' + << ind[nv - 1] << std::endl; + } + + float dd[nv]; + for (auto kv = 0U; kv < nv; ++kv) { + auto zr = zv[kv]; + auto md = 500.0f; + for (auto zs : ev.ztrack) { + auto d = std::abs(zr - zs); + md = std::min(d, md); + } + dd[kv] = md; + } + if (i == 6) { + for (auto d : dd) + std::cout << d << ' '; + std::cout << std::endl; + } + auto mx = std::minmax_element(dd, dd + nv); + float rms = 0; + for (auto d : dd) + rms += d * d; + rms = std::sqrt(rms) / (nv - 1); + std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl; + + } // loop on events + } // lopp on ave vert + + return 0; +} diff --git a/src/cudacompat/test/cpuClustering_t.cc b/src/cudacompat/test/cpuClustering_t.cc new file mode 100644 index 000000000..19a3b8d01 --- /dev/null +++ b/src/cudacompat/test/cpuClustering_t.cc @@ -0,0 +1 @@ +#include "gpuClustering_t.h" diff --git a/src/cudacompat/test/cpuVertexFinder_t.cc b/src/cudacompat/test/cpuVertexFinder_t.cc new file mode 100644 index 000000000..a7906fe0d --- /dev/null +++ b/src/cudacompat/test/cpuVertexFinder_t.cc @@ -0,0 +1 @@ +#include "VertexFinder_t.h" diff --git a/src/cudacompat/test/cudastdAlgorithm_t.cu b/src/cudacompat/test/cudastdAlgorithm_t.cu new file mode 100644 index 000000000..4ecc93cf5 --- /dev/null +++ b/src/cudacompat/test/cudastdAlgorithm_t.cu @@ -0,0 +1,30 @@ +#include +#include + +#include "CUDACore/cudastdAlgorithm.h" +#include "CUDACore/requireDevices.h" +#include "CUDACore/launch.h" + +__global__ void testBinaryFind() { + int data[] = {1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6}; + + auto lower = cuda_std::lower_bound(data, data + 13, 4); + auto upper = cuda_std::upper_bound(data, data + 12, 4); + + assert(3 == upper - lower); + + // classic binary search, returning a value only if it is present + + constexpr int data2[] = {1, 2, 4, 6, 9, 10}; + + assert(data2 + 2 == cuda_std::binary_find(data2, data2 + 6, 4)); + assert(data2 + 6 == cuda_std::binary_find(data2, data2 + 6, 5)); +} + +void wrapper() { cms::cuda::launch(testBinaryFind, {32, 64}); } + +int main() { + cms::cudatest::requireDevices(); + + wrapper(); +} diff --git a/src/cudacompat/test/cudastdAlgorithm_t_cpu.cc b/src/cudacompat/test/cudastdAlgorithm_t_cpu.cc new file mode 100644 index 000000000..3caa034d6 --- /dev/null +++ 
b/src/cudacompat/test/cudastdAlgorithm_t_cpu.cc @@ -0,0 +1,34 @@ +#include "CUDACore/cudastdAlgorithm.h" +#include +#include +#include +#include + +void testBinaryFind() { + std::vector data = {1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6}; + + auto lower = cuda_std::lower_bound(data.begin(), data.end(), 4); + auto upper = cuda_std::upper_bound(data.begin(), data.end(), 4); + + std::copy(lower, upper, std::ostream_iterator(std::cout, " ")); + + std::cout << '\n'; + + // classic binary search, returning a value only if it is present + + data = {1, 2, 4, 6, 9, 10}; + + auto test = [&](auto v) { + auto it = cuda_std::binary_find(data.cbegin(), data.cend(), v); + + if (it != data.cend()) + std::cout << *it << " found at index " << std::distance(data.cbegin(), it) << std::endl; + else + std::cout << v << " non found" << std::endl; + }; + + test(4); + test(5); +} + +int main() { testBinaryFind(); } diff --git a/src/cudacompat/test/eigenSoA_t.cu b/src/cudacompat/test/eigenSoA_t.cu new file mode 100644 index 000000000..c5200e2a8 --- /dev/null +++ b/src/cudacompat/test/eigenSoA_t.cu @@ -0,0 +1 @@ +#include "eigenSoA_t.h" diff --git a/src/cudacompat/test/eigenSoA_t.h b/src/cudacompat/test/eigenSoA_t.h new file mode 100644 index 000000000..5409bce10 --- /dev/null +++ b/src/cudacompat/test/eigenSoA_t.h @@ -0,0 +1,101 @@ +#include + +#include "CUDACore/eigenSoA.h" + +template +struct MySoA { + // we can find a way to avoid this copy/paste??? + static constexpr int32_t stride() { return S; } + + eigenSoA::ScalarSoA a; + eigenSoA::ScalarSoA b; +}; + +using V = MySoA<128>; + +__global__ void testBasicSoA(float* p) { + using namespace eigenSoA; + + assert(!isPowerOf2(0)); + assert(isPowerOf2(1)); + assert(isPowerOf2(1024)); + assert(!isPowerOf2(1026)); + + using M3 = Eigen::Matrix; + + __shared__ eigenSoA::MatrixSoA m; + + int first = threadIdx.x + blockIdx.x * blockDim.x; + if (0 == first) + printf("before %f\n", p[0]); + + // a silly game... 
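+  // The "game": each of the 64 shared-memory 3x3 matrices gets p[i], p[i + 64]
+  // and p[i + 128] on its diagonal, is inverted in place, and its diagonal is
+  // written back in reverse order (m[63 - i]). Since main() fills p with values
+  // in (0.01, 0.99), every element written back is > 1, which is exactly what
+  // main() asserts after the copy back to the host.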
+ int n = 64; + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + m[i].setZero(); + m[i](0, 0) = p[i]; + m[i](1, 1) = p[i + 64]; + m[i](2, 2) = p[i + 64 * 2]; + } + __syncthreads(); // not needed + + for (int i = first; i < n; i += blockDim.x * gridDim.x) + m[i] = m[i].inverse().eval(); + __syncthreads(); + + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + p[i] = m[63 - i](0, 0); + p[i + 64] = m[63 - i](1, 1); + p[i + 64 * 2] = m[63 - i](2, 2); + } + + if (0 == first) + printf("after %f\n", p[0]); +} + +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include "CUDACore/requireDevices.h" +#include "CUDACore/cudaCheck.h" +#endif + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); +#endif + + float p[1024]; + + std::uniform_real_distribution rgen(0.01, 0.99); + std::mt19937 eng; + + for (auto& r : p) + r = rgen(eng); + for (int i = 0, n = 64 * 3; i < n; ++i) + assert(p[i] > 0 && p[i] < 1.); + + std::cout << p[0] << std::endl; +#ifdef __CUDACC__ + float* p_d; + cudaCheck(cudaMalloc(&p_d, 1024 * 4)); + cudaCheck(cudaMemcpy(p_d, p, 1024 * 4, cudaMemcpyDefault)); + testBasicSoA<<<1, 1024>>>(p_d); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(p, p_d, 1024 * 4, cudaMemcpyDefault)); + cudaCheck(cudaDeviceSynchronize()); +#else + testBasicSoA(p); +#endif + + std::cout << p[0] << std::endl; + + for (int i = 0, n = 64 * 3; i < n; ++i) + assert(p[i] > 1.); + + std::cout << "END" << std::endl; + return 0; +} diff --git a/src/cudacompat/test/eigenSoA_t_cpu.cc b/src/cudacompat/test/eigenSoA_t_cpu.cc new file mode 100644 index 000000000..c5200e2a8 --- /dev/null +++ b/src/cudacompat/test/eigenSoA_t_cpu.cc @@ -0,0 +1 @@ +#include "eigenSoA_t.h" diff --git a/src/cudacompat/test/gpuClustering_t.cu b/src/cudacompat/test/gpuClustering_t.cu new file mode 100644 index 000000000..19a3b8d01 --- /dev/null +++ b/src/cudacompat/test/gpuClustering_t.cu @@ -0,0 +1 @@ +#include "gpuClustering_t.h" diff --git a/src/cudacompat/test/gpuClustering_t.h b/src/cudacompat/test/gpuClustering_t.h new file mode 100644 index 000000000..5388e3499 --- /dev/null +++ b/src/cudacompat/test/gpuClustering_t.h @@ -0,0 +1,401 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ + +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/requireDevices.h" +#include "CUDACore/launch.h" +#endif + +// dirty, but works +#include "plugin-SiPixelClusterizer/gpuClustering.h" +#include "plugin-SiPixelClusterizer/gpuClusterChargeCut.h" + +int main(void) { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); +#endif + + using namespace gpuClustering; + + int numElements = 256 * 2000; + // these in reality are already on GPU + auto h_id = std::make_unique(numElements); + auto h_x = std::make_unique(numElements); + auto h_y = std::make_unique(numElements); + auto h_adc = std::make_unique(numElements); + + auto h_clus = std::make_unique(numElements); + +#ifdef __CUDACC__ + auto d_id = cms::cuda::make_device_unique(numElements, nullptr); + auto d_x = cms::cuda::make_device_unique(numElements, nullptr); + auto d_y = cms::cuda::make_device_unique(numElements, nullptr); + auto d_adc = cms::cuda::make_device_unique(numElements, nullptr); + auto d_clus = cms::cuda::make_device_unique(numElements, nullptr); + auto d_moduleStart = cms::cuda::make_device_unique(MaxNumModules + 1, nullptr); + auto d_clusInModule = cms::cuda::make_device_unique(MaxNumModules, nullptr); + auto d_moduleId = 
cms::cuda::make_device_unique(MaxNumModules, nullptr); +#else + + auto h_moduleStart = std::make_unique(MaxNumModules + 1); + auto h_clusInModule = std::make_unique(MaxNumModules); + auto h_moduleId = std::make_unique(MaxNumModules); + +#endif + + // later random number + int n = 0; + int ncl = 0; + int y[10] = {5, 7, 9, 1, 3, 0, 4, 8, 2, 6}; + + auto generateClusters = [&](int kn) { + auto addBigNoise = 1 == kn % 2; + if (addBigNoise) { + constexpr int MaxPixels = 1000; + int id = 666; + for (int x = 0; x < 140; x += 3) { + for (int yy = 0; yy < 400; yy += 3) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = yy; + h_adc[n] = 1000; + ++n; + ++ncl; + if (MaxPixels <= ncl) + break; + } + if (MaxPixels <= ncl) + break; + } + } + + { + // isolated + int id = 42; + int x = 10; + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = kn == 0 ? 100 : 5000; + ++n; + + // first column + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = 0; + h_adc[n] = 5000; + ++n; + // first columns + ++ncl; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 2; + h_adc[n] = 5000; + ++n; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 1; + h_adc[n] = 5000; + ++n; + + // last column + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = 415; + h_adc[n] = 5000; + ++n; + // last columns + ++ncl; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 415; + h_adc[n] = 2500; + ++n; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 414; + h_adc[n] = 2500; + ++n; + + // diagonal + ++ncl; + for (int x = 20; x < 25; ++x) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = 1000; + ++n; + } + ++ncl; + // reversed + for (int x = 45; x > 40; --x) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = 1000; + ++n; + } + ++ncl; + h_id[n++] = InvId; // error + // messy + int xx[5] = {21, 25, 23, 24, 22}; + for (int k = 0; k < 5; ++k) { + h_id[n] = id; + h_x[n] = xx[k]; + h_y[n] = 20 + xx[k]; + h_adc[n] = 1000; + ++n; + } + // holes + ++ncl; + for (int k = 0; k < 5; ++k) { + h_id[n] = id; + h_x[n] = xx[k]; + h_y[n] = 100; + h_adc[n] = kn == 2 ? 100 : 1000; + ++n; + if (xx[k] % 2 == 0) { + h_id[n] = id; + h_x[n] = xx[k]; + h_y[n] = 101; + h_adc[n] = 1000; + ++n; + } + } + } + { + // id == 0 (make sure it works! + int id = 0; + int x = 10; + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = 5000; + ++n; + } + // all odd id + for (int id = 11; id <= 1800; id += 2) { + if ((id / 20) % 2) + h_id[n++] = InvId; // error + for (int x = 0; x < 40; x += 4) { + ++ncl; + if ((id / 10) % 2) { + for (int k = 0; k < 10; ++k) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x + y[k]; + h_adc[n] = 100; + ++n; + h_id[n] = id; + h_x[n] = x + 1; + h_y[n] = x + y[k] + 2; + h_adc[n] = 1000; + ++n; + } + } else { + for (int k = 0; k < 10; ++k) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x + y[9 - k]; + h_adc[n] = kn == 2 ? 10 : 1000; + ++n; + if (y[k] == 3) + continue; // hole + if (id == 51) { + h_id[n++] = InvId; + h_id[n++] = InvId; + } // error + h_id[n] = id; + h_x[n] = x + 1; + h_y[n] = x + y[k] + 2; + h_adc[n] = kn == 2 ? 
10 : 1000; + ++n; + } + } + } + } + }; // end lambda + for (auto kkk = 0; kkk < 5; ++kkk) { + n = 0; + ncl = 0; + generateClusters(kkk); + + std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl; + assert(n <= numElements); + + uint32_t nModules = 0; +#ifdef __CUDACC__ + size_t size32 = n * sizeof(unsigned int); + size_t size16 = n * sizeof(unsigned short); + // size_t size8 = n * sizeof(uint8_t); + + cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice)); + + cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice)); + // Launch CUDA Kernels + int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256); + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA countModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock + << " threads\n"; + + cms::cuda::launch(countModules, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n); + + blocksPerGrid = MaxNumModules; //nModules; + + std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock + << " threads\n"; + cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t))); + + cms::cuda::launch(findClus, + {blocksPerGrid, threadsPerBlock}, + d_id.get(), + d_x.get(), + d_y.get(), + d_moduleStart.get(), + d_clusInModule.get(), + d_moduleId.get(), + d_clus.get(), + n); + cudaDeviceSynchronize(); + cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + uint32_t nclus[MaxNumModules], moduleId[nModules]; + cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters" + << std::endl; + for (auto i = MaxNumModules; i > 0; i--) + if (nclus[i - 1] > 0) { + std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl; + break; + } + if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0)) + std::cout << "ERROR!!!!! 
wrong number of cluster found" << std::endl; + + cms::cuda::launch(clusterChargeCut, + {blocksPerGrid, threadsPerBlock}, + d_id.get(), + d_adc.get(), + d_moduleStart.get(), + d_clusInModule.get(), + d_moduleId.get(), + d_clus.get(), + n); + + cudaDeviceSynchronize(); +#else + h_moduleStart[0] = nModules; + countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n); + memset(h_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)); + gridDim.x = MaxNumModules; //not needed in the kernel for this specific case; + assert(blockIdx.x == 0); + for (; blockIdx.x < gridDim.x; ++blockIdx.x) + findClus(h_id.get(), + h_x.get(), + h_y.get(), + h_moduleStart.get(), + h_clusInModule.get(), + h_moduleId.get(), + h_clus.get(), + n); + resetGrid(); + + nModules = h_moduleStart[0]; + auto nclus = h_clusInModule.get(); + + std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters" + << std::endl; + for (auto i = MaxNumModules; i > 0; i--) + if (nclus[i - 1] > 0) { + std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl; + break; + } + if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0)) + std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl; + + gridDim.x = MaxNumModules; // no needed in the kernel for in this specific case + assert(blockIdx.x == 0); + for (; blockIdx.x < gridDim.x; ++blockIdx.x) + clusterChargeCut( + h_id.get(), h_adc.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n); + resetGrid(); + +#endif + + std::cout << "found " << nModules << " Modules active" << std::endl; + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); +#endif + + std::set clids; + for (int i = 0; i < n; ++i) { + assert(h_id[i] != 666); // only noise + if (h_id[i] == InvId) + continue; + assert(h_clus[i] >= 0); + assert(h_clus[i] < int(nclus[h_id[i]])); + clids.insert(h_id[i] * 1000 + h_clus[i]); + // clids.insert(h_clus[i]); + } + + // verify no hole in numbering + auto p = clids.begin(); + auto cmid = (*p) / 1000; + assert(0 == (*p) % 1000); + auto c = p; + ++c; + std::cout << "first clusters " << *p << ' ' << *c << ' ' << nclus[cmid] << ' ' << nclus[(*c) / 1000] << std::endl; + std::cout << "last cluster " << *clids.rbegin() << ' ' << nclus[(*clids.rbegin()) / 1000] << std::endl; + for (; c != clids.end(); ++c) { + auto cc = *c; + auto pp = *p; + auto mid = cc / 1000; + auto pnc = pp % 1000; + auto nc = cc % 1000; + if (mid != cmid) { + assert(0 == cc % 1000); + assert(nclus[cmid] - 1 == pp % 1000); + // if (nclus[cmid]-1 != pp%1000) std::cout << "error size " << mid << ": " << nclus[mid] << ' ' << pp << std::endl; + cmid = mid; + p = c; + continue; + } + p = c; + // assert(nc==pnc+1); + if (nc != pnc + 1) + std::cout << "error " << mid << ": " << nc << ' ' << pnc << std::endl; + } + + std::cout << "found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << ' ' << clids.size() << " clusters" + << std::endl; + for (auto i = MaxNumModules; i > 0; i--) + if (nclus[i - 1] > 0) { + std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl; + break; + } + // << " and " << seeds.size() << " seeds" << std::endl; + } /// end loop 
kkk + return 0; +} diff --git a/src/cudacompat/test/gpuVertexFinder_t.cu b/src/cudacompat/test/gpuVertexFinder_t.cu new file mode 100644 index 000000000..a7906fe0d --- /dev/null +++ b/src/cudacompat/test/gpuVertexFinder_t.cu @@ -0,0 +1 @@ +#include "VertexFinder_t.h" diff --git a/src/cudacompat/test/histo.cc b/src/cudacompat/test/histo.cc new file mode 100644 index 000000000..5b17825ef --- /dev/null +++ b/src/cudacompat/test/histo.cc @@ -0,0 +1,54 @@ +#include +#include + +#include "plugin-Validation/SimpleAtomicHisto.h" + +int main() { + SimpleAtomicHisto h(10, 0, 1); + std::stringstream ss; + + ss << h; + assert(ss.str() == "12 0 1 0 0 0 0 0 0 0 0 0 0 0 0"); + + h.fill(-0.1); + ss.str(""); + ss << h; + assert(ss.str() == "12 0 1 1 0 0 0 0 0 0 0 0 0 0 0"); + + h.fill(1.1); + ss.str(""); + ss << h; + assert(ss.str() == "12 0 1 1 0 0 0 0 0 0 0 0 0 0 1"); + + h.fill(0); + ss.str(""); + ss << h; + assert(ss.str() == "12 0 1 1 1 0 0 0 0 0 0 0 0 0 1"); + + h.fill(0.1); + ss.str(""); + ss << h; + assert(ss.str() == "12 0 1 1 1 1 0 0 0 0 0 0 0 0 1"); + + h.fill(0.0999); + ss.str(""); + ss << h; + assert(ss.str() == "12 0 1 1 2 1 0 0 0 0 0 0 0 0 1"); + + h.fill(0.2); + ss.str(""); + ss << h; + assert(ss.str() == "12 0 1 1 2 1 1 0 0 0 0 0 0 0 1"); + + h.fill(0.9); + ss.str(""); + ss << h; + assert(ss.str() == "12 0 1 1 2 1 1 0 0 0 0 0 0 1 1"); + + h.fill(0.9999999); + ss.str(""); + ss << h; + assert(ss.str() == "12 0 1 1 2 1 1 0 0 0 0 0 0 2 1"); + + return 0; +} diff --git a/src/cudacompat/test/prefixScan_t.cu b/src/cudacompat/test/prefixScan_t.cu new file mode 100644 index 000000000..307c989b0 --- /dev/null +++ b/src/cudacompat/test/prefixScan_t.cu @@ -0,0 +1,148 @@ +#include + +#include "CUDACore/cudaCheck.h" +#include "CUDACore/prefixScan.h" +#include "CUDACore/requireDevices.h" + +using namespace cms::cuda; + +template +struct format_traits { +public: + static const constexpr char *failed_msg = "failed %d %d %d: %d %d\n"; +}; + +template <> +struct format_traits { +public: + static const constexpr char *failed_msg = "failed %d %d %d: %f %f\n"; +}; + +template +__global__ void testPrefixScan(uint32_t size) { + __shared__ T ws[32]; + __shared__ T c[1024]; + __shared__ T co[1024]; + + auto first = threadIdx.x; + for (auto i = first; i < size; i += blockDim.x) + c[i] = 1; + __syncthreads(); + + blockPrefixScan(c, co, size, ws); + blockPrefixScan(c, size, ws); + + assert(1 == c[0]); + assert(1 == co[0]); + for (auto i = first + 1; i < size; i += blockDim.x) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDim.x, c[i], c[i - 1]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); + } +} + +template +__global__ void testWarpPrefixScan(uint32_t size) { + assert(size <= 32); + __shared__ T c[1024]; + __shared__ T co[1024]; + auto i = threadIdx.x; + c[i] = 1; + __syncthreads(); + + warpPrefixScan(c, co, i, 0xffffffff); + warpPrefixScan(c, i, 0xffffffff); + __syncthreads(); + + assert(1 == c[0]); + assert(1 == co[0]); + if (i != 0) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDim.x, c[i], c[i - 1]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); + } +} + +__global__ void init(uint32_t *v, uint32_t val, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + v[i] = val; + if (i == 0) + printf("init\n"); +} + +__global__ void verify(uint32_t const *v, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + assert(v[i] == i + 1); + if 
(i == 0) + printf("verify\n"); +} + +int main() { + cms::cudatest::requireDevices(); + + std::cout << "warp level" << std::endl; + // std::cout << "warp 32" << std::endl; + testWarpPrefixScan<<<1, 32>>>(32); + cudaDeviceSynchronize(); + // std::cout << "warp 16" << std::endl; + testWarpPrefixScan<<<1, 32>>>(16); + cudaDeviceSynchronize(); + // std::cout << "warp 5" << std::endl; + testWarpPrefixScan<<<1, 32>>>(5); + cudaDeviceSynchronize(); + + std::cout << "block level" << std::endl; + for (int bs = 32; bs <= 1024; bs += 32) { + // std::cout << "bs " << bs << std::endl; + for (int j = 1; j <= 1024; ++j) { + // std::cout << j << std::endl; + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + } + } + cudaDeviceSynchronize(); + + int num_items = 200; + for (int ksize = 1; ksize < 4; ++ksize) { + // test multiblock + std::cout << "multiblok" << std::endl; + // Declare, allocate, and initialize device-accessible pointers for input and output + num_items *= 10; + uint32_t *d_in; + uint32_t *d_out1; + uint32_t *d_out2; + + cudaCheck(cudaMalloc(&d_in, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out1, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out2, num_items * sizeof(uint32_t))); + + auto nthreads = 256; + auto nblocks = (num_items + nthreads - 1) / nthreads; + + init<<>>(d_in, 1, num_items); + + // the block counter + int32_t *d_pc; + cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t))); + cudaCheck(cudaMemset(d_pc, 0, sizeof(int32_t))); + + nthreads = 1024; + nblocks = (num_items + nthreads - 1) / nthreads; + std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; + multiBlockPrefixScan<<>>(d_in, d_out1, num_items, d_pc); + cudaCheck(cudaGetLastError()); + verify<<>>(d_out1, num_items); + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); + + } // ksize + return 0; +} diff --git a/src/cudacompat/test/radixSort_t.cu b/src/cudacompat/test/radixSort_t.cu new file mode 100644 index 000000000..e1b9bca4c --- /dev/null +++ b/src/cudacompat/test/radixSort_t.cu @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "CUDACore/device_unique_ptr.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/requireDevices.h" +#include "CUDACore/launch.h" +#include "CUDACore/radixSort.h" + +using namespace cms::cuda; + +template +struct RS { + using type = std::uniform_int_distribution; + static auto ud() { return type(std::numeric_limits::min(), std::numeric_limits::max()); } + static constexpr T imax = std::numeric_limits::max(); +}; + +template <> +struct RS { + using T = float; + using type = std::uniform_real_distribution; + static auto ud() { return type(-std::numeric_limits::max() / 2, std::numeric_limits::max() / 2); } + // static auto ud() { return type(0,std::numeric_limits::max()/2);} + static constexpr int imax = std::numeric_limits::max(); +}; + +template +void go(bool useShared) { + std::mt19937 eng; + //std::mt19937 eng2; + auto rgen = RS::ud(); + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + + constexpr int blocks = 10; + constexpr int blockSize = 256 * 32; + constexpr int N = blockSize * blocks; + T v[N]; + uint16_t ind[N]; + + constexpr bool sgn = T(-1) < T(0); + std::cout << "Will sort " << N << (sgn ? " signed" : " unsigned") + << (std::numeric_limits::is_integer ? 
" 'ints'" : " 'float'") << " of size " << sizeof(T) << " using " + << NS << " significant bytes" << std::endl; + + for (int i = 0; i < 50; ++i) { + if (i == 49) { + for (long long j = 0; j < N; j++) + v[j] = 0; + } else if (i > 30) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + } else { + uint64_t imax = (i < 15) ? uint64_t(RS::imax) + 1LL : 255; + for (uint64_t j = 0; j < N; j++) { + v[j] = (j % imax); + if (j % 2 && i % 2) + v[j] = -v[j]; + } + } + + uint32_t offsets[blocks + 1]; + offsets[0] = 0; + for (int j = 1; j < blocks + 1; ++j) { + offsets[j] = offsets[j - 1] + blockSize - 3 * j; + assert(offsets[j] <= N); + } + + if (i == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + std::random_shuffle(v, v + N); + + auto v_d = cms::cuda::make_device_unique(N, nullptr); + auto ind_d = cms::cuda::make_device_unique(N, nullptr); + auto ws_d = cms::cuda::make_device_unique(N, nullptr); + auto off_d = cms::cuda::make_device_unique(blocks + 1, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); + + if (i < 2) + std::cout << "lauch for " << offsets[blocks] << std::endl; + + auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; + + delta -= (std::chrono::high_resolution_clock::now() - start); + constexpr int MaxSize = 256 * 32; + if (useShared) + cms::cuda::launch( + radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + else + cms::cuda::launch( + radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); + + if (i == 0) + std::cout << "done for " << offsets[blocks] << std::endl; + + cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); + + delta += (std::chrono::high_resolution_clock::now() - start); + + if (i == 0) + std::cout << "done for " << offsets[blocks] << std::endl; + + if (32 == i) { + std::cout << LL(v[ind[0]]) << ' ' << LL(v[ind[1]]) << ' ' << LL(v[ind[2]]) << std::endl; + std::cout << LL(v[ind[3]]) << ' ' << LL(v[ind[10]]) << ' ' << LL(v[ind[blockSize - 1000]]) << std::endl; + std::cout << LL(v[ind[blockSize / 2 - 1]]) << ' ' << LL(v[ind[blockSize / 2]]) << ' ' + << LL(v[ind[blockSize / 2 + 1]]) << std::endl; + } + for (int ib = 0; ib < blocks; ++ib) { + std::set inds; + if (offsets[ib + 1] > offsets[ib]) + inds.insert(ind[offsets[ib]]); + for (auto j = offsets[ib] + 1; j < offsets[ib + 1]; j++) { + inds.insert(ind[j]); + auto a = v + offsets[ib]; + auto k1 = a[ind[j]]; + auto k2 = a[ind[j - 1]]; + auto sh = sizeof(uint64_t) - NS; + sh *= 8; + auto shorten = [sh](T& t) { + auto k = (uint64_t*)(&t); + *k = (*k >> sh) << sh; + }; + shorten(k1); + shorten(k2); + if (k1 < k2) + std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl; + } + if (!inds.empty()) { + assert(0 == *inds.begin()); + assert(inds.size() - 1 == *inds.rbegin()); + } + if (inds.size() != (offsets[ib + 1] - offsets[ib])) + std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets[ib + 1] - offsets[ib]) + << std::endl; + assert(inds.size() == (offsets[ib + 1] - offsets[ib])); + } + } // 50 times + 
std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() / 50. + << " ms" << std::endl; +} + +int main() { + cms::cudatest::requireDevices(); + + bool useShared = false; + + std::cout << "using Global memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + useShared = true; + + std::cout << "using Shared memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + return 0; +} diff --git a/src/cudacompat/test/testBrokenLineFit.cc b/src/cudacompat/test/testBrokenLineFit.cc new file mode 100644 index 000000000..d564f1b02 --- /dev/null +++ b/src/cudacompat/test/testBrokenLineFit.cc @@ -0,0 +1,2 @@ +#define USE_BL +#include "testRiemannFit.cc" diff --git a/src/cudacompat/test/testEigenGPU.cu b/src/cudacompat/test/testEigenGPU.cu new file mode 100644 index 000000000..9cbcc4c57 --- /dev/null +++ b/src/cudacompat/test/testEigenGPU.cu @@ -0,0 +1,341 @@ +#include + +#include +#include + +#include "CUDACore/cudaCheck.h" +#include "CUDACore/requireDevices.h" + +#ifdef USE_BL +#include "plugin-PixelTriplets/BrokenLine.h" +#else +#include "plugin-PixelTriplets/RiemannFit.h" +#endif + +#include "test_common.h" + +using namespace Eigen; + +namespace Rfit { + constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; } + constexpr uint32_t stride() { return maxNumberOfTracks(); } + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()>>; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()>>; + // fast fit + using Map4d = Eigen::Map>; + +} // namespace Rfit + +template +__global__ void kernelPrintSizes(double* __restrict__ phits, float* __restrict__ phits_ge) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, 4); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, 4); + if (i != 0) + return; + printf("GPU sizes %lu %lu %lu %lu %lu\n", + sizeof(hits[i]), + sizeof(hits_ge[i]), + sizeof(Vector4d), + sizeof(Rfit::line_fit), + sizeof(Rfit::circle_fit)); +} + +template +__global__ void kernelFastFit(double* __restrict__ phits, double* __restrict__ presults) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map4d result(presults + i, 4); +#ifdef USE_BL + BrokenLine::BL_Fast_fit(hits, result); +#else + Rfit::Fast_fit(hits, result); +#endif +} + +#ifdef USE_BL + +template +__global__ void kernelBrokenLineFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double* __restrict__ pfast_fit_input, + double B, + Rfit::circle_fit* circle_fit, + Rfit::line_fit* line_fit) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); + + BrokenLine::PreparedBrokenLineData data; + Rfit::Matrix3d Jacob; + + auto& line_fit_results = line_fit[i]; + auto& circle_fit_results = circle_fit[i]; + + BrokenLine::prepareBrokenLineData(hits, fast_fit_input, B, data); + BrokenLine::BL_Line_fit(hits_ge, fast_fit_input, B, data, line_fit_results); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_input, B, data, circle_fit_results); + 
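+  // The block below converts the third fitted parameter from a signed curvature k
+  // to B/|k| (a pT-like quantity) and propagates its covariance as
+  // cov -> Jacob * cov * Jacob^T; the only non-trivial Jacobian entry is
+  // d(B/|k|)/dk = -B*sign(k)/k^2, which is what the copysign expression encodes.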
Jacob << 1., 0, 0, 0, 1., 0, 0, 0, + -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); + circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); + +#ifdef TEST_DEBUG + if (0 == i) { + printf("Circle param %f,%f,%f\n", circle_fit[i].par(0), circle_fit[i].par(1), circle_fit[i].par(2)); + } +#endif +} + +#else + +template +__global__ void kernelCircleFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double* __restrict__ pfast_fit_input, + double B, + Rfit::circle_fit* circle_fit_resultsGPU) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); + + constexpr auto n = N; + + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::loadCovariance2D(hits_ge, hits_cov); + +#ifdef TEST_DEBUG + if (0 == i) { + printf("hits %f, %f\n", hits.block(0, 0, 2, n)(0, 0), hits.block(0, 0, 2, n)(0, 1)); + printf("hits %f, %f\n", hits.block(0, 0, 2, n)(1, 0), hits.block(0, 0, 2, n)(1, 1)); + printf("fast_fit_input(0): %f\n", fast_fit_input(0)); + printf("fast_fit_input(1): %f\n", fast_fit_input(1)); + printf("fast_fit_input(2): %f\n", fast_fit_input(2)); + printf("fast_fit_input(3): %f\n", fast_fit_input(3)); + printf("rad(0,0): %f\n", rad(0, 0)); + printf("rad(1,1): %f\n", rad(1, 1)); + printf("rad(2,2): %f\n", rad(2, 2)); + printf("hits_cov(0,0): %f\n", (*hits_cov)(0, 0)); + printf("hits_cov(1,1): %f\n", (*hits_cov)(1, 1)); + printf("hits_cov(2,2): %f\n", (*hits_cov)(2, 2)); + printf("hits_cov(11,11): %f\n", (*hits_cov)(11, 11)); + printf("B: %f\n", B); + } +#endif + circle_fit_resultsGPU[i] = Rfit::Circle_fit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true); +#ifdef TEST_DEBUG + if (0 == i) { + printf("Circle param %f,%f,%f\n", + circle_fit_resultsGPU[i].par(0), + circle_fit_resultsGPU[i].par(1), + circle_fit_resultsGPU[i].par(2)); + } +#endif +} + +template +__global__ void kernelLineFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double B, + Rfit::circle_fit* circle_fit, + double* __restrict__ pfast_fit_input, + Rfit::line_fit* line_fit) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); + line_fit[i] = Rfit::Line_fit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true); +} +#endif + +template +__device__ __host__ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + + if (N == 5) { + hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, + -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; + hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05; + hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06; + hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07; + hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08; + hits_ge.col(4) << 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07; + return; + } + + if (N > 3) + hits << 1.98645, 4.72598, 7.65632, 
11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 17.793; + else + hits << 1.98645, 4.72598, 7.65632, 2.18002, 4.88864, 7.75845, 2.46338, 6.99838, 11.808; + + hits_ge.col(0)[0] = 7.14652e-06; + hits_ge.col(1)[0] = 2.15789e-06; + hits_ge.col(2)[0] = 1.63328e-06; + if (N > 3) + hits_ge.col(3)[0] = 6.27919e-06; + hits_ge.col(0)[2] = 6.10348e-06; + hits_ge.col(1)[2] = 2.08211e-06; + hits_ge.col(2)[2] = 1.61672e-06; + if (N > 3) + hits_ge.col(3)[2] = 6.28081e-06; + hits_ge.col(0)[5] = 5.184e-05; + hits_ge.col(1)[5] = 1.444e-05; + hits_ge.col(2)[5] = 6.25e-06; + if (N > 3) + hits_ge.col(3)[5] = 3.136e-05; + hits_ge.col(0)[1] = -5.60077e-06; + hits_ge.col(1)[1] = -1.11936e-06; + hits_ge.col(2)[1] = -6.24945e-07; + if (N > 3) + hits_ge.col(3)[1] = -5.28e-06; +} + +template +__global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phits_ge) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); + hits_ge = MatrixXf::Zero(6, N); + fillHitsAndHitsCov(hits, hits_ge); +} + +template +void testFit() { + constexpr double B = 0.0113921; + Rfit::Matrix3xNd hits; + Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); + double* hitsGPU = nullptr; + ; + float* hits_geGPU = nullptr; + double* fast_fit_resultsGPU = nullptr; + double* fast_fit_resultsGPUret = new double[Rfit::maxNumberOfTracks() * sizeof(Vector4d)]; + Rfit::circle_fit* circle_fit_resultsGPU = nullptr; + Rfit::circle_fit* circle_fit_resultsGPUret = new Rfit::circle_fit(); + Rfit::line_fit* line_fit_resultsGPU = nullptr; + Rfit::line_fit* line_fit_resultsGPUret = new Rfit::line_fit(); + + fillHitsAndHitsCov(hits, hits_ge); + + std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << ' ' + << sizeof(Rfit::line_fit) << ' ' << sizeof(Rfit::circle_fit) << std::endl; + + std::cout << "Generated hits:\n" << hits << std::endl; + std::cout << "Generated cov:\n" << hits_ge << std::endl; + + // FAST_FIT_CPU +#ifdef USE_BL + Vector4d fast_fit_results; + BrokenLine::BL_Fast_fit(hits, fast_fit_results); +#else + Vector4d fast_fit_results; + Rfit::Fast_fit(hits, fast_fit_results); +#endif + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; + + // for timing purposes we fit 4096 tracks + constexpr uint32_t Ntracks = 4096; + cudaCheck(cudaMalloc(&hitsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::Matrix3xNd))); + cudaCheck(cudaMalloc(&hits_geGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::Matrix6xNf))); + cudaCheck(cudaMalloc(&fast_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Vector4d))); + cudaCheck(cudaMalloc(&line_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::line_fit))); + cudaCheck(cudaMalloc(&circle_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::circle_fit))); + + cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, Rfit::maxNumberOfTracks() * sizeof(Vector4d))); + cudaCheck(cudaMemset(line_fit_resultsGPU, 0, Rfit::maxNumberOfTracks() * sizeof(Rfit::line_fit))); + + kernelPrintSizes<<>>(hitsGPU, hits_geGPU); + kernelFillHitsAndHitsCov<<>>(hitsGPU, hits_geGPU); + + // FAST_FIT GPU + kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy(fast_fit_resultsGPUret, + fast_fit_resultsGPU, + Rfit::maxNumberOfTracks() * sizeof(Vector4d), + cudaMemcpyDeviceToHost)); + Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4); + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" 
<< fast_fit << std::endl; + assert(isEqualFuzzy(fast_fit_results, fast_fit)); + +#ifdef USE_BL + // CIRCLE AND LINE FIT CPU + BrokenLine::PreparedBrokenLineData data; + BrokenLine::karimaki_circle_fit circle_fit_results; + Rfit::line_fit line_fit_results; + Rfit::Matrix3d Jacob; + BrokenLine::prepareBrokenLineData(hits, fast_fit_results, B, data); + BrokenLine::BL_Line_fit(hits_ge, fast_fit_results, B, data, line_fit_results); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); + Jacob << 1., 0, 0, 0, 1., 0, 0, 0, + -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); + circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); + + // fit on GPU + kernelBrokenLineFit + <<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU, line_fit_resultsGPU); + cudaDeviceSynchronize(); + +#else + // CIRCLE_FIT CPU + Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); + Rfit::loadCovariance2D(hits_ge, hits_cov); + Rfit::circle_fit circle_fit_results = + Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); + + // CIRCLE_FIT GPU + kernelCircleFit<<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU); + cudaDeviceSynchronize(); + + // LINE_FIT CPU + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); + + kernelLineFit + <<>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); + cudaDeviceSynchronize(); +#endif + + std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; + + cudaCheck( + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); + std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); + + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; + // LINE_FIT GPU + cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost)); + std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 
1e-4 : 1e-6)); // requires fma on CPU + + std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; + std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; + std::cout << "Fitted cov (CircleFit) GPU:\n" << circle_fit_resultsGPUret->cov << std::endl; + std::cout << "Fitted cov (LineFit): GPU\n" << line_fit_resultsGPUret->cov << std::endl; +} + +int main(int argc, char* argv[]) { + cms::cudatest::requireDevices(); + + testFit<4>(); + testFit<3>(); + testFit<5>(); + + std::cout << "TEST FIT, NO ERRORS" << std::endl; + + return 0; +} diff --git a/src/cudacompat/test/testEigenGPUNoFit.cu b/src/cudacompat/test/testEigenGPUNoFit.cu new file mode 100644 index 000000000..8e0b28f0f --- /dev/null +++ b/src/cudacompat/test/testEigenGPUNoFit.cu @@ -0,0 +1,248 @@ +#include + +#include +#include + +#include "CUDACore/cudaCheck.h" +#include "CUDACore/requireDevices.h" +#include "test_common.h" + +using namespace Eigen; + +using Matrix5d = Matrix; + +__host__ __device__ void eigenValues(Matrix3d *m, Eigen::SelfAdjointEigenSolver::RealVectorType *ret) { +#if TEST_DEBUG + printf("Matrix(0,0): %f\n", (*m)(0, 0)); + printf("Matrix(1,1): %f\n", (*m)(1, 1)); + printf("Matrix(2,2): %f\n", (*m)(2, 2)); +#endif + SelfAdjointEigenSolver es; + es.computeDirect(*m); + (*ret) = es.eigenvalues(); + return; +} + +__global__ void kernel(Matrix3d *m, Eigen::SelfAdjointEigenSolver::RealVectorType *ret) { + eigenValues(m, ret); +} + +__global__ void kernelInverse3x3(Matrix3d *in, Matrix3d *out) { (*out) = in->inverse(); } + +__global__ void kernelInverse4x4(Matrix4d *in, Matrix4d *out) { (*out) = in->inverse(); } + +__global__ void kernelInverse5x5(Matrix5d *in, Matrix5d *out) { (*out) = in->inverse(); } + +template +__global__ void kernelMultiply(M1 *J, M2 *C, M3 *result) { +// Map res(result->data()); +#if TEST_DEBUG + printf("*** GPU IN ***\n"); +#endif + printIt(J); + printIt(C); + // res.noalias() = (*J) * (*C); + // printIt(&res); + (*result) = (*J) * (*C); +#if TEST_DEBUG + printf("*** GPU OUT ***\n"); +#endif + return; +} + +template +void testMultiply() { + std::cout << "TEST MULTIPLY" << std::endl; + std::cout << "Product of type " << row1 << "x" << col1 << " * " << row2 << "x" << col2 << std::endl; + Eigen::Matrix J; + fillMatrix(J); + Eigen::Matrix C; + fillMatrix(C); + Eigen::Matrix multiply_result = J * C; +#if TEST_DEBUG + std::cout << "Input J:" << std::endl; + printIt(&J); + std::cout << "Input C:" << std::endl; + printIt(&C); + std::cout << "Output:" << std::endl; + printIt(&multiply_result); +#endif + // GPU + Eigen::Matrix *JGPU = nullptr; + Eigen::Matrix *CGPU = nullptr; + Eigen::Matrix *multiply_resultGPU = nullptr; + Eigen::Matrix *multiply_resultGPUret = new Eigen::Matrix(); + + cudaCheck(cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy( + multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + + kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy( + multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost)); + printIt(multiply_resultGPUret); + assert(isEqualFuzzy(multiply_result, 
(*multiply_resultGPUret))); +} + +void testInverse3x3() { + std::cout << "TEST INVERSE 3x3" << std::endl; + Matrix3d m; + fillMatrix(m); + m += m.transpose().eval(); + + Matrix3d m_inv = m.inverse(); + Matrix3d *mGPU = nullptr; + Matrix3d *mGPUret = nullptr; + Matrix3d *mCPUret = new Matrix3d(); + +#if TEST_DEBUG + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; +#endif + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix3d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); + + kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); +#if TEST_DEBUG + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#endif + assert(isEqualFuzzy(m_inv, *mCPUret)); +} + +void testInverse4x4() { + std::cout << "TEST INVERSE 4x4" << std::endl; + Matrix4d m; + fillMatrix(m); + m += m.transpose().eval(); + + Matrix4d m_inv = m.inverse(); + Matrix4d *mGPU = nullptr; + Matrix4d *mGPUret = nullptr; + Matrix4d *mCPUret = new Matrix4d(); + +#if TEST_DEBUG + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; +#endif + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix4d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix4d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice)); + + kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost)); +#if TEST_DEBUG + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#endif + assert(isEqualFuzzy(m_inv, *mCPUret)); +} + +void testInverse5x5() { + std::cout << "TEST INVERSE 5x5" << std::endl; + Matrix5d m; + fillMatrix(m); + m += m.transpose().eval(); + + Matrix5d m_inv = m.inverse(); + Matrix5d *mGPU = nullptr; + Matrix5d *mGPUret = nullptr; + Matrix5d *mCPUret = new Matrix5d(); + +#if TEST_DEBUG + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; +#endif + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix5d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix5d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice)); + + kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost)); +#if TEST_DEBUG + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#endif + assert(isEqualFuzzy(m_inv, *mCPUret)); +} + +void testEigenvalues() { + std::cout << "TEST EIGENVALUES" << std::endl; + Matrix3d m; + fillMatrix(m); + m += m.transpose().eval(); + + Matrix3d *m_gpu = nullptr; + Matrix3d *mgpudebug = new Matrix3d(); + Eigen::SelfAdjointEigenSolver::RealVectorType *ret = + new Eigen::SelfAdjointEigenSolver::RealVectorType; + Eigen::SelfAdjointEigenSolver::RealVectorType *ret1 = + new Eigen::SelfAdjointEigenSolver::RealVectorType; + Eigen::SelfAdjointEigenSolver::RealVectorType *ret_gpu = nullptr; + eigenValues(&m, ret); +#if TEST_DEBUG + std::cout << "Generated Matrix M 3x3:\n" << m << std::endl; + std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl; + 
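+  // The kernel launched below (after the #endif) runs the same eigenValues()
+  // helper on the device; this works because SelfAdjointEigenSolver::computeDirect
+  // uses the closed-form 3x3 algorithm rather than an iterative one. The GPU
+  // result is then checked against the host result with isEqualFuzzy.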
std::cout << "*************************\n\n" << std::endl; +#endif + cudaCheck(cudaMalloc((void **)&m_gpu, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType))); + cudaCheck(cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); + + kernel<<<1, 1>>>(m_gpu, ret_gpu); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy( + ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost)); +#if TEST_DEBUG + std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; + std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; + std::cout << "*************************\n\n" << std::endl; +#endif + assert(isEqualFuzzy(*ret, *ret1)); +} + +int main(int argc, char *argv[]) { + cms::cudatest::requireDevices(); + + testEigenvalues(); + testInverse3x3(); + testInverse4x4(); + testInverse5x5(); + + testMultiply<1, 2, 2, 1>(); + testMultiply<1, 2, 2, 2>(); + testMultiply<1, 2, 2, 3>(); + testMultiply<1, 2, 2, 4>(); + testMultiply<1, 2, 2, 5>(); + testMultiply<2, 1, 1, 2>(); + testMultiply<2, 1, 1, 3>(); + testMultiply<2, 1, 1, 4>(); + testMultiply<2, 1, 1, 5>(); + testMultiply<2, 2, 2, 2>(); + testMultiply<2, 3, 3, 1>(); + testMultiply<2, 3, 3, 2>(); + testMultiply<2, 3, 3, 4>(); + testMultiply<2, 3, 3, 5>(); + testMultiply<3, 2, 2, 3>(); + testMultiply<2, 3, 3, 3>(); // DOES NOT COMPILE W/O PATCHING EIGEN + testMultiply<3, 3, 3, 3>(); + testMultiply<8, 8, 8, 8>(); + testMultiply<3, 4, 4, 3>(); + testMultiply<2, 4, 4, 2>(); + testMultiply<3, 4, 4, 2>(); // DOES NOT COMPILE W/O PATCHING EIGEN + + return 0; +} diff --git a/src/cudacompat/test/testRiemannFit.cc b/src/cudacompat/test/testRiemannFit.cc new file mode 100644 index 000000000..4df7bcf30 --- /dev/null +++ b/src/cudacompat/test/testRiemannFit.cc @@ -0,0 +1,153 @@ +#include + +#include +#include + +#ifdef USE_BL +#include "plugin-PixelTriplets/BrokenLine.h" +#else +#include "plugin-PixelTriplets/RiemannFit.h" +#endif + +#include "test_common.h" + +using namespace Eigen; + +namespace Rfit { + constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; } + constexpr uint32_t stride() { return maxNumberOfTracks(); } + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()> >; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()> >; + // fast fit + using Map4d = Eigen::Map >; + +} // namespace Rfit + +/* +Hit global: 641,0 2: 2.934787,0.773211,-10.980247 +Error: 641,0 2: 1.424715e-07,-4.996975e-07,1.752614e-06,3.660689e-11,1.644638e-09,7.346080e-05 +Hit global: 641,1 104: 6.314229,1.816356,-23.162731 +Error: 641,1 104: 6.899177e-08,-1.873414e-07,5.087101e-07,-2.078806e-10,-2.210498e-11,4.346079e-06 +Hit global: 641,2 1521: 8.936963,2.765734,-32.759060 +Error: 641,2 1521: 1.406273e-06,4.042467e-07,6.391180e-07,-3.141497e-07,6.513821e-08,1.163863e-07 +Hit global: 641,3 1712: 10.360559,3.330824,-38.061260 +Error: 641,3 1712: 1.176358e-06,2.154100e-07,5.072816e-07,-8.161219e-08,1.437878e-07,5.951832e-08 +Hit global: 641,4 1824: 12.856387,4.422212,-47.518867 +Error: 641,4 1824: 2.852843e-05,7.956492e-06,3.117701e-06,-1.060541e-06,8.777413e-09,1.426417e-07 +*/ + +template +void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) { + constexpr uint32_t N = 
M3xN::ColsAtCompileTime; + + if (N == 5) { + hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, + -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; + hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05; + hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06; + hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07; + hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08; + hits_ge.col(4) << 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07; + return; + } + + if (N > 3) + hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 17.793; + else + hits << 1.98645, 4.72598, 7.65632, 2.18002, 4.88864, 7.75845, 2.46338, 6.99838, 11.808; + + hits_ge.col(0)[0] = 7.14652e-06; + hits_ge.col(1)[0] = 2.15789e-06; + hits_ge.col(2)[0] = 1.63328e-06; + if (N > 3) + hits_ge.col(3)[0] = 6.27919e-06; + hits_ge.col(0)[2] = 6.10348e-06; + hits_ge.col(1)[2] = 2.08211e-06; + hits_ge.col(2)[2] = 1.61672e-06; + if (N > 3) + hits_ge.col(3)[2] = 6.28081e-06; + hits_ge.col(0)[5] = 5.184e-05; + hits_ge.col(1)[5] = 1.444e-05; + hits_ge.col(2)[5] = 6.25e-06; + if (N > 3) + hits_ge.col(3)[5] = 3.136e-05; + hits_ge.col(0)[1] = -5.60077e-06; + hits_ge.col(1)[1] = -1.11936e-06; + hits_ge.col(2)[1] = -6.24945e-07; + if (N > 3) + hits_ge.col(3)[1] = -5.28e-06; +} + +template +void testFit() { + constexpr double B = 0.0113921; + Rfit::Matrix3xNd hits; + Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); + + fillHitsAndHitsCov(hits, hits_ge); + + std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << std::endl; + + std::cout << "Generated hits:\n" << hits << std::endl; + std::cout << "Generated cov:\n" << hits_ge << std::endl; + + // FAST_FIT_CPU +#ifdef USE_BL + Vector4d fast_fit_results; + BrokenLine::BL_Fast_fit(hits, fast_fit_results); +#else + Vector4d fast_fit_results; + Rfit::Fast_fit(hits, fast_fit_results); +#endif + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; + + // CIRCLE_FIT CPU + +#ifdef USE_BL + BrokenLine::PreparedBrokenLineData data; + BrokenLine::karimaki_circle_fit circle_fit_results; + Rfit::Matrix3d Jacob; + + BrokenLine::prepareBrokenLineData(hits, fast_fit_results, B, data); + Rfit::line_fit line_fit_results; + BrokenLine::BL_Line_fit(hits_ge, fast_fit_results, B, data, line_fit_results); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); + Jacob << 1., 0, 0, 0, 1., 0, 0, 0, + -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); + circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); +#else + Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); + Rfit::loadCovariance2D(hits_ge, hits_cov); + Rfit::circle_fit circle_fit_results = + Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); + // LINE_FIT CPU + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); + Rfit::par_uvrtopak(circle_fit_results, B, true); + +#endif + + std::cout << "Fitted values 
(CircleFit):\n" + << circle_fit_results.par << "\nchi2 " << circle_fit_results.chi2 << std::endl; + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << "\nchi2 " << line_fit_results.chi2 << std::endl; + + std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; + std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; +} + +int main(int argc, char* argv[]) { + testFit<4>(); + testFit<3>(); + testFit<5>(); + + return 0; +} diff --git a/src/cudacompat/test/test_GPUSimpleVector.cu b/src/cudacompat/test/test_GPUSimpleVector.cu new file mode 100644 index 000000000..92974d8a2 --- /dev/null +++ b/src/cudacompat/test/test_GPUSimpleVector.cu @@ -0,0 +1,83 @@ +// author: Felice Pantaleo, CERN, 2018 +#include +#include +#include + +#include +#include + +#include "CUDACore/SimpleVector.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/requireDevices.h" + +__global__ void vector_pushback(cms::cuda::SimpleVector *foo) { + auto index = threadIdx.x + blockIdx.x * blockDim.x; + foo->push_back(index); +} + +__global__ void vector_reset(cms::cuda::SimpleVector *foo) { foo->reset(); } + +__global__ void vector_emplace_back(cms::cuda::SimpleVector *foo) { + auto index = threadIdx.x + blockIdx.x * blockDim.x; + foo->emplace_back(index); +} + +int main() { + cms::cudatest::requireDevices(); + + auto maxN = 10000; + cms::cuda::SimpleVector *obj_ptr = nullptr; + cms::cuda::SimpleVector *d_obj_ptr = nullptr; + cms::cuda::SimpleVector *tmp_obj_ptr = nullptr; + int *data_ptr = nullptr; + int *d_data_ptr = nullptr; + + cudaCheck(cudaMallocHost(&obj_ptr, sizeof(cms::cuda::SimpleVector))); + cudaCheck(cudaMallocHost(&data_ptr, maxN * sizeof(int))); + cudaCheck(cudaMalloc(&d_data_ptr, maxN * sizeof(int))); + + auto v = cms::cuda::make_SimpleVector(obj_ptr, maxN, data_ptr); + + cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(cms::cuda::SimpleVector))); + cms::cuda::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr); + assert(tmp_obj_ptr->size() == 0); + assert(tmp_obj_ptr->capacity() == static_cast(maxN)); + + cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(cms::cuda::SimpleVector))); + // ... and copy the object to the device. + cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); + + int numBlocks = 5; + int numThreadsPerBlock = 256; + vector_pushback<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN)); + vector_reset<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == 0); + + vector_emplace_back<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? 
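/* Why the expected size is written as min(totalThreads, maxN): every one of the
   numBlocks * numThreadsPerBlock = 1280 threads calls push_back()/emplace_back()
   concurrently, and a capacity-bounded vector of this kind typically reserves a slot
   with an atomic increment and gives up once the reserved index reaches the capacity.
   A sketch of that idea (assumed shape for illustration, not the actual
   CUDACore/SimpleVector.h code):

       __device__ int push_back_sketch(int* size, int capacity, int* data, int value) {
         int slot = atomicAdd(size, 1);  // reserve the next slot
         if (slot < capacity) {
           data[slot] = value;           // slot is valid: store the element
           return slot;
         }
         atomicSub(size, 1);             // over capacity: undo the reservation
         return -1;
       }

   Here 1280 < maxN = 10000, so all pushes succeed and the size equals the number of
   threads launched; the min() only matters for launch configurations that exceed the
   capacity. */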
numBlocks * numThreadsPerBlock : maxN)); + + cudaCheck(cudaMemcpy(data_ptr, d_data_ptr, obj_ptr->size() * sizeof(int), cudaMemcpyDefault)); + cudaCheck(cudaFreeHost(obj_ptr)); + cudaCheck(cudaFreeHost(data_ptr)); + cudaCheck(cudaFreeHost(tmp_obj_ptr)); + cudaCheck(cudaFree(d_data_ptr)); + cudaCheck(cudaFree(d_obj_ptr)); + std::cout << "TEST PASSED" << std::endl; + return 0; +} diff --git a/src/cudacompat/test/test_common.h b/src/cudacompat/test/test_common.h new file mode 100644 index 000000000..6377628b0 --- /dev/null +++ b/src/cudacompat/test/test_common.h @@ -0,0 +1,47 @@ +#ifndef RecoPixelVertexing__PixelTrackFitting__test_common_h +#define RecoPixelVertexing__PixelTrackFitting__test_common_h + +#include +#include +#include + +template +__host__ __device__ void printIt(C* m) { +#ifdef TEST_DEBUG + printf("\nMatrix %dx%d\n", (int)m->rows(), (int)m->cols()); + for (u_int r = 0; r < m->rows(); ++r) { + for (u_int c = 0; c < m->cols(); ++c) { + printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r, c)); + } + } +#endif +} + +template +bool isEqualFuzzy(C1 a, C2 b, double epsilon = 1e-6) { + for (unsigned int i = 0; i < a.rows(); ++i) { + for (unsigned int j = 0; j < a.cols(); ++j) { + assert(std::abs(a(i, j) - b(i, j)) < std::min(std::abs(a(i, j)), std::abs(b(i, j))) * epsilon); + } + } + return true; +} + +bool isEqualFuzzy(double a, double b, double epsilon = 1e-6) { + return std::abs(a - b) < std::min(std::abs(a), std::abs(b)) * epsilon; +} + +template +void fillMatrix(T& t) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0.0, 2.0); + for (int row = 0; row < t.rows(); ++row) { + for (int col = 0; col < t.cols(); ++col) { + t(row, col) = dis(gen); + } + } + return; +} + +#endif From c22f8489e6650979b98cd3381b65dfed64e6e95c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 29 Dec 2020 06:46:57 -0800 Subject: [PATCH 2/6] [cudacompat] Run everything on CPU via cudacompat --- .../CUDADataFormats/SiPixelClustersSoA.cc | 16 ++ .../CUDADataFormats/SiPixelClustersSoA.h | 56 +++++ .../CUDADataFormats/SiPixelDigiErrorsSoA.cc | 17 ++ .../CUDADataFormats/SiPixelDigiErrorsSoA.h | 32 +++ .../CUDADataFormats/SiPixelDigisSoA.cc | 21 ++ .../CUDADataFormats/SiPixelDigisSoA.h | 74 +++++++ .../SiPixelFedCablingMapGPUWrapper.h | 2 + .../SiPixelGainCalibrationForHLTGPU.cc | 11 +- src/cudacompat/bin/main.cc | 24 +-- .../plugin-BeamSpotProducer/BeamSpotToPOD.cc | 24 +++ .../plugin-PixelTriplets/CAHitNtupletCUDA.cc | 19 +- .../CAHitNtupletGeneratorOnGPU.cc | 2 +- .../PixelVertexProducerCUDA.cc | 2 +- .../SiPixelRawToClusterCUDA.cc | 69 +++---- ...nel.cu => SiPixelRawToClusterGPUKernel.cc} | 195 ++++++------------ .../SiPixelRawToClusterGPUKernel.h | 62 +++--- .../plugin-SiPixelClusterizer/gpuCalibPixel.h | 2 +- .../plugin-SiPixelRecHits/PixelRecHits.cc | 56 +++++ .../plugin-SiPixelRecHits/PixelRecHits.cu | 78 ------- .../plugin-SiPixelRecHits/PixelRecHits.h | 19 +- .../SiPixelRecHitCUDA.cc | 38 ++-- .../plugin-SiPixelRecHits/gpuPixelRecHits.h | 4 +- .../plugin-Validation/CountValidator.cc | 20 +- .../plugin-Validation/HistoValidator.cc | 106 ++++------ src/cudacompat/plugins.txt | 1 + 25 files changed, 508 insertions(+), 442 deletions(-) create mode 100644 src/cudacompat/CUDADataFormats/SiPixelClustersSoA.cc create mode 100644 src/cudacompat/CUDADataFormats/SiPixelClustersSoA.h create mode 100644 src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.cc create mode 100644 src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.h create mode 100644 
src/cudacompat/CUDADataFormats/SiPixelDigisSoA.cc create mode 100644 src/cudacompat/CUDADataFormats/SiPixelDigisSoA.h create mode 100644 src/cudacompat/plugin-BeamSpotProducer/BeamSpotToPOD.cc rename src/cudacompat/plugin-SiPixelClusterizer/{SiPixelRawToClusterGPUKernel.cu => SiPixelRawToClusterGPUKernel.cc} (74%) create mode 100644 src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cc delete mode 100644 src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu diff --git a/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.cc b/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.cc new file mode 100644 index 000000000..06f753528 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.cc @@ -0,0 +1,16 @@ +#include "CUDADataFormats/SiPixelClustersSoA.h" + +SiPixelClustersSoA::SiPixelClustersSoA(size_t maxClusters) { + moduleStart_d = std::make_unique(maxClusters + 1); + clusInModule_d = std::make_unique(maxClusters); + moduleId_d = std::make_unique(maxClusters); + clusModuleStart_d = std::make_unique(maxClusters + 1); + + auto view = std::make_unique(); + view->moduleStart_ = moduleStart_d.get(); + view->clusInModule_ = clusInModule_d.get(); + view->moduleId_ = moduleId_d.get(); + view->clusModuleStart_ = clusModuleStart_d.get(); + + view_d = std::move(view); +} diff --git a/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.h b/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.h new file mode 100644 index 000000000..24d0d209d --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.h @@ -0,0 +1,56 @@ +#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersSoA_h +#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersSoA_h + +#include "CUDACore/cudaCompat.h" +#include "CUDADataFormats/SiPixelClustersCUDA.h" + +#include + +class SiPixelClustersSoA { +public: + SiPixelClustersSoA() = default; + explicit SiPixelClustersSoA(size_t maxClusters); + ~SiPixelClustersSoA() = default; + + SiPixelClustersSoA(const SiPixelClustersSoA &) = delete; + SiPixelClustersSoA &operator=(const SiPixelClustersSoA &) = delete; + SiPixelClustersSoA(SiPixelClustersSoA &&) = default; + SiPixelClustersSoA &operator=(SiPixelClustersSoA &&) = default; + + void setNClusters(uint32_t nClusters) { nClusters_h = nClusters; } + + uint32_t nClusters() const { return nClusters_h; } + + uint32_t *moduleStart() { return moduleStart_d.get(); } + uint32_t *clusInModule() { return clusInModule_d.get(); } + uint32_t *moduleId() { return moduleId_d.get(); } + uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } + + uint32_t const *moduleStart() const { return moduleStart_d.get(); } + uint32_t const *clusInModule() const { return clusInModule_d.get(); } + uint32_t const *moduleId() const { return moduleId_d.get(); } + uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } + + uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } + uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } + uint32_t const *c_moduleId() const { return moduleId_d.get(); } + uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } + + using DeviceConstView = SiPixelClustersCUDA::DeviceConstView; + + DeviceConstView *view() const { return view_d.get(); } + +private: + std::unique_ptr moduleStart_d; // index of the first pixel of each module + std::unique_ptr clusInModule_d; // number of clusters found in each module + std::unique_ptr moduleId_d; // module id of each module + + // originally from rechits + std::unique_ptr clusModuleStart_d; 
// index of the first cluster of each module + + std::unique_ptr view_d; // "me" pointer + + uint32_t nClusters_h; +}; + +#endif diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.cc b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.cc new file mode 100644 index 000000000..76c0e8655 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.cc @@ -0,0 +1,17 @@ +#include "CUDADataFormats/SiPixelDigiErrorsSoA.h" + +#include +#include + +SiPixelDigiErrorsSoA::SiPixelDigiErrorsSoA(size_t maxFedWords, PixelFormatterErrors errors) + : formatterErrors_h(std::move(errors)) { + error_d = std::make_unique>(); + data_d = std::make_unique(maxFedWords); + + std::memset(data_d.get(), 0x00, maxFedWords); + + error_d = std::make_unique>(); + cms::cuda::make_SimpleVector(error_d.get(), maxFedWords, data_d.get()); + assert(error_d->empty()); + assert(error_d->capacity() == static_cast(maxFedWords)); +} diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.h b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.h new file mode 100644 index 000000000..50c5a3e86 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.h @@ -0,0 +1,32 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h + +#include + +#include "CUDACore/SimpleVector.h" +#include "DataFormats/PixelErrors.h" + +class SiPixelDigiErrorsSoA { +public: + SiPixelDigiErrorsSoA() = default; + explicit SiPixelDigiErrorsSoA(size_t maxFedWords, PixelFormatterErrors errors); + ~SiPixelDigiErrorsSoA() = default; + + SiPixelDigiErrorsSoA(const SiPixelDigiErrorsSoA&) = delete; + SiPixelDigiErrorsSoA& operator=(const SiPixelDigiErrorsSoA&) = delete; + SiPixelDigiErrorsSoA(SiPixelDigiErrorsSoA&&) = default; + SiPixelDigiErrorsSoA& operator=(SiPixelDigiErrorsSoA&&) = default; + + const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } + + cms::cuda::SimpleVector* error() { return error_d.get(); } + cms::cuda::SimpleVector const* error() const { return error_d.get(); } + cms::cuda::SimpleVector const* c_error() const { return error_d.get(); } + +private: + std::unique_ptr data_d; + std::unique_ptr> error_d; + PixelFormatterErrors formatterErrors_h; +}; + +#endif diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.cc b/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.cc new file mode 100644 index 000000000..265f69064 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.cc @@ -0,0 +1,21 @@ +#include "CUDADataFormats/SiPixelDigisSoA.h" + +SiPixelDigisSoA::SiPixelDigisSoA(size_t maxFedWords) { + xx_d = std::make_unique(maxFedWords); + yy_d = std::make_unique(maxFedWords); + adc_d = std::make_unique(maxFedWords); + moduleInd_d = std::make_unique(maxFedWords); + clus_d = std::make_unique(maxFedWords); + + pdigi_d = std::make_unique(maxFedWords); + rawIdArr_d = std::make_unique(maxFedWords); + + auto view = std::make_unique(); + view->xx_ = xx_d.get(); + view->yy_ = yy_d.get(); + view->adc_ = adc_d.get(); + view->moduleInd_ = moduleInd_d.get(); + view->clus_ = clus_d.get(); + + view_d = std::move(view); +} diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.h b/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.h new file mode 100644 index 000000000..5a0727f97 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.h @@ -0,0 +1,74 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h + 
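/* All three SoA data formats introduced in this commit share the same shape: the class
   owns its arrays as host-side std::unique_ptr buffers and publishes a non-owning view
   of raw pointers (reusing DeviceConstView from the corresponding *CUDA data format),
   so downstream kernel code consumes the same view type whether the buffers live on the
   host or on the device. A minimal sketch of the pattern, with hypothetical names used
   only for illustration:

       #include <cstddef>
       #include <cstdint>
       #include <memory>

       struct ExampleConstView {   // non-owning: what the "kernels" read
         uint16_t const* adc_;
       };

       class ExampleSoA {          // owning host-side product
       public:
         explicit ExampleSoA(std::size_t n)
             : adc_d(std::make_unique<uint16_t[]>(n)), view_d(std::make_unique<ExampleConstView>()) {
           view_d->adc_ = adc_d.get();  // the view aliases the owned buffer
         }
         ExampleConstView const* view() const { return view_d.get(); }

       private:
         std::unique_ptr<uint16_t[]> adc_d;
         std::unique_ptr<ExampleConstView> view_d;  // "me" pointer, same idiom as the classes in this commit
       };
*/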
+#include "CUDACore/cudaCompat.h" +#include "CUDADataFormats/SiPixelDigisCUDA.h" + +#include + +class SiPixelDigisSoA { +public: + SiPixelDigisSoA() = default; + explicit SiPixelDigisSoA(size_t maxFedWords); + ~SiPixelDigisSoA() = default; + + SiPixelDigisSoA(const SiPixelDigisSoA &) = delete; + SiPixelDigisSoA &operator=(const SiPixelDigisSoA &) = delete; + SiPixelDigisSoA(SiPixelDigisSoA &&) = default; + SiPixelDigisSoA &operator=(SiPixelDigisSoA &&) = default; + + void setNModulesDigis(uint32_t nModules, uint32_t nDigis) { + nModules_h = nModules; + nDigis_h = nDigis; + } + + uint32_t nModules() const { return nModules_h; } + uint32_t nDigis() const { return nDigis_h; } + + uint16_t *xx() { return xx_d.get(); } + uint16_t *yy() { return yy_d.get(); } + uint16_t *adc() { return adc_d.get(); } + uint16_t *moduleInd() { return moduleInd_d.get(); } + int32_t *clus() { return clus_d.get(); } + uint32_t *pdigi() { return pdigi_d.get(); } + uint32_t *rawIdArr() { return rawIdArr_d.get(); } + + uint16_t const *xx() const { return xx_d.get(); } + uint16_t const *yy() const { return yy_d.get(); } + uint16_t const *adc() const { return adc_d.get(); } + uint16_t const *moduleInd() const { return moduleInd_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *pdigi() const { return pdigi_d.get(); } + uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } + + uint16_t const *c_xx() const { return xx_d.get(); } + uint16_t const *c_yy() const { return yy_d.get(); } + uint16_t const *c_adc() const { return adc_d.get(); } + uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_pdigi() const { return pdigi_d.get(); } + uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } + + using DeviceConstView = SiPixelDigisCUDA::DeviceConstView; + + const DeviceConstView *view() const { return view_d.get(); } + +private: + // These are consumed by downstream device code + std::unique_ptr xx_d; // local coordinates of each pixel + std::unique_ptr yy_d; // + std::unique_ptr adc_d; // ADC of each pixel + std::unique_ptr moduleInd_d; // module id of each pixel + std::unique_ptr clus_d; // cluster id of each pixel + std::unique_ptr view_d; // "me" pointer + + // These are for CPU output; should we (eventually) place them to a + // separate product? 
+ std::unique_ptr pdigi_d; + std::unique_ptr rawIdArr_d; + + uint32_t nModules_h = 0; + uint32_t nDigis_h = 0; +}; + +#endif diff --git a/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h index 027e7d25c..c037faa67 100644 --- a/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h +++ b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h @@ -20,9 +20,11 @@ class SiPixelFedCablingMapGPUWrapper { // returns pointer to GPU memory const SiPixelFedCablingMapGPU *getGPUProductAsync(cudaStream_t cudaStream) const; + const SiPixelFedCablingMapGPU *getCPUProduct() const { return cablingMapHost; } // returns pointer to GPU memory const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const; + const unsigned char *getModToUnpAll() const { return modToUnpDefault.data(); } private: std::vector> modToUnpDefault; diff --git a/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc b/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc index 76e64e8f3..6e308da8f 100644 --- a/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc +++ b/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc @@ -7,11 +7,20 @@ SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const& gain, std::vector gainData) : gainData_(std::move(gainData)) { + /* cudaCheck(cudaMallocHost(&gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU))); *gainForHLTonHost_ = gain; + */ + gainForHLTonHost_ = new SiPixelGainForHLTonGPU(gain); + gainForHLTonHost_->v_pedestals = reinterpret_cast(gainData_.data()); } -SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { cudaCheck(cudaFreeHost(gainForHLTonHost_)); } +SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { + /* + cudaCheck(cudaFreeHost(gainForHLTonHost_)); + */ + delete gainForHLTonHost_; +} SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() { cudaCheck(cudaFree(gainForHLTonGPU)); diff --git a/src/cudacompat/bin/main.cc b/src/cudacompat/bin/main.cc index c8a76eee5..966f2a1f6 100644 --- a/src/cudacompat/bin/main.cc +++ b/src/cudacompat/bin/main.cc @@ -17,16 +17,15 @@ namespace { void print_help(std::string const& name) { std::cout << name - << ": [--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] [--transfer] [--validation] " + << ": [--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] [--validation] " "[--histogram] [--empty]\n\n" << "Options\n" << " --numberOfThreads Number of threads to use (default 1)\n" << " --numberOfStreams Number of concurrent events (default 0=numberOfThreads)\n" << " --maxEvents Number of events to process (default -1 for all events in the input file)\n" << " --data Path to the 'data' directory (default 'data' in the directory of the executable)\n" - << " --transfer Transfer results from GPU to CPU (default is to leave them on GPU)\n" - << " --validation Run (rudimentary) validation at the end (implies --transfer)\n" - << " --histogram Produce histograms at the end (implies --transfer)\n" + << " --validation Run (rudimentary) validation at the end\n" + << " --histogram Produce histograms at the end\n" << " --empty Ignore all producers (for testing only)\n" << std::endl; } @@ -39,7 +38,6 @@ int main(int argc, char** argv) { int numberOfStreams = 0; int maxEvents = -1; std::filesystem::path datadir; - bool transfer = false; bool validation = false; bool histogram = false; bool empty = false; @@ -59,13 +57,9 @@ int main(int 
argc, char** argv) { } else if (*i == "--data") { ++i; datadir = *i; - } else if (*i == "--transfer") { - transfer = true; } else if (*i == "--validation") { - transfer = true; validation = true; } else if (*i == "--histogram") { - transfer = true; histogram = true; } else if (*i == "--empty") { empty = true; @@ -85,6 +79,8 @@ int main(int argc, char** argv) { std::cout << "Data directory '" << datadir << "' does not exist" << std::endl; return EXIT_FAILURE; } + + // TODO: remove when can run without a GPU int numberOfDevices; auto status = cudaGetDeviceCount(&numberOfDevices); if (cudaSuccess != status) { @@ -98,19 +94,11 @@ int main(int argc, char** argv) { std::vector esmodules; if (not empty) { edmodules = { - "BeamSpotToCUDA", "SiPixelRawToClusterCUDA", "SiPixelRecHitCUDA", "CAHitNtupletCUDA", "PixelVertexProducerCUDA"}; + "BeamSpotToPOD", "SiPixelRawToClusterCUDA", "SiPixelRecHitCUDA", "CAHitNtupletCUDA", "PixelVertexProducerCUDA"}; esmodules = {"BeamSpotESProducer", "SiPixelFedCablingMapGPUWrapperESProducer", "SiPixelGainCalibrationForHLTGPUESProducer", "PixelCPEFastESProducer"}; - if (transfer) { - auto capos = std::find(edmodules.begin(), edmodules.end(), "CAHitNtupletCUDA"); - assert(capos != edmodules.end()); - edmodules.insert(capos + 1, "PixelTrackSoAFromCUDA"); - auto vertpos = std::find(edmodules.begin(), edmodules.end(), "PixelVertexProducerCUDA"); - assert(vertpos != edmodules.end()); - edmodules.insert(vertpos + 1, "PixelVertexSoAFromCUDA"); - } if (validation) { edmodules.emplace_back("CountValidator"); } diff --git a/src/cudacompat/plugin-BeamSpotProducer/BeamSpotToPOD.cc b/src/cudacompat/plugin-BeamSpotProducer/BeamSpotToPOD.cc new file mode 100644 index 000000000..209479c54 --- /dev/null +++ b/src/cudacompat/plugin-BeamSpotProducer/BeamSpotToPOD.cc @@ -0,0 +1,24 @@ +#include "DataFormats/BeamSpotPOD.h" +#include "Framework/EDProducer.h" +#include "Framework/Event.h" +#include "Framework/EventSetup.h" +#include "Framework/PluginFactory.h" + +class BeamSpotToPOD : public edm::EDProducer { +public: + explicit BeamSpotToPOD(edm::ProductRegistry& reg); + ~BeamSpotToPOD() override = default; + + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + +private: + const edm::EDPutTokenT bsPutToken_; +}; + +BeamSpotToPOD::BeamSpotToPOD(edm::ProductRegistry& reg) : bsPutToken_{reg.produces()} {} + +void BeamSpotToPOD::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + iEvent.emplace(bsPutToken_, iSetup.get()); +} + +DEFINE_FWK_MODULE(BeamSpotToPOD); diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc index 57baea007..ae0efa96c 100644 --- a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc @@ -1,6 +1,3 @@ -#include - -#include "CUDACore/Product.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" @@ -10,7 +7,7 @@ #include "CAHitNtupletGeneratorOnGPU.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" class CAHitNtupletCUDA : public edm::EDProducer { public: @@ -20,25 +17,23 @@ class CAHitNtupletCUDA : public edm::EDProducer { private: void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; - edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDGetTokenT tokenHitCPU_; + edm::EDPutTokenT tokenTrackCPU_; 
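/* The token changes in this class illustrate the conversion rule applied throughout the
   commit: event products are no longer wrapped in cms::cuda::Product<T>, so the
   ScopedContext machinery becomes unnecessary and the producer talks to the Event
   directly. Since nothing is asynchronous anymore, there is also no stream to pass
   around. Schematically (shapes taken from the surrounding diff, for orientation only):

       // before: auto const& phits = iEvent.get(tokenHitGPU_);
       //         cms::cuda::ScopedContextProduce ctx{phits};
       //         auto const& hits = ctx.get(phits);
       //         ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream()));
       //
       // after:  auto const& hits = iEvent.get(tokenHitCPU_);
       //         iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf));
*/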
CAHitNtupletGeneratorOnGPU gpuAlgo_; }; CAHitNtupletCUDA::CAHitNtupletCUDA(edm::ProductRegistry& reg) - : tokenHitGPU_{reg.consumes>()}, - tokenTrackGPU_{reg.produces>()}, + : tokenHitCPU_{reg.consumes()}, + tokenTrackCPU_{reg.produces()}, gpuAlgo_(reg) {} void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { auto bf = 0.0114256972711507; // 1/fieldInGeV - auto const& phits = iEvent.get(tokenHitGPU_); - cms::cuda::ScopedContextProduce ctx{phits}; - auto const& hits = ctx.get(phits); + auto const& hits = iEvent.get(tokenHitCPU_); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); } DEFINE_FWK_MODULE(CAHitNtupletCUDA); diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc index d0e428da6..d91c1cc9f 100644 --- a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc @@ -43,7 +43,7 @@ namespace { using namespace std; CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(edm::ProductRegistry& reg) - : m_params(true, // onGPU + : m_params(false, // onGPU 3, // minHitsPerNtuplet, 458752, // maxNumberOfDoublets false, //useRiemannFit diff --git a/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc index 15e3c486e..557329c9e 100644 --- a/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc +++ b/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc @@ -32,7 +32,7 @@ class PixelVertexProducerCUDA : public edm::EDProducer { }; PixelVertexProducerCUDA::PixelVertexProducerCUDA(edm::ProductRegistry& reg) - : m_OnGPU(true), + : m_OnGPU(false), m_gpuAlgo(true, // oneKernel true, // useDensity false, // useDBSCAN diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index 06624744e..090cb9e55 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -1,7 +1,6 @@ -#include "CUDACore/Product.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/SiPixelDigiErrorsSoA.h" #include "CondFormats/SiPixelGainCalibrationForHLTGPU.h" #include "CondFormats/SiPixelFedCablingMapGPUWrapper.h" #include "CondFormats/SiPixelFedIds.h" @@ -22,23 +21,20 @@ #include #include -class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { +class SiPixelRawToClusterCUDA : public edm::EDProducer { public: explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg); ~SiPixelRawToClusterCUDA() override = default; private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; cms::cuda::ContextState ctxState_; edm::EDGetTokenT rawGetToken_; - edm::EDPutTokenT> digiPutToken_; - edm::EDPutTokenT> digiErrorPutToken_; - edm::EDPutTokenT> clusterPutToken_; + edm::EDPutTokenT digiPutToken_; + edm::EDPutTokenT digiErrorPutToken_; + edm::EDPutTokenT 
clusterPutToken_; pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; std::unique_ptr wordFedAppender_; @@ -51,35 +47,31 @@ class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg) : rawGetToken_(reg.consumes()), - digiPutToken_(reg.produces>()), - clusterPutToken_(reg.produces>()), + digiPutToken_(reg.produces()), + clusterPutToken_(reg.produces()), isRun2_(true), includeErrors_(true), useQuality_(true) { if (includeErrors_) { - digiErrorPutToken_ = reg.produces>(); + digiErrorPutToken_ = reg.produces(); } wordFedAppender_ = std::make_unique(); } -void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; - +void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { auto const& hgpuMap = iSetup.get(); if (hgpuMap.hasQuality() != useQuality_) { throw std::runtime_error("UseQuality of the module (" + std::to_string(useQuality_) + ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration."); } // get the GPU product already here so that the async transfer can begin - const auto* gpuMap = hgpuMap.getGPUProductAsync(ctx.stream()); - const unsigned char* gpuModulesToUnpack = hgpuMap.getModToUnpAllAsync(ctx.stream()); + const auto* gpuMap = hgpuMap.getCPUProduct(); + const unsigned char* gpuModulesToUnpack = hgpuMap.getModToUnpAll(); auto const& hgains = iSetup.get(); // get the GPU product already here so that the async transfer can begin - const auto* gpuGains = hgains.getGPUProductAsync(ctx.stream()); + const auto* gpuGains = hgains.getCPUProduct(); auto const& fedIds_ = iSetup.get().fedIds(); @@ -147,28 +139,23 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, } // end of for loop - gpuAlgo_.makeClustersAsync(isRun2_, - gpuMap, - gpuModulesToUnpack, - gpuGains, - *wordFedAppender_, - std::move(errors_), - wordCounterGPU, - fedCounter, - useQuality_, - includeErrors_, - false, // debug - ctx.stream()); -} - -void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - cms::cuda::ScopedContextProduce ctx{ctxState_}; + gpuAlgo_.makeClusters(isRun2_, + gpuMap, + gpuModulesToUnpack, + gpuGains, + *wordFedAppender_, + std::move(errors_), + wordCounterGPU, + fedCounter, + useQuality_, + includeErrors_, + false); // debug auto tmp = gpuAlgo_.getResults(); - ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); - ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); + iEvent.emplace(digiPutToken_, std::move(tmp.first)); + iEvent.emplace(clusterPutToken_, std::move(tmp.second)); if (includeErrors_) { - ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); + iEvent.emplace(digiErrorPutToken_, gpuAlgo_.getErrors()); } } diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cc similarity index 74% rename from src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu rename to src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cc index f5070130a..b8ad06ed0 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cc @@ -17,15 +17,9 @@ #include #include -// CUDA 
includes -#include -#include - // CMSSW includes +#include "CUDACore/cudaCompat.h" #include "CUDADataFormats/gpuClusteringConstants.h" -#include "CUDACore/cudaCheck.h" -#include "CUDACore/device_unique_ptr.h" -#include "CUDACore/host_unique_ptr.h" #include "CondFormats/SiPixelFedCablingMapGPU.h" // local includes @@ -40,8 +34,8 @@ namespace pixelgpudetails { constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() { - word_ = cms::cuda::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); - fedId_ = cms::cuda::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); + word_ = std::make_unique(MAX_FED_WORDS); + fedId_ = std::make_unique(MAX_FED_WORDS); } void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, @@ -519,140 +513,85 @@ namespace pixelgpudetails { } // Interface to outside - void SiPixelRawToClusterGPUKernel::makeClustersAsync(bool isRun2, - const SiPixelFedCablingMapGPU *cablingMap, - const unsigned char *modToUnp, - const SiPixelGainForHLTonGPU *gains, - const WordFedAppender &wordFed, - PixelFormatterErrors &&errors, - const uint32_t wordCounter, - const uint32_t fedCounter, - bool useQualityInfo, - bool includeErrors, - bool debug, - cudaStream_t stream) { - nDigis = wordCounter; - + void SiPixelRawToClusterGPUKernel::makeClusters(bool isRun2, + const SiPixelFedCablingMapGPU *cablingMap, + const unsigned char *modToUnp, + const SiPixelGainForHLTonGPU *gains, + const WordFedAppender &wordFed, + PixelFormatterErrors &&errors, + const uint32_t wordCounter, + const uint32_t fedCounter, + bool useQualityInfo, + bool includeErrors, + bool debug) { #ifdef GPU_DEBUG std::cout << "decoding " << wordCounter << " digis. Max is " << pixelgpudetails::MAX_FED_WORDS << std::endl; #endif - digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, stream); + digis_d = SiPixelDigisSoA(pixelgpudetails::MAX_FED_WORDS); if (includeErrors) { - digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream); + digiErrors_d = SiPixelDigiErrorsSoA(pixelgpudetails::MAX_FED_WORDS, std::move(errors)); } - clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream); - - nModules_Clusters_h = cms::cuda::make_host_unique(2, stream); + clusters_d = SiPixelClustersSoA(gpuClustering::MaxNumModules); if (wordCounter) // protect in case of empty event.... 
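/* This file rename (.cu -> .cc) is the heart of the cudacompat approach: the kernels
   keep their CUDA-style bodies, but every <<<blocks, threads, shmem, stream>>> launch
   below is replaced by a plain function call, with CUDACore/cudaCompat.h supplying
   host-side stand-ins for threadIdx/blockIdx/blockDim/gridDim so that a grid-stride
   body degenerates into a simple serial loop. Roughly (a sketch of the idea, not the
   actual cudaCompat.h definitions):

       // host stand-ins, conceptually: threadIdx = {0,0,0}, blockDim = {1,1,1},
       //                               blockIdx  = {0,0,0}, gridDim  = {1,1,1}

       __global__ void scale(float* x, int n, float a) {
         int first = blockIdx.x * blockDim.x + threadIdx.x;        // 0 on the host
         for (int i = first; i < n; i += gridDim.x * blockDim.x)   // stride 1 on the host
           x[i] *= a;
       }

       // device build:     scale<<<blocks, threads, 0, stream>>>(x, n, a);
       // cudacompat build: scale(x, n, a);   // same body, executed serially on the CPU
*/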
{ - const int threadsPerBlock = 512; - const int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; // fill it all - assert(0 == wordCounter % 2); - // wordCounter is the total no of words in each event to be trasfered on device - auto word_d = cms::cuda::make_device_unique(wordCounter, stream); - auto fedId_d = cms::cuda::make_device_unique(wordCounter, stream); - - cudaCheck( - cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync( - fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, stream)); - // Launch rawToDigi kernel - RawToDigi_kernel<<>>( - cablingMap, - modToUnp, - wordCounter, - word_d.get(), - fedId_d.get(), - digis_d.xx(), - digis_d.yy(), - digis_d.adc(), - digis_d.pdigi(), - digis_d.rawIdArr(), - digis_d.moduleInd(), - digiErrors_d.error(), // returns nullptr if default-constructed - useQualityInfo, - includeErrors, - debug); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - if (includeErrors) { - digiErrors_d.copyErrorToHostAsync(stream); - } + RawToDigi_kernel(cablingMap, + modToUnp, + wordCounter, + wordFed.word(), + wordFed.fedId(), + digis_d.xx(), + digis_d.yy(), + digis_d.adc(), + digis_d.pdigi(), + digis_d.rawIdArr(), + digis_d.moduleInd(), + digiErrors_d.error(), // returns nullptr if default-constructed + useQualityInfo, + includeErrors, + debug); } // End of Raw2Digi and passing data for clustering { // clusterizer ... using namespace gpuClustering; - int threadsPerBlock = 256; - int blocks = - (std::max(int(wordCounter), int(gpuClustering::MaxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; - - gpuCalibPixel::calibDigis<<>>(isRun2, - digis_d.moduleInd(), - digis_d.c_xx(), - digis_d.c_yy(), - digis_d.adc(), - gains, - wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - -#ifdef GPU_DEBUG - std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock - << " threads\n"; -#endif - - countModules<<>>( - digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); - cudaCheck(cudaGetLastError()); + gpuCalibPixel::calibDigis(isRun2, + digis_d.moduleInd(), + digis_d.c_xx(), + digis_d.c_yy(), + digis_d.adc(), + gains, + wordCounter, + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.clusModuleStart()); + + countModules(digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync( - &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); + digis_d.setNModulesDigis(clusters_d.moduleStart()[0], wordCounter); - threadsPerBlock = 256; - blocks = MaxNumModules; -#ifdef GPU_DEBUG - std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; -#endif - findClus<<>>(digis_d.c_moduleInd(), - digis_d.c_xx(), - digis_d.c_yy(), - clusters_d.c_moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.clus(), - wordCounter); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif + findClus(digis_d.c_moduleInd(), + digis_d.c_xx(), + digis_d.c_yy(), + 
clusters_d.c_moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.clus(), + wordCounter); // apply charge cut - clusterChargeCut<<>>(digis_d.moduleInd(), - digis_d.c_adc(), - clusters_d.c_moduleStart(), - clusters_d.clusInModule(), - clusters_d.c_moduleId(), - digis_d.clus(), - wordCounter); - cudaCheck(cudaGetLastError()); + clusterChargeCut(digis_d.moduleInd(), + digis_d.c_adc(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), + clusters_d.c_moduleId(), + digis_d.clus(), + wordCounter); // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made @@ -660,20 +599,10 @@ namespace pixelgpudetails { // synchronization/ExternalWork // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d.c_clusInModule(), clusters_d.clusModuleStart()); + fillHitsModuleStart(clusters_d.c_clusInModule(), clusters_d.clusModuleStart()); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), - clusters_d.clusModuleStart() + gpuClustering::MaxNumModules, - sizeof(uint32_t), - cudaMemcpyDefault, - stream)); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - + clusters_d.setNClusters(clusters_d.clusModuleStart()[gpuClustering::MaxNumModules]); } // end clusterizer scope } } // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h index 3cbce9e71..8360f70c9 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h @@ -2,14 +2,13 @@ #define RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelRawToClusterGPUKernel_h #include -#include +#include -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" +#include "CUDACore/cudaCompat.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/SiPixelDigiErrorsSoA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" #include "CUDACore/SimpleVector.h" -#include "CUDACore/host_unique_ptr.h" -#include "CUDACore/host_noncached_unique_ptr.h" #include "DataFormats/PixelErrors.h" struct SiPixelFedCablingMapGPU; @@ -155,8 +154,8 @@ namespace pixelgpudetails { const unsigned char* fedId() const { return fedId_.get(); } private: - cms::cuda::host::noncached::unique_ptr word_; - cms::cuda::host::noncached::unique_ptr fedId_; + std::unique_ptr word_; + std::unique_ptr fedId_; }; SiPixelRawToClusterGPUKernel() = default; @@ -167,42 +166,29 @@ namespace pixelgpudetails { SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; - void makeClustersAsync(bool isRun2, - const SiPixelFedCablingMapGPU* cablingMap, - const unsigned char* modToUnp, - const SiPixelGainForHLTonGPU* gains, - const WordFedAppender& wordFed, - PixelFormatterErrors&& errors, - const uint32_t wordCounter, - const uint32_t fedCounter, - bool useQualityInfo, - bool includeErrors, - bool debug, - cudaStream_t stream); - - std::pair getResults() { - digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis); - clusters_d.setNClusters(nModules_Clusters_h[1]); - // need to explicitly deallocate while the associated CUDA - // stream is still alive - // - // technically the statement 
above is not true anymore now that - // the CUDA streams are cached within the cms::cuda::StreamCache, but it is - // still better to release as early as possible - nModules_Clusters_h.reset(); + void makeClusters(bool isRun2, + const SiPixelFedCablingMapGPU* cablingMap, + const unsigned char* modToUnp, + const SiPixelGainForHLTonGPU* gains, + const WordFedAppender& wordFed, + PixelFormatterErrors&& errors, + const uint32_t wordCounter, + const uint32_t fedCounter, + bool useQualityInfo, + bool includeErrors, + bool debug); + + std::pair getResults() { return std::make_pair(std::move(digis_d), std::move(clusters_d)); } - SiPixelDigiErrorsCUDA&& getErrors() { return std::move(digiErrors_d); } + SiPixelDigiErrorsSoA&& getErrors() { return std::move(digiErrors_d); } private: - uint32_t nDigis = 0; - // Data to be put in the event - cms::cuda::host::unique_ptr nModules_Clusters_h; - SiPixelDigisCUDA digis_d; - SiPixelClustersCUDA clusters_d; - SiPixelDigiErrorsCUDA digiErrors_d; + SiPixelDigisSoA digis_d; + SiPixelClustersSoA clusters_d; + SiPixelDigiErrorsSoA digiErrors_d; }; // see RecoLocalTracker/SiPixelClusterizer diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h index da36be6c4..72ad36e1f 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h @@ -35,7 +35,7 @@ namespace gpuCalibPixel { // zero for next kernels... if (0 == first) clusModuleStart[0] = moduleStart[0] = 0; - for (int i = first; i < gpuClustering::MaxNumModules; i += gridDim.x * blockDim.x) { + for (int i = first; i < static_cast(gpuClustering::MaxNumModules); i += gridDim.x * blockDim.x) { nClustersInModule[i] = 0; } diff --git a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cc b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cc new file mode 100644 index 000000000..e3ce1fb01 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cc @@ -0,0 +1,56 @@ +// C++ headers +#include +#include + +// CMSSW headers +#include "CUDACore/cudaCompat.h" + +#include "plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h" // ! +#include "plugin-SiPixelClusterizer/gpuClusteringConstants.h" // ! + +#include "PixelRecHits.h" +#include "gpuPixelRecHits.h" + +namespace { + __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t* hitsLayerStart) { + assert(0 == hitsModuleStart[0]); + + int begin = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int end = 11; + for (int i = begin; i < end; i += blockDim.x * gridDim.x) { + hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; +#ifdef GPU_DEBUG + printf("LayerStart %d %d: %d\n", i, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); +#endif + } + } +} // namespace + +namespace pixelgpudetails { + + TrackingRecHit2DCPU PixelRecHitGPUKernel::makeHits(SiPixelDigisSoA const& digis_d, + SiPixelClustersSoA const& clusters_d, + BeamSpotPOD const& bs_d, + pixelCPEforGPU::ParamsOnGPU const* cpeParams) const { + auto nHits = clusters_d.nClusters(); + TrackingRecHit2DCPU hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), nullptr); + + if (digis_d.nModules()) // protect from empty events + gpuPixelRecHits::getHits(cpeParams, &bs_d, digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); + cudaCheck(cudaGetLastError()); + + // assuming full warp of threads is better than a smaller number... 
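/* Note the rewrite of setHitsLayerStart above: the deleted CUDA version (below) guarded
   its body with "if (i < 11)" and therefore relied on the launch providing at least 11
   threads, while the new version iterates with a grid-stride loop,

       for (int i = begin; i < end; i += blockDim.x * gridDim.x) { ... }

   which gives the same result for any launch configuration, including the degenerate
   single-thread "grid" that the cudacompat layer provides. The third patch in this
   series applies the same treatment to the clusterizer kernels. */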
+ if (nHits) { + setHitsLayerStart(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + } + + if (nHits) { + cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256); + } + + return hits_d; + } + +} // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu deleted file mode 100644 index 4cd3fc152..000000000 --- a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu +++ /dev/null @@ -1,78 +0,0 @@ -// C++ headers -#include -#include - -// CUDA runtime -#include - -// CMSSW headers -#include "CUDACore/cudaCheck.h" -#include "CUDACore/device_unique_ptr.h" -#include "plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h" // ! -#include "plugin-SiPixelClusterizer/gpuClusteringConstants.h" // ! - -#include "PixelRecHits.h" -#include "gpuPixelRecHits.h" - -namespace { - __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - uint32_t* hitsLayerStart) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - - assert(0 == hitsModuleStart[0]); - - if (i < 11) { - hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; -#ifdef GPU_DEBUG - printf("LayerStart %d %d: %d\n", i, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); -#endif - } - } -} // namespace - -namespace pixelgpudetails { - - TrackingRecHit2DCUDA PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - cudaStream_t stream) const { - auto nHits = clusters_d.nClusters(); - TrackingRecHit2DCUDA hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), stream); - - int threadsPerBlock = 128; - int blocks = digis_d.nModules(); // active modules (with digis) - -#ifdef GPU_DEBUG - std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; -#endif - if (blocks) // protect from empty events - gpuPixelRecHits::getHits<<>>( - cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - // assuming full warp of threads is better than a smaller number... 
- if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); - cudaCheck(cudaGetLastError()); - } - - if (nHits) { - cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); - cudaCheck(cudaGetLastError()); - } - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - return hits_d; - } - -} // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h index 8f5653fbd..68c02230d 100644 --- a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h +++ b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h @@ -3,12 +3,10 @@ #include -#include - -#include "CUDADataFormats/BeamSpotCUDA.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "DataFormats/BeamSpotPOD.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" namespace pixelgpudetails { @@ -22,11 +20,10 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - TrackingRecHit2DCUDA makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - cudaStream_t stream) const; + TrackingRecHit2DCPU makeHits(SiPixelDigisSoA const& digis_d, + SiPixelClustersSoA const& clusters_d, + BeamSpotPOD const& bs_d, + pixelCPEforGPU::ParamsOnGPU const* cpeParams) const; }; } // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc index a82e23eab..d5b9dc687 100644 --- a/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc +++ b/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc @@ -1,10 +1,9 @@ #include -#include "CUDADataFormats/BeamSpotCUDA.h" -#include "CUDACore/Product.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" +#include "DataFormats/BeamSpotPOD.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" @@ -23,39 +22,34 @@ class SiPixelRecHitCUDA : public edm::EDProducer { void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; // The mess with inputs will be cleaned up when migrating to the new framework - edm::EDGetTokenT> tBeamSpot; - edm::EDGetTokenT> token_; - edm::EDGetTokenT> tokenDigi_; + edm::EDGetTokenT tBeamSpot; + edm::EDGetTokenT token_; + edm::EDGetTokenT tokenDigi_; - edm::EDPutTokenT> tokenHit_; + edm::EDPutTokenT tokenHit_; pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; SiPixelRecHitCUDA::SiPixelRecHitCUDA(edm::ProductRegistry& reg) - : tBeamSpot(reg.consumes>()), - token_(reg.consumes>()), - tokenDigi_(reg.consumes>()), - tokenHit_(reg.produces>()) {} + : tBeamSpot(reg.consumes()), + token_(reg.consumes()), + tokenDigi_(reg.consumes()), + tokenHit_(reg.produces()) {} void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { 
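/* The changes in this producer follow the same pattern as the rest of the commit:
   inputs are fetched directly from the Event instead of through a ScopedContext, the
   CPE parameters come from getCPUProduct() rather than getGPUProductAsync(stream), and
   makeHits() runs synchronously with no stream argument. Modules that previously needed
   the asynchronous two-step interface (SiPixelRawToClusterCUDA, HistoValidator) drop
   their acquire() step for the same reason; schematically:

       class AsyncStyle : public edm::EDProducerExternalWork {
         void acquire(edm::Event const&, edm::EventSetup const&,
                      edm::WaitingTaskWithArenaHolder) override;      // queue asynchronous GPU work
         void produce(edm::Event&, edm::EventSetup const&) override;  // emit the finished products
       };

       class SyncStyle : public edm::EDProducer {
         void produce(edm::Event&, edm::EventSetup const&) override;  // do everything inline
       };
*/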
PixelCPEFast const& fcpe = es.get(); - auto const& pclusters = iEvent.get(token_); - cms::cuda::ScopedContextProduce ctx{pclusters}; - - auto const& clusters = ctx.get(pclusters); - auto const& digis = ctx.get(iEvent, tokenDigi_); - auto const& bs = ctx.get(iEvent, tBeamSpot); + auto const& clusters = iEvent.get(token_); + auto const& digis = iEvent.get(tokenDigi_); + auto const& bs = iEvent.get(tBeamSpot); auto nHits = clusters.nClusters(); if (nHits >= TrackingRecHit2DSOAView::maxHits()) { std::cout << "Clusters/Hits Overflow " << nHits << " >= " << TrackingRecHit2DSOAView::maxHits() << std::endl; } - ctx.emplace(iEvent, - tokenHit_, - gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx.stream())); + iEvent.emplace(tokenHit_, gpuAlgo_.makeHits(digis, clusters, bs, &fcpe.getCPUProduct())); } DEFINE_FWK_MODULE(SiPixelRecHitCUDA); diff --git a/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h index 433d3b012..d58984893 100644 --- a/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h +++ b/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h @@ -5,8 +5,8 @@ #include #include -#include "CUDADataFormats/BeamSpotCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "DataFormats/BeamSpotPOD.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "DataFormats/approx_atan2.h" #include "CUDACore/cuda_assert.h" #include "CondFormats/pixelCPEforGPU.h" diff --git a/src/cudacompat/plugin-Validation/CountValidator.cc b/src/cudacompat/plugin-Validation/CountValidator.cc index 23352f5ba..92a6c148d 100644 --- a/src/cudacompat/plugin-Validation/CountValidator.cc +++ b/src/cudacompat/plugin-Validation/CountValidator.cc @@ -1,8 +1,6 @@ -#include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" #include "CUDADataFormats/ZVertexHeterogeneous.h" #include "DataFormats/DigiClusterCount.h" #include "DataFormats/TrackCount.h" @@ -38,8 +36,8 @@ class CountValidator : public edm::EDProducer { edm::EDGetTokenT trackCountToken_; edm::EDGetTokenT vertexCountToken_; - edm::EDGetTokenT> digiToken_; - edm::EDGetTokenT> clusterToken_; + edm::EDGetTokenT digiToken_; + edm::EDGetTokenT clusterToken_; edm::EDGetTokenT trackToken_; edm::EDGetTokenT vertexToken_; }; @@ -48,8 +46,8 @@ CountValidator::CountValidator(edm::ProductRegistry& reg) : digiClusterCountToken_(reg.consumes()), trackCountToken_(reg.consumes()), vertexCountToken_(reg.consumes()), - digiToken_(reg.consumes>()), - clusterToken_(reg.consumes>()), + digiToken_(reg.consumes()), + clusterToken_(reg.consumes()), trackToken_(reg.consumes()), vertexToken_(reg.consumes()) {} @@ -63,11 +61,9 @@ void CountValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) ss << "Event " << iEvent.eventID() << " "; { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextProduce ctx{pdigis}; auto const& count = iEvent.get(digiClusterCountToken_); - auto const& digis = ctx.get(iEvent, digiToken_); - auto const& clusters = ctx.get(iEvent, clusterToken_); + auto const& digis = iEvent.get(digiToken_); + auto const& clusters = iEvent.get(clusterToken_); if (digis.nModules() != count.nModules()) { ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules(); 
diff --git a/src/cudacompat/plugin-Validation/HistoValidator.cc b/src/cudacompat/plugin-Validation/HistoValidator.cc index d7b11d4b2..47f0159f4 100644 --- a/src/cudacompat/plugin-Validation/HistoValidator.cc +++ b/src/cudacompat/plugin-Validation/HistoValidator.cc @@ -1,9 +1,7 @@ -#include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/ZVertexHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" @@ -15,34 +13,20 @@ #include #include -class HistoValidator : public edm::EDProducerExternalWork { +class HistoValidator : public edm::EDProducer { public: explicit HistoValidator(edm::ProductRegistry& reg); private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; void endJob() override; - edm::EDGetTokenT> digiToken_; - edm::EDGetTokenT> clusterToken_; - edm::EDGetTokenT> hitToken_; + edm::EDGetTokenT digiToken_; + edm::EDGetTokenT clusterToken_; + edm::EDGetTokenT hitToken_; edm::EDGetTokenT trackToken_; edm::EDGetTokenT vertexToken_; - uint32_t nDigis; - uint32_t nModules; - uint32_t nClusters; - uint32_t nHits; - cms::cuda::host::unique_ptr h_adc; - cms::cuda::host::unique_ptr h_clusInModule; - cms::cuda::host::unique_ptr h_localCoord; - cms::cuda::host::unique_ptr h_globalCoord; - cms::cuda::host::unique_ptr h_charge; - cms::cuda::host::unique_ptr h_size; - static std::map histos; }; @@ -82,69 +66,49 @@ std::map HistoValidator::histos = { {"vertex_pt2", SimpleAtomicHisto(100, 0, 4000)}}; HistoValidator::HistoValidator(edm::ProductRegistry& reg) - : digiToken_(reg.consumes>()), - clusterToken_(reg.consumes>()), - hitToken_(reg.consumes>()), + : digiToken_(reg.consumes()), + clusterToken_(reg.consumes()), + hitToken_(reg.consumes()), trackToken_(reg.consumes()), vertexToken_(reg.consumes()) {} -void HistoValidator::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextAcquire ctx{pdigis, std::move(waitingTaskHolder)}; - auto const& digis = ctx.get(iEvent, digiToken_); - auto const& clusters = ctx.get(iEvent, clusterToken_); - auto const& hits = ctx.get(iEvent, hitToken_); - - nDigis = digis.nDigis(); - nModules = digis.nModules(); - h_adc = digis.adcToHostAsync(ctx.stream()); - - nClusters = clusters.nClusters(); - h_clusInModule = cms::cuda::make_host_unique(nModules, ctx.stream()); - cudaCheck(cudaMemcpyAsync( - h_clusInModule.get(), clusters.clusInModule(), sizeof(uint32_t) * nModules, cudaMemcpyDefault, ctx.stream())); - - nHits = hits.nHits(); - h_localCoord = hits.localCoordToHostAsync(ctx.stream()); - h_globalCoord = hits.globalCoordToHostAsync(ctx.stream()); - h_charge = hits.chargeToHostAsync(ctx.stream()); - h_size = hits.sizeToHostAsync(ctx.stream()); -} - void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + auto const& digis = iEvent.get(digiToken_); + auto const& clusters = iEvent.get(clusterToken_); + + auto const nDigis = digis.nDigis(); + 
auto const nModules = digis.nModules(); + + auto const nClusters = clusters.nClusters(); + + auto const* hits = iEvent.get(hitToken_).view(); + histos["digi_n"].fill(nDigis); for (uint32_t i = 0; i < nDigis; ++i) { - histos["digi_adc"].fill(h_adc[i]); + histos["digi_adc"].fill(digis.adc()[i]); } - h_adc.reset(); histos["module_n"].fill(nModules); histos["cluster_n"].fill(nClusters); for (uint32_t i = 0; i < nModules; ++i) { - histos["cluster_per_module_n"].fill(h_clusInModule[i]); + histos["cluster_per_module_n"].fill(clusters.clusInModule()[i]); } - h_clusInModule.reset(); + auto const nHits = hits->nHits(); histos["hit_n"].fill(nHits); for (uint32_t i = 0; i < nHits; ++i) { - histos["hit_lx"].fill(h_localCoord[i]); - histos["hit_ly"].fill(h_localCoord[i + nHits]); - histos["hit_lex"].fill(h_localCoord[i + 2 * nHits]); - histos["hit_ley"].fill(h_localCoord[i + 3 * nHits]); - histos["hit_gx"].fill(h_globalCoord[i]); - histos["hit_gy"].fill(h_globalCoord[i + nHits]); - histos["hit_gz"].fill(h_globalCoord[i + 2 * nHits]); - histos["hit_gr"].fill(h_globalCoord[i + 3 * nHits]); - histos["hit_charge"].fill(h_charge[i]); - histos["hit_sizex"].fill(h_size[i]); - histos["hit_sizey"].fill(h_size[i + nHits]); + histos["hit_lx"].fill(hits->xLocal(i)); + histos["hit_ly"].fill(hits->yLocal(i)); + histos["hit_lex"].fill(hits->xerrLocal(i)); + histos["hit_ley"].fill(hits->yerrLocal(i)); + histos["hit_gx"].fill(hits->xGlobal(i)); + histos["hit_gy"].fill(hits->yGlobal(i)); + histos["hit_gz"].fill(hits->zGlobal(i)); + histos["hit_gr"].fill(hits->rGlobal(i)); + histos["hit_charge"].fill(hits->charge(i)); + histos["hit_sizex"].fill(hits->clusterSizeX(i)); + histos["hit_sizey"].fill(hits->clusterSizeY(i)); } - h_localCoord.reset(); - h_globalCoord.reset(); - h_charge.reset(); - h_size.reset(); { auto const& tracks = iEvent.get(trackToken_); @@ -183,7 +147,7 @@ void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) } void HistoValidator::endJob() { - std::ofstream out("histograms_cuda.txt"); + std::ofstream out("histograms_cudacompat.txt"); for (auto const& elem : histos) { out << elem.first << " " << elem.second << "\n"; } diff --git a/src/cudacompat/plugins.txt b/src/cudacompat/plugins.txt index 52dfe3102..bab0d03f6 100644 --- a/src/cudacompat/plugins.txt +++ b/src/cudacompat/plugins.txt @@ -1,5 +1,6 @@ BeamSpotESProducer pluginBeamSpotProducer.so BeamSpotToCUDA pluginBeamSpotProducer.so +BeamSpotToPOD pluginBeamSpotProducer.so CAHitNtupletCUDA pluginPixelTriplets.so CountValidator pluginValidation.so HistoValidator pluginValidation.so From 438200cb5a65eb8d37999f6c24761d61ec11c0b7 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 29 Dec 2020 08:46:30 -0800 Subject: [PATCH 3/6] [cudacompat] Make clusterizer kernels independent of grid size (cms-patatrack/cmssw#588) --- .../gpuClusterChargeCut.h | 173 ++++---- .../plugin-SiPixelClusterizer/gpuClustering.h | 408 +++++++++--------- src/cudacompat/test/gpuClustering_t.h | 37 +- 3 files changed, 304 insertions(+), 314 deletions(-) diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h index d0dd93044..000a7d36e 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h @@ -19,105 +19,106 @@ namespace gpuClustering { uint32_t const* __restrict__ moduleId, // module id of each module int32_t* __restrict__ clusterId, // modified: cluster id of each pixel uint32_t 
numElements) { - if (blockIdx.x >= moduleStart[0]) - return; - - auto firstPixel = moduleStart[1 + blockIdx.x]; - auto thisModuleId = id[firstPixel]; - assert(thisModuleId < MaxNumModules); - assert(thisModuleId == moduleId[blockIdx.x]); + __shared__ int32_t charge[MaxNumClustersPerModules]; + __shared__ uint8_t ok[MaxNumClustersPerModules]; + __shared__ uint16_t newclusId[MaxNumClustersPerModules]; - auto nclus = nClustersInModule[thisModuleId]; - if (nclus == 0) - return; + auto firstModule = blockIdx.x; + auto endModule = moduleStart[0]; + for (auto module = firstModule; module < endModule; module += gridDim.x) { + auto firstPixel = moduleStart[1 + module]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + assert(thisModuleId == moduleId[module]); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus == 0) + continue; + + if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules) + printf("Warning too many clusters in module %d in block %d: %d > %d\n", + thisModuleId, + blockIdx.x, + nclus, + MaxNumClustersPerModules); + + auto first = firstPixel + threadIdx.x; + + if (nclus > MaxNumClustersPerModules) { + // remove excess FIXME find a way to cut charge first.... + for (auto i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + if (clusterId[i] >= MaxNumClustersPerModules) { + id[i] = InvId; + clusterId[i] = InvId; + } + } + nclus = MaxNumClustersPerModules; + } - if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules) - printf("Warning too many clusters in module %d in block %d: %d > %d\n", - thisModuleId, - blockIdx.x, - nclus, - MaxNumClustersPerModules); +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start cluster charge cut for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif - auto first = firstPixel + threadIdx.x; + assert(nclus <= MaxNumClustersPerModules); + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + charge[i] = 0; + } + __syncthreads(); - if (nclus > MaxNumClustersPerModules) { - // remove excess FIXME find a way to cut charge first.... for (auto i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) continue; // not valid if (id[i] != thisModuleId) break; // end of module - if (clusterId[i] >= MaxNumClustersPerModules) { - id[i] = InvId; - clusterId[i] = InvId; - } + atomicAdd(&charge[clusterId[i]], adc[i]); } - nclus = MaxNumClustersPerModules; - } + __syncthreads(); -#ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); -#endif + auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?) + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + newclusId[i] = ok[i] = charge[i] > chargeCut ? 
1 : 0; + } - __shared__ int32_t charge[MaxNumClustersPerModules]; - __shared__ uint8_t ok[MaxNumClustersPerModules]; - __shared__ uint16_t newclusId[MaxNumClustersPerModules]; + __syncthreads(); + + // renumber + __shared__ uint16_t ws[32]; + cms::cuda::blockPrefixScan(newclusId, nclus, ws); + + assert(nclus >= newclusId[nclus - 1]); + + if (nclus == newclusId[nclus - 1]) + continue; + + nClustersInModule[thisModuleId] = newclusId[nclus - 1]; + __syncthreads(); + + // mark bad cluster again + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + if (0 == ok[i]) + newclusId[i] = InvId + 1; + } + __syncthreads(); + + // reassign id + for (auto i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + clusterId[i] = newclusId[clusterId[i]] - 1; + if (clusterId[i] == InvId) + id[i] = InvId; + } - assert(nclus <= MaxNumClustersPerModules); - for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { - charge[i] = 0; - } - __syncthreads(); - - for (auto i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) - continue; // not valid - if (id[i] != thisModuleId) - break; // end of module - atomicAdd(&charge[clusterId[i]], adc[i]); - } - __syncthreads(); - - auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?) - for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { - newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; - } - - __syncthreads(); - - // renumber - __shared__ uint16_t ws[32]; - cms::cuda::blockPrefixScan(newclusId, nclus, ws); - - assert(nclus >= newclusId[nclus - 1]); - - if (nclus == newclusId[nclus - 1]) - return; - - nClustersInModule[thisModuleId] = newclusId[nclus - 1]; - __syncthreads(); - - // mark bad cluster again - for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { - if (0 == ok[i]) - newclusId[i] = InvId + 1; - } - __syncthreads(); - - // reassign id - for (auto i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) - continue; // not valid - if (id[i] != thisModuleId) - break; // end of module - clusterId[i] = newclusId[clusterId[i]] - 1; - if (clusterId[i] == InvId) - id[i] = InvId; - } - - //done + //done + } // loop on modules } } // namespace gpuClustering diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h index 84609bd10..e485a2331 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h @@ -47,258 +47,260 @@ namespace gpuClustering { uint32_t* __restrict__ moduleId, // output: module id of each module int32_t* __restrict__ clusterId, // output: cluster id of each pixel int numElements) { - if (blockIdx.x >= moduleStart[0]) - return; + __shared__ int msize; - auto firstPixel = moduleStart[1 + blockIdx.x]; - auto thisModuleId = id[firstPixel]; - assert(thisModuleId < MaxNumModules); + auto firstModule = blockIdx.x; + auto endModule = moduleStart[0]; + for (auto module = firstModule; module < endModule; module += gridDim.x) { + auto firstPixel = moduleStart[1 + module]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); #ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); #endif - auto first = 
firstPixel + threadIdx.x; + auto first = firstPixel + threadIdx.x; - // find the index of the first pixel not belonging to this module (or invalid) - __shared__ int msize; - msize = numElements; - __syncthreads(); + // find the index of the first pixel not belonging to this module (or invalid) + msize = numElements; + __syncthreads(); - // skip threads not associated to an existing pixel - for (int i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (id[i] != thisModuleId) { // find the first pixel in a different module - atomicMin(&msize, i); - break; + // skip threads not associated to an existing pixel + for (int i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (id[i] != thisModuleId) { // find the first pixel in a different module + atomicMin(&msize, i); + break; + } } - } - //init hist (ymax=416 < 512 : 9bits) - constexpr uint32_t maxPixInModule = 4000; - constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; - using Hist = cms::cuda::HistoContainer; - __shared__ Hist hist; - __shared__ typename Hist::Counter ws[32]; - for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { - hist.off[j] = 0; - } - __syncthreads(); + //init hist (ymax=416 < 512 : 9bits) + constexpr uint32_t maxPixInModule = 4000; + constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); - assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); + assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); - // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) - if (0 == threadIdx.x) { - if (msize - firstPixel > maxPixInModule) { - printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); - msize = maxPixInModule + firstPixel; + // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) + if (0 == threadIdx.x) { + if (msize - firstPixel > maxPixInModule) { + printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); + msize = maxPixInModule + firstPixel; + } } - } - __syncthreads(); - assert(msize - firstPixel <= maxPixInModule); + __syncthreads(); + assert(msize - firstPixel <= maxPixInModule); #ifdef GPU_DEBUG - __shared__ uint32_t totGood; - totGood = 0; - __syncthreads(); + __shared__ uint32_t totGood; + totGood = 0; + __syncthreads(); #endif - // fill histo - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - hist.count(y[i]); + // fill histo + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + hist.count(y[i]); #ifdef GPU_DEBUG - atomicAdd(&totGood, 1); + atomicAdd(&totGood, 1); #endif - } - __syncthreads(); - if (threadIdx.x < 32) - ws[threadIdx.x] = 0; // used by prefix scan... - __syncthreads(); - hist.finalize(ws); - __syncthreads(); + } + __syncthreads(); + if (threadIdx.x < 32) + ws[threadIdx.x] = 0; // used by prefix scan... 
+ __syncthreads(); + hist.finalize(ws); + __syncthreads(); #ifdef GPU_DEBUG - assert(hist.size() == totGood); - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("histo size %d\n", hist.size()); + assert(hist.size() == totGood); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("histo size %d\n", hist.size()); #endif - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - hist.fill(y[i], i - firstPixel); - } + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + hist.fill(y[i], i - firstPixel); + } #ifdef __CUDA_ARCH__ - // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations - constexpr int maxiter = 16; + // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations + constexpr int maxiter = 16; #else - auto maxiter = hist.size(); + auto maxiter = hist.size(); #endif - // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event - constexpr int maxNeighbours = 10; - assert((hist.size() / blockDim.x) <= maxiter); - // nearest neighbour - uint16_t nn[maxiter][maxNeighbours]; - uint8_t nnn[maxiter]; // number of nn - for (uint32_t k = 0; k < maxiter; ++k) - nnn[k] = 0; + // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event + constexpr int maxNeighbours = 10; + assert((hist.size() / blockDim.x) <= maxiter); + // nearest neighbour + uint16_t nn[maxiter][maxNeighbours]; + uint8_t nnn[maxiter]; // number of nn + for (uint32_t k = 0; k < maxiter; ++k) + nnn[k] = 0; - __syncthreads(); // for hit filling! + __syncthreads(); // for hit filling! #ifdef GPU_DEBUG - // look for anomalous high occupancy - __shared__ uint32_t n40, n60; - n40 = n60 = 0; - __syncthreads(); - for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) { - if (hist.size(j) > 60) - atomicAdd(&n60, 1); - if (hist.size(j) > 40) - atomicAdd(&n40, 1); - } - __syncthreads(); - if (0 == threadIdx.x) { - if (n60 > 0) - printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); - else if (n40 > 0) - printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); - } - __syncthreads(); + // look for anomalous high occupancy + __shared__ uint32_t n40, n60; + n40 = n60 = 0; + __syncthreads(); + for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) { + if (hist.size(j) > 60) + atomicAdd(&n60, 1); + if (hist.size(j) > 40) + atomicAdd(&n40, 1); + } + __syncthreads(); + if (0 == threadIdx.x) { + if (n60 > 0) + printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); + else if (n40 > 0) + printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); + } + __syncthreads(); #endif - // fill NN - for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { - assert(k < maxiter); - auto p = hist.begin() + j; - auto i = *p + firstPixel; - assert(id[i] != InvId); - assert(id[i] == thisModuleId); // same module - int be = Hist::bin(y[i] + 1); - auto e = hist.end(be); - ++p; - assert(0 == nnn[k]); - for (; p < e; ++p) { - auto m = (*p) + firstPixel; - assert(m != i); - assert(int(y[m]) - int(y[i]) >= 0); - assert(int(y[m]) - int(y[i]) <= 1); - if (std::abs(int(x[m]) - int(x[i])) > 1) - continue; - auto l = nnn[k]++; - assert(l < maxNeighbours); - nn[k][l] = *p; + // fill NN + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + assert(k < maxiter); + auto p = 
hist.begin() + j; + auto i = *p + firstPixel; + assert(id[i] != InvId); + assert(id[i] == thisModuleId); // same module + int be = Hist::bin(y[i] + 1); + auto e = hist.end(be); + ++p; + assert(0 == nnn[k]); + for (; p < e; ++p) { + auto m = (*p) + firstPixel; + assert(m != i); + assert(int(y[m]) - int(y[i]) >= 0); + assert(int(y[m]) - int(y[i]) <= 1); + if (std::abs(int(x[m]) - int(x[i])) > 1) + continue; + auto l = nnn[k]++; + assert(l < maxNeighbours); + nn[k][l] = *p; + } } - } - // for each pixel, look at all the pixels until the end of the module; - // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; - // after the loop, all the pixel in each cluster should have the id equeal to the lowest - // pixel in the cluster ( clus[i] == i ). - bool more = true; - int nloops = 0; - while (__syncthreads_or(more)) { - if (1 == nloops % 2) { - for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { - auto p = hist.begin() + j; - auto i = *p + firstPixel; - auto m = clusterId[i]; - while (m != clusterId[m]) - m = clusterId[m]; - clusterId[i] = m; + // for each pixel, look at all the pixels until the end of the module; + // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; + // after the loop, all the pixel in each cluster should have the id equeal to the lowest + // pixel in the cluster ( clus[i] == i ). + bool more = true; + int nloops = 0; + while (__syncthreads_or(more)) { + if (1 == nloops % 2) { + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + auto m = clusterId[i]; + while (m != clusterId[m]) + m = clusterId[m]; + clusterId[i] = m; + } + } else { + more = false; + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + for (int kk = 0; kk < nnn[k]; ++kk) { + auto l = nn[k][kk]; + auto m = l + firstPixel; + assert(m != i); + auto old = atomicMin(&clusterId[m], clusterId[i]); + if (old != clusterId[i]) { + // end the loop only if no changes were applied + more = true; + } + atomicMin(&clusterId[i], old); + } // nnloop + } // pixel loop } - } else { - more = false; - for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { - auto p = hist.begin() + j; - auto i = *p + firstPixel; - for (int kk = 0; kk < nnn[k]; ++kk) { - auto l = nn[k][kk]; - auto m = l + firstPixel; - assert(m != i); - auto old = atomicMin(&clusterId[m], clusterId[i]); - if (old != clusterId[i]) { - // end the loop only if no changes were applied - more = true; - } - atomicMin(&clusterId[i], old); - } // nnloop - } // pixel loop - } - ++nloops; - } // end while + ++nloops; + } // end while #ifdef GPU_DEBUG - { - __shared__ int n0; - if (threadIdx.x == 0) - n0 = nloops; - __syncthreads(); - auto ok = n0 == nloops; - assert(__syncthreads_and(ok)); - if (thisModuleId % 100 == 1) + { + __shared__ int n0; if (threadIdx.x == 0) - printf("# loops %d\n", nloops); - } + n0 = nloops; + __syncthreads(); + auto ok = n0 == nloops; + assert(__syncthreads_and(ok)); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("# loops %d\n", nloops); + } #endif - __shared__ unsigned int foundClusters; - foundClusters = 0; - __syncthreads(); + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); - // find the number of different clusters, identified by a pixels with clus[i] == i; - // mark these pixels with a negative id. 
- for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (clusterId[i] == i) { - auto old = atomicInc(&foundClusters, 0xffffffff); - clusterId[i] = -(old + 1); + // find the number of different clusters, identified by a pixels with clus[i] == i; + // mark these pixels with a negative id. + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (clusterId[i] == i) { + auto old = atomicInc(&foundClusters, 0xffffffff); + clusterId[i] = -(old + 1); + } } - } - __syncthreads(); + __syncthreads(); - // propagate the negative id to all the pixels in the cluster. - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (clusterId[i] >= 0) { - // mark each pixel in a cluster with the same id as the first one - clusterId[i] = clusterId[clusterId[i]]; + // propagate the negative id to all the pixels in the cluster. + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (clusterId[i] >= 0) { + // mark each pixel in a cluster with the same id as the first one + clusterId[i] = clusterId[clusterId[i]]; + } } - } - __syncthreads(); + __syncthreads(); - // adjust the cluster id to be a positive value starting from 0 - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) { // skip invalid pixels - clusterId[i] = -9999; - continue; + // adjust the cluster id to be a positive value starting from 0 + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) { // skip invalid pixels + clusterId[i] = -9999; + continue; + } + clusterId[i] = -clusterId[i] - 1; } - clusterId[i] = -clusterId[i] - 1; - } - __syncthreads(); + __syncthreads(); - if (threadIdx.x == 0) { - nClustersInModule[thisModuleId] = foundClusters; - moduleId[blockIdx.x] = thisModuleId; + if (threadIdx.x == 0) { + nClustersInModule[thisModuleId] = foundClusters; + moduleId[module] = thisModuleId; #ifdef GPU_DEBUG - if (foundClusters > gMaxHit) { - gMaxHit = foundClusters; - if (foundClusters > 8) - printf("max hit %d in %d\n", foundClusters, thisModuleId); - } + if (foundClusters > gMaxHit) { + gMaxHit = foundClusters; + if (foundClusters > 8) + printf("max hit %d in %d\n", foundClusters, thisModuleId); + } #endif #ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - printf("%d clusters in module %d\n", foundClusters, thisModuleId); + if (thisModuleId % 100 == 1) + printf("%d clusters in module %d\n", foundClusters, thisModuleId); #endif - } + } + } // module loop } } // namespace gpuClustering diff --git a/src/cudacompat/test/gpuClustering_t.h b/src/cudacompat/test/gpuClustering_t.h index 5388e3499..55998dcda 100644 --- a/src/cudacompat/test/gpuClustering_t.h +++ b/src/cudacompat/test/gpuClustering_t.h @@ -15,7 +15,7 @@ #include "CUDACore/cudaCheck.h" #include "CUDACore/requireDevices.h" #include "CUDACore/launch.h" -#endif +#endif // __CUDACC__ // dirty, but works #include "plugin-SiPixelClusterizer/gpuClustering.h" @@ -24,7 +24,7 @@ int main(void) { #ifdef __CUDACC__ cms::cudatest::requireDevices(); -#endif +#endif // __CUDACC__ using namespace gpuClustering; @@ -46,7 +46,7 @@ int main(void) { auto d_moduleStart = cms::cuda::make_device_unique(MaxNumModules + 1, nullptr); auto d_clusInModule = cms::cuda::make_device_unique(MaxNumModules, nullptr); auto d_moduleId = cms::cuda::make_device_unique(MaxNumModules, nullptr); -#else +#else // __CUDACC__ auto h_moduleStart = 
std::make_unique(MaxNumModules + 1); auto h_clusInModule = std::make_unique(MaxNumModules); @@ -245,11 +245,11 @@ int main(void) { // size_t size8 = n * sizeof(uint8_t); cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice)); + // Launch CUDA Kernels int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256); int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; @@ -301,22 +301,13 @@ int main(void) { n); cudaDeviceSynchronize(); -#else +#else // __CUDACC__ h_moduleStart[0] = nModules; countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n); memset(h_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)); - gridDim.x = MaxNumModules; //not needed in the kernel for this specific case; - assert(blockIdx.x == 0); - for (; blockIdx.x < gridDim.x; ++blockIdx.x) - findClus(h_id.get(), - h_x.get(), - h_y.get(), - h_moduleStart.get(), - h_clusInModule.get(), - h_moduleId.get(), - h_clus.get(), - n); - resetGrid(); + + findClus( + h_id.get(), h_x.get(), h_y.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n); nModules = h_moduleStart[0]; auto nclus = h_clusInModule.get(); @@ -331,14 +322,10 @@ int main(void) { if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0)) std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl; - gridDim.x = MaxNumModules; // no needed in the kernel for in this specific case - assert(blockIdx.x == 0); - for (; blockIdx.x < gridDim.x; ++blockIdx.x) - clusterChargeCut( - h_id.get(), h_adc.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n); - resetGrid(); + clusterChargeCut( + h_id.get(), h_adc.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n); -#endif +#endif // __CUDACC__ std::cout << "found " << nModules << " Modules active" << std::endl; @@ -347,7 +334,7 @@ int main(void) { cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); -#endif +#endif // __CUDACC__ std::set clids; for (int i = 0; i < n; ++i) { From 16131354a17df75de384b7420e86995c324f3797 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 29 Dec 2020 09:03:26 -0800 Subject: [PATCH 4/6] [cudacompat] Simplify cudacompat layer to use a 1-dimensional grid (cms-patatrack/cmssw#586) --- src/cudacompat/CUDACore/cudaCompat.cc | 17 ------- src/cudacompat/CUDACore/cudaCompat.h | 46 ++++++------------- .../gpuVertexFinderImpl.h | 4 -- src/cudacompat/test/VertexFinder_t.h | 3 -- src/cudacompat/test/gpuClustering_t.h | 8 ++-- 5 files changed, 16 insertions(+), 62 deletions(-) delete mode 100644 src/cudacompat/CUDACore/cudaCompat.cc diff --git a/src/cudacompat/CUDACore/cudaCompat.cc b/src/cudacompat/CUDACore/cudaCompat.cc deleted file mode 100644 index e6bb8069d..000000000 --- a/src/cudacompat/CUDACore/cudaCompat.cc +++ /dev/null @@ -1,17 +0,0 @@ -#include "CUDACore/cudaCompat.h" - -namespace cms { - namespace cudacompat { - thread_local dim3 blockIdx; - thread_local dim3 
gridDim; - } // namespace cudacompat -} // namespace cms - -namespace { - struct InitGrid { - InitGrid() { cms::cudacompat::resetGrid(); } - }; - - const InitGrid initGrid; - -} // namespace diff --git a/src/cudacompat/CUDACore/cudaCompat.h b/src/cudacompat/CUDACore/cudaCompat.h index f9b4b2f8a..1cea2b254 100644 --- a/src/cudacompat/CUDACore/cudaCompat.h +++ b/src/cudacompat/CUDACore/cudaCompat.h @@ -11,21 +11,25 @@ #include #include +// include the CUDA runtime header to define some of the attributes, types and sybols also on the CPU #include +// make sure function are inlined to avoid multiple definition +#undef __global__ +#define __global__ inline __attribute__((always_inline)) + +#undef __forceinline__ +#define __forceinline__ inline __attribute__((always_inline)) + namespace cms { namespace cudacompat { -#ifndef __CUDA_RUNTIME_H__ - struct dim3 { - uint32_t x, y, z; - }; -#endif const dim3 threadIdx = {0, 0, 0}; const dim3 blockDim = {1, 1, 1}; - extern thread_local dim3 blockIdx; - extern thread_local dim3 gridDim; + // 1-dimensional grid + const dim3 blockIdx = {0, 0, 0}; + const dim3 gridDim = {1, 1, 1}; template T1 atomicCAS(T1* address, T1 compare, T2 val) { @@ -77,36 +81,12 @@ namespace cms { inline T __ldg(T const* x) { return *x; } - - inline void resetGrid() { - blockIdx = {0, 0, 0}; - gridDim = {1, 1, 1}; - } - } // namespace cudacompat } // namespace cms -// some not needed as done by cuda runtime... -#ifndef __CUDA_RUNTIME_H__ -#define __host__ -#define __device__ -#define __global__ -#define __shared__ -#define __forceinline__ -#endif - -// make sure function are inlined to avoid multiple definition -#ifndef __CUDA_ARCH__ -#undef __global__ -#define __global__ inline __attribute__((always_inline)) -#undef __forceinline__ -#define __forceinline__ inline __attribute__((always_inline)) -#endif - -#ifndef __CUDA_ARCH__ +// make the cudacompat implementation available in the global namespace using namespace cms::cudacompat; -#endif -#endif +#endif // __CUDACC__ #endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h diff --git a/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinderImpl.h b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinderImpl.h index f3260cad7..9a49c66a0 100644 --- a/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinderImpl.h +++ b/src/cudacompat/plugin-PixelVertexFinding/gpuVertexFinderImpl.h @@ -109,7 +109,6 @@ namespace gpuVertexFinder { loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); cudaCheck(cudaGetLastError()); #else - cms::cudacompat::resetGrid(); init(soa, ws_d.get()); loadTracks(tksoa, soa, ws_d.get(), ptMin); #endif @@ -157,10 +156,7 @@ namespace gpuVertexFinder { // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; fitVertices(soa, ws_d.get(), 50.); // one block per vertex! 
- blockIdx.x = 0; - gridDim.x = 1; splitVertices(soa, ws_d.get(), 9.f); - resetGrid(); fitVertices(soa, ws_d.get(), 5000.); sortByPt2(soa, ws_d.get()); #endif diff --git a/src/cudacompat/test/VertexFinder_t.h b/src/cudacompat/test/VertexFinder_t.h index 53f26d2de..aed660c0d 100644 --- a/src/cudacompat/test/VertexFinder_t.h +++ b/src/cudacompat/test/VertexFinder_t.h @@ -266,10 +266,7 @@ int main() { cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - gridDim.x = 1; - assert(blockIdx.x == 0); splitVertices(onGPU_d.get(), ws_d.get(), 9.f); - resetGrid(); nv = ws_d->nvIntermediate; #endif std::cout << "after split " << nv << std::endl; diff --git a/src/cudacompat/test/gpuClustering_t.h b/src/cudacompat/test/gpuClustering_t.h index 55998dcda..8c0f18b25 100644 --- a/src/cudacompat/test/gpuClustering_t.h +++ b/src/cudacompat/test/gpuClustering_t.h @@ -11,10 +11,10 @@ #ifdef __CUDACC__ -#include "CUDACore/device_unique_ptr.h" #include "CUDACore/cudaCheck.h" -#include "CUDACore/requireDevices.h" +#include "CUDACore/device_unique_ptr.h" #include "CUDACore/launch.h" +#include "CUDACore/requireDevices.h" #endif // __CUDACC__ // dirty, but works @@ -34,7 +34,6 @@ int main(void) { auto h_x = std::make_unique(numElements); auto h_y = std::make_unique(numElements); auto h_adc = std::make_unique(numElements); - auto h_clus = std::make_unique(numElements); #ifdef __CUDACC__ @@ -47,11 +46,9 @@ int main(void) { auto d_clusInModule = cms::cuda::make_device_unique(MaxNumModules, nullptr); auto d_moduleId = cms::cuda::make_device_unique(MaxNumModules, nullptr); #else // __CUDACC__ - auto h_moduleStart = std::make_unique(MaxNumModules + 1); auto h_clusInModule = std::make_unique(MaxNumModules); auto h_moduleId = std::make_unique(MaxNumModules); - #endif // later random number @@ -302,6 +299,7 @@ int main(void) { cudaDeviceSynchronize(); #else // __CUDACC__ + h_moduleStart[0] = nModules; countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n); memset(h_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)); From f4d3368a953fddcafbe92e2aa9ba90fbf3398794 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 29 Dec 2020 11:20:20 -0800 Subject: [PATCH 5/6] [cudacompat] Make getHits() kernel independent of grid size --- .../plugin-SiPixelRecHits/gpuPixelRecHits.h | 296 +++++++++--------- 1 file changed, 150 insertions(+), 146 deletions(-) diff --git a/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h index d58984893..0eac1abdf 100644 --- a/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h +++ b/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h @@ -61,160 +61,164 @@ namespace gpuPixelRecHits { // as usual one block per module __shared__ ClusParams clusParams; - auto me = clusters.moduleId(blockIdx.x); - int nclus = clusters.clusInModule(me); + auto firstModule = blockIdx.x; + auto endModule = clusters.moduleStart(0); + for (auto module = firstModule; module < endModule; module += gridDim.x) { + auto me = clusters.moduleId(module); + int nclus = clusters.clusInModule(me); - if (0 == nclus) - return; + if (0 == nclus) + continue; #ifdef GPU_DEBUG - if (threadIdx.x == 0) { - auto k = clusters.moduleStart(1 + blockIdx.x); - while (digis.moduleInd(k) == InvId) - ++k; - assert(digis.moduleInd(k) == me); - } + if (threadIdx.x == 0) { + auto k = clusters.moduleStart(1 + module); + while 
(digis.moduleInd(k) == InvId) + ++k; + assert(digis.moduleInd(k) == me); + } #endif #ifdef GPU_DEBUG - if (me % 100 == 1) - if (threadIdx.x == 0) - printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); + if (me % 100 == 1) + if (threadIdx.x == 0) + printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); #endif - for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { - auto first = clusters.moduleStart(1 + blockIdx.x); - - int nClusInIter = std::min(MaxHitsInIter, endClus - startClus); - int lastClus = startClus + nClusInIter; - assert(nClusInIter <= nclus); - assert(nClusInIter > 0); - assert(lastClus <= nclus); - - assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus)); - - // init - for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { - clusParams.minRow[ic] = std::numeric_limits::max(); - clusParams.maxRow[ic] = 0; - clusParams.minCol[ic] = std::numeric_limits::max(); - clusParams.maxCol[ic] = 0; - clusParams.charge[ic] = 0; - clusParams.Q_f_X[ic] = 0; - clusParams.Q_l_X[ic] = 0; - clusParams.Q_f_Y[ic] = 0; - clusParams.Q_l_Y[ic] = 0; - } - - first += threadIdx.x; - - __syncthreads(); - - // one thead per "digi" - - for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); - if (id == InvId) - continue; // not valid - if (id != me) - break; // end of module - auto cl = digis.clus(i); - if (cl < startClus || cl >= lastClus) - continue; - auto x = digis.xx(i); - auto y = digis.yy(i); - cl -= startClus; - assert(cl >= 0); - assert(cl < MaxHitsInIter); - atomicMin(&clusParams.minRow[cl], x); - atomicMax(&clusParams.maxRow[cl], x); - atomicMin(&clusParams.minCol[cl], y); - atomicMax(&clusParams.maxCol[cl], y); - } - - __syncthreads(); - - // pixmx is not available in the binary dumps - //auto pixmx = cpeParams->detParams(me).pixmx; - auto pixmx = std::numeric_limits::max(); - for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); - if (id == InvId) - continue; // not valid - if (id != me) - break; // end of module - auto cl = digis.clus(i); - if (cl < startClus || cl >= lastClus) - continue; - cl -= startClus; - assert(cl >= 0); - assert(cl < MaxHitsInIter); - auto x = digis.xx(i); - auto y = digis.yy(i); - auto ch = std::min(digis.adc(i), pixmx); - atomicAdd(&clusParams.charge[cl], ch); - if (clusParams.minRow[cl] == x) - atomicAdd(&clusParams.Q_f_X[cl], ch); - if (clusParams.maxRow[cl] == x) - atomicAdd(&clusParams.Q_l_X[cl], ch); - if (clusParams.minCol[cl] == y) - atomicAdd(&clusParams.Q_f_Y[cl], ch); - if (clusParams.maxCol[cl] == y) - atomicAdd(&clusParams.Q_l_Y[cl], ch); - } - - __syncthreads(); - - // next one cluster per thread... - - first = clusters.clusModuleStart(me) + startClus; - - for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { - auto h = first + ic; // output index in global memory - - // this cannot happen anymore - if (h >= TrackingRecHit2DSOAView::maxHits()) - break; // overflow... 
- assert(h < hits.nHits()); - assert(h < clusters.clusModuleStart(me + 1)); - - pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); - pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); - - // store it - - hits.charge(h) = clusParams.charge[ic]; - - hits.detectorIndex(h) = me; - - float xl, yl; - hits.xLocal(h) = xl = clusParams.xpos[ic]; - hits.yLocal(h) = yl = clusParams.ypos[ic]; - - hits.clusterSizeX(h) = clusParams.xsize[ic]; - hits.clusterSizeY(h) = clusParams.ysize[ic]; - - hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic]; - hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic]; - - // keep it local for computations - float xg, yg, zg; - // to global and compute phi... - cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg); - // here correct for the beamspot... - xg -= bs->x; - yg -= bs->y; - zg -= bs->z; - - hits.xGlobal(h) = xg; - hits.yGlobal(h) = yg; - hits.zGlobal(h) = zg; - - hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); - hits.iphi(h) = unsafe_atan2s<7>(yg, xg); - } - __syncthreads(); - } // end loop on batches + for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { + auto first = clusters.moduleStart(1 + module); + + int nClusInIter = std::min(MaxHitsInIter, endClus - startClus); + int lastClus = startClus + nClusInIter; + assert(nClusInIter <= nclus); + assert(nClusInIter > 0); + assert(lastClus <= nclus); + + assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus)); + + // init + for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { + clusParams.minRow[ic] = std::numeric_limits::max(); + clusParams.maxRow[ic] = 0; + clusParams.minCol[ic] = std::numeric_limits::max(); + clusParams.maxCol[ic] = 0; + clusParams.charge[ic] = 0; + clusParams.Q_f_X[ic] = 0; + clusParams.Q_l_X[ic] = 0; + clusParams.Q_f_Y[ic] = 0; + clusParams.Q_l_Y[ic] = 0; + } + + first += threadIdx.x; + + __syncthreads(); + + // one thead per "digi" + + for (int i = first; i < numElements; i += blockDim.x) { + auto id = digis.moduleInd(i); + if (id == InvId) + continue; // not valid + if (id != me) + break; // end of module + auto cl = digis.clus(i); + if (cl < startClus || cl >= lastClus) + continue; + auto x = digis.xx(i); + auto y = digis.yy(i); + cl -= startClus; + assert(cl >= 0); + assert(cl < MaxHitsInIter); + atomicMin(&clusParams.minRow[cl], x); + atomicMax(&clusParams.maxRow[cl], x); + atomicMin(&clusParams.minCol[cl], y); + atomicMax(&clusParams.maxCol[cl], y); + } + + __syncthreads(); + + // pixmx is not available in the binary dumps + //auto pixmx = cpeParams->detParams(me).pixmx; + auto pixmx = std::numeric_limits::max(); + for (int i = first; i < numElements; i += blockDim.x) { + auto id = digis.moduleInd(i); + if (id == InvId) + continue; // not valid + if (id != me) + break; // end of module + auto cl = digis.clus(i); + if (cl < startClus || cl >= lastClus) + continue; + cl -= startClus; + assert(cl >= 0); + assert(cl < MaxHitsInIter); + auto x = digis.xx(i); + auto y = digis.yy(i); + auto ch = std::min(digis.adc(i), pixmx); + atomicAdd(&clusParams.charge[cl], ch); + if (clusParams.minRow[cl] == x) + atomicAdd(&clusParams.Q_f_X[cl], ch); + if (clusParams.maxRow[cl] == x) + atomicAdd(&clusParams.Q_l_X[cl], ch); + if (clusParams.minCol[cl] == y) + atomicAdd(&clusParams.Q_f_Y[cl], ch); + if (clusParams.maxCol[cl] == y) + atomicAdd(&clusParams.Q_l_Y[cl], ch); + } + + __syncthreads(); + + // next 
one cluster per thread... + + first = clusters.clusModuleStart(me) + startClus; + + for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { + auto h = first + ic; // output index in global memory + + // this cannot happen anymore + if (h >= TrackingRecHit2DSOAView::maxHits()) + break; // overflow... + assert(h < hits.nHits()); + assert(h < clusters.clusModuleStart(me + 1)); + + pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); + pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); + + // store it + + hits.charge(h) = clusParams.charge[ic]; + + hits.detectorIndex(h) = me; + + float xl, yl; + hits.xLocal(h) = xl = clusParams.xpos[ic]; + hits.yLocal(h) = yl = clusParams.ypos[ic]; + + hits.clusterSizeX(h) = clusParams.xsize[ic]; + hits.clusterSizeY(h) = clusParams.ysize[ic]; + + hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic]; + hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic]; + + // keep it local for computations + float xg, yg, zg; + // to global and compute phi... + cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg); + // here correct for the beamspot... + xg -= bs->x; + yg -= bs->y; + zg -= bs->z; + + hits.xGlobal(h) = xg; + hits.yGlobal(h) = yg; + hits.zGlobal(h) = zg; + + hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); + hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + } + __syncthreads(); + } // end loop on batches + } // loop over modules } } // namespace gpuPixelRecHits From 4de6a07905326cf86d7ad2434f7f18f01f3a891b Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 29 Dec 2020 13:48:45 -0800 Subject: [PATCH 6/6] Update README and run-scan.py --- README.md | 12 ++++++++++++ run-scan.py | 1 + 2 files changed, 13 insertions(+) diff --git a/README.md b/README.md index a1a0e99c4..750de8a12 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ * [`cuda`](#cuda) * [`cudadev`](#cudadev) * [`cudauvm`](#cudauvm) + * [`cudacompat`](#cudacompat) * [`kokkos` and `kokkostest`](#kokkos-and-kokkostest) * [Code structure](#code-structure) * [Build system](#build-system) @@ -37,6 +38,7 @@ In addition, the individual programs assume the following be found from the syst | `cuda` | | :heavy_check_mark: | | | `cudadev` | | :heavy_check_mark: | | | `cudauvm` | | :heavy_check_mark: | | +| `cudacompat` | | :heavy_check_mark: | | | `kokkostest` | :heavy_check_mark: | :heavy_check_mark: | | | `kokkos` | :heavy_check_mark: | :heavy_check_mark: | | | `alpakatest` | | :heavy_check_mark: | | @@ -54,6 +56,7 @@ All other dependencies (listed below) are downloaded and built automatically | `cuda` | :heavy_check_mark: | :heavy_check_mark: | | | | | `cudadev` | :heavy_check_mark: | :heavy_check_mark: | | | | | `cudauvm` | :heavy_check_mark: | :heavy_check_mark: | | | | +| `cudacompat` | :heavy_check_mark: | :heavy_check_mark: | | | | | `kokkostest` | :heavy_check_mark: | | :heavy_check_mark: | | | | `kokkos` | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | `alpakatest` | :heavy_check_mark: | | | :heavy_check_mark: | :heavy_check_mark: | @@ -79,6 +82,7 @@ downloaded automatically during the build process. 
| `cuda` | CUDA version (frozen) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | `cudadev` | CUDA version (development) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | `cudauvm` | CUDA version with managed memory | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| `cudacompat` | CPU version (with `cudaCompat`) | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | `kokkostest` | Kokkos FW test | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | | | | | | `kokkos` | Kokkos version | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | `alpakatest` | Alpaka FW test | :heavy_check_mark: | | :white_check_mark: | | | | | | | | @@ -183,6 +187,14 @@ To use managed memory also for temporary device-only allocations, compile with make cudauvm ... USER_CXXFLAGS="-DCUDAUVM_MANAGED_TEMPORARY" ``` +#### `cudacompat` + +This program is a fork of `cuda` by extending the use of `cudaCompat` to clustering and RecHits. The aim is to run the same code on CPU. Currently, however, the program requires a GPU because of (still) using pinned host memory in a few places. In the future the program could be extended to provide both CUDA and CPU flavors. + +The program contains the changes from following external PRs on top of `cuda` +* [cms-patatrack/cmssw#586](https://github.com/cms-patatrack/cmssw/pull/586) +* [cms-patatrack/cmssw#588](https://github.com/cms-patatrack/cmssw/pull/588) + #### `kokkos` and `kokkostest` ```bash diff --git a/run-scan.py b/run-scan.py index 2df443e11..b1d7b7e9d 100755 --- a/run-scan.py +++ b/run-scan.py @@ -14,6 +14,7 @@ "fwtest": 1, "cuda": {"": 100, "transfer": 100}, "cudauvm": {"": 100, "transfer": 100}, + "cudacompat": {"": 8}, } result_re = re.compile("Processed (?P\d+) events in (?P