From ec0cc29ba334afa5678dddf3461ba1018f3887aa Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Thu, 1 Mar 2018 20:57:16 +0100
Subject: [PATCH 001/102] Use the `gpu` modifier to read the pixel clusters from the unpacker (cms-patatrack#31)

When running the GPU algorithms, the pixel unpacker is responsible for providing both the digis and the clusters.

These changes make use of the unpacker label to access the clusters, conditionally on the presence of the `gpu` process modifier.
---
 .../python/pixelTracksMonitoring_cff.py      |   2 +
 .../RecoTrack/python/TrackValidation_cff.py  | 445 ++++++------------
 2 files changed, 136 insertions(+), 311 deletions(-)

diff --git a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py
index a075f671f05ce..711d757c94311 100644
--- a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py
+++ b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from Configuration.ProcessModifiers.gpu_cff import gpu
 import DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi
 pixelTracksMonitoring = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone()
@@ -21,3 +22,4 @@
 pixelTracksMonitoring.doPlotsVsLUMI = True
 pixelTracksMonitoring.doPlotsVsBX = True
+gpu.toModify(pixelTracksMonitoring, pixelCluster4lumi = "siPixelDigis")
diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py
index a425c88ef1021..87b1706b026aa 100644
--- a/Validation/RecoTrack/python/TrackValidation_cff.py
+++ b/Validation/RecoTrack/python/TrackValidation_cff.py
@@ -1,7 +1,6 @@
-from __future__ import absolute_import
 import FWCore.ParameterSet.Config as cms
-from SimTracker.TrackAssociatorProducers.trackAssociatorByChi2_cfi import *
+import SimTracker.TrackAssociatorProducers.trackAssociatorByChi2_cfi
 from SimTracker.TrackAssociatorProducers.quickTrackAssociatorByHits_cfi import *
 from SimTracker.TrackAssociation.trackingParticleRecoTrackAsssociation_cfi import *
 import Validation.RecoTrack.MultiTrackValidator_cfi
@@ -9,8 +8,7 @@
 from SimTracker.TrackAssociation.LhcParametersDefinerForTP_cfi import *
 from SimTracker.TrackAssociation.CosmicParametersDefinerForTP_cfi import *
 from Validation.RecoTrack.PostProcessorTracker_cfi import *
-import Validation.RecoTrack.cutsRecoTracks_cfi as cutsRecoTracks_cfi
-#from . 
import cutsRecoTracks_cfi +import cutsRecoTracks_cfi from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import * from SimTracker.VertexAssociation.VertexAssociatorByPositionAndTracks_cfi import * @@ -23,7 +21,7 @@ import RecoTracker.IterativeTracking.iterativeTkConfig as _cfg import RecoTracker.IterativeTracking.iterativeTkUtils as _utils from Configuration.Eras.Modifier_fastSim_cff import fastSim -import six +from Configuration.ProcessModifiers.gpu_cff import gpu ### First define the stuff for the standard validation sequence ## Track selectors @@ -35,10 +33,8 @@ _trackProd = [] locals()["_algos"+_postfix] = ["generalTracks"] + _cfg.iterationAlgos(_postfix) + ["duplicateMerge"] - locals()["_seedProducersPreSplitting"+_postfix] = _seedProd - locals()["_trackProducersPreSplitting"+_postfix] = _trackProd - locals()["_seedProducers"+_postfix] = _cfg.seedProducers(_postfix) - locals()["_trackProducers"+_postfix] = _cfg.trackProducers(_postfix) + locals()["_seedProducers"+_postfix] = _seedProd + _cfg.seedProducers(_postfix) + locals()["_trackProducers"+_postfix] = _trackProd + _cfg.trackProducers(_postfix) if _eraName != "trackingPhase2PU140": locals()["_electronSeedProducers"+_postfix] = ["tripletElectronSeeds", "pixelPairElectronSeeds", "stripPairElectronSeeds"] @@ -65,24 +61,21 @@ def _algoToSelector(algo): def _addSelectorsByAlgo(algos, modDict): names = [] - task = cms.Task() + seq = cms.Sequence() for algo in algos: if algo == "generalTracks": continue modName = _algoToSelector(algo) if modName not in modDict: - mod = cutsRecoTracks_cfi.cutsRecoTracks.clone( -# src = [src], - algorithm=[algo] - ) + mod = cutsRecoTracks_cfi.cutsRecoTracks.clone(algorithm=[algo]) modDict[modName] = mod else: mod = modDict[modName] names.append(modName) - task.add(mod) - return (names, task) + seq += mod + return (names, seq) def _addSelectorsByHp(algos, modDict): - task = cms.Task() + seq = cms.Sequence() names = [] for algo in algos: modName = _algoToSelector(algo) @@ -96,10 +89,10 @@ def _addSelectorsByHp(algos, modDict): else: mod = modDict[modNameHp] names.append(modNameHp) - task.add(mod) - return (names, task) + seq += mod + return (names, seq) def _addSelectorsBySrc(modules, midfix, src, modDict): - task = cms.Task() + seq = cms.Sequence() names = [] for modName in modules: modNameNew = modName.replace("cutsRecoTracks", "cutsRecoTracks"+midfix) @@ -109,10 +102,10 @@ def _addSelectorsBySrc(modules, midfix, src, modDict): else: mod = modDict[modNameNew] names.append(modNameNew) - task.add(mod) - return (names, task) + seq += mod + return (names, seq) def _addSelectorsByOriginalAlgoMask(modules, midfix, algoParam,modDict): - task = cms.Task() + seq = cms.Sequence() names = [] for modName in modules: if modName[-2:] == "Hp": @@ -127,11 +120,11 @@ def _addSelectorsByOriginalAlgoMask(modules, midfix, algoParam,modDict): else: mod = modDict[modNameNew] names.append(modNameNew) - task.add(mod) - return (names, task) + seq += mod + return (names, seq) def _addSeedToTrackProducers(seedProducers,modDict): names = [] - task = cms.Task() + seq = cms.Sequence() for seed in seedProducers: modName = "seedTracks"+seed if modName not in modDict: @@ -140,8 +133,8 @@ def _addSeedToTrackProducers(seedProducers,modDict): else: mod = modDict[modName] names.append(modName) - task.add(mod) - return (names, task) + seq += mod + return (names, seq) _relevantEras = _cfg.allEras() _relevantErasAndFastSim = _relevantEras + [("fastSim", "_fastSim", fastSim)] @@ -153,9 +146,9 @@ def _translateArgs(args, postfix, modDict): 
else: ret.append(modDict[arg+postfix]) return ret -def _taskForEachEra(function, args, names, task, modDict, plainArgs=[], modifyTask=None, includeFastSim=False): - if task[0] != "_": - raise Exception("Task name is expected to begin with _") +def _sequenceForEachEra(function, args, names, sequence, modDict, plainArgs=[], modifySequence=None, includeFastSim=False): + if sequence[0] != "_": + raise Exception("Sequence name is expected to begin with _") _eras = _relevantErasAndFastSim if includeFastSim else _relevantEras for eraName, postfix, _era in _eras: @@ -163,26 +156,26 @@ def _taskForEachEra(function, args, names, task, modDict, plainArgs=[], modifyTa _args.extend(plainArgs) ret = function(*_args, modDict=modDict) if len(ret) != 2: - raise Exception("_taskForEachEra is expected to return 2 values, but function returned %d" % len(ret)) + raise Exception("_sequenceForEachEra is expected to return 2 values, but function returned %d" % len(ret)) modDict[names+postfix] = ret[0] - modDict[task+postfix] = ret[1] + modDict[sequence+postfix] = ret[1] - # The task of the first era will be the default one - defaultTaskName = task+_eras[0][0] - defaultTask = modDict[defaultTaskName] - modDict[defaultTaskName[1:]] = defaultTask # remove leading underscore + # The sequence of the first era will be the default one + defaultSequenceName = sequence+_eras[0][0] + defaultSequence = modDict[defaultSequenceName] + modDict[defaultSequenceName[1:]] = defaultSequence # remove leading underscore - # Optionally modify task before applying the era - if modifyTask is not None: + # Optionally modify sequences before applying the era + if modifySequence is not None: for eraName, postfix, _era in _eras: - modifyTask(modDict[task+postfix]) + modifySequence(modDict[sequence+postfix]) # Apply eras for _eraName, _postfix, _era in _eras[1:]: - _era.toReplaceWith(defaultTask, modDict[task+_postfix]) + _era.toReplaceWith(defaultSequence, modDict[sequence+_postfix]) def _setForEra(module, eraName, era, **kwargs): if eraName == "": - for key, value in six.iteritems(kwargs): + for key, value in kwargs.iteritems(): setattr(module, key, value) else: era.toModify(module, **kwargs) @@ -242,28 +235,28 @@ def _getMVASelectors(postfix): mvaSel = _utils.getMVASelectors(postfix) pset = cms.untracked.PSet() - for iteration, (trackProducer, classifiers) in six.iteritems(mvaSel): + for iteration, (trackProducer, classifiers) in mvaSel.iteritems(): setattr(pset, trackProducer, cms.untracked.vstring(classifiers)) return pset for _eraName, _postfix, _era in _relevantEras: locals()["_mvaSelectors"+_postfix] = _getMVASelectors(_postfix) # Validation iterative steps -_taskForEachEra(_addSelectorsByAlgo, args=["_algos"], names="_selectorsByAlgo", task="_tracksValidationSelectorsByAlgo", modDict=globals()) +_sequenceForEachEra(_addSelectorsByAlgo, args=["_algos"], names="_selectorsByAlgo", sequence="_tracksValidationSelectorsByAlgo", modDict=globals()) # high purity -_taskForEachEra(_addSelectorsByHp, args=["_algos"], names="_selectorsByAlgoHp", task="_tracksValidationSelectorsByAlgoHp", modDict=globals()) +_sequenceForEachEra(_addSelectorsByHp, args=["_algos"], names="_selectorsByAlgoHp", sequence="_tracksValidationSelectorsByAlgoHp", modDict=globals()) # by originalAlgo for _eraName, _postfix, _era in _relevantEras: locals()["_selectorsByAlgoAndHp"+_postfix] = locals()["_selectorsByAlgo"+_postfix] + locals()["_selectorsByAlgoHp"+_postfix] # For ByAlgoMask - locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix] = [n for n in 
locals()["_selectorsByAlgoAndHp"+_postfix] if n not in ["generalTracks", "cutsRecoTracksHp"]] + locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix] = filter(lambda n: n not in ["generalTracks", "cutsRecoTracksHp"], locals()["_selectorsByAlgoAndHp"+_postfix]) # For ByOriginalAlgo - locals()["_selectorsByAlgoAndHpNoGenTkDupMerge"+_postfix] = [n for n in locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix] if n not in ["cutsRecoTracksDuplicateMerge", "cutsRecoTracksDuplicateMergeHp"]] -_taskForEachEra(_addSelectorsByOriginalAlgoMask, modDict = globals(), + locals()["_selectorsByAlgoAndHpNoGenTkDupMerge"+_postfix] = filter(lambda n: n not in ["cutsRecoTracksDuplicateMerge", "cutsRecoTracksDuplicateMergeHp"], locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix]) +_sequenceForEachEra(_addSelectorsByOriginalAlgoMask, modDict = globals(), args = ["_selectorsByAlgoAndHpNoGenTkDupMerge"], plainArgs = ["ByOriginalAlgo", "originalAlgorithm"], - names = "_selectorsByOriginalAlgo", task = "_tracksValidationSelectorsByOriginalAlgo") + names = "_selectorsByOriginalAlgo", sequence = "_tracksValidationSelectorsByOriginalAlgo") for _eraName, _postfix, _era in _relevantEras: @@ -281,7 +274,7 @@ def _getMVASelectors(postfix): jets = "ak4PFJets" ) from JetMETCorrections.Configuration.JetCorrectors_cff import * -import JetMETCorrections.JetCorrector.jetTracksAssociationToTrackRefs_cfi as jetTracksAssociationToTrackRefs_cfi +import CommonTools.RecoAlgos.jetTracksAssociationToTrackRefs_cfi as jetTracksAssociationToTrackRefs_cfi cutsRecoTracksAK4PFJets = jetTracksAssociationToTrackRefs_cfi.jetTracksAssociationToTrackRefs.clone( association = "ak4JetTracksAssociatorExplicitAll", jets = "ak4PFJets", @@ -303,11 +296,11 @@ def _getMVASelectors(postfix): # select tracks with pT > 0.9 GeV (for upgrade fake rates) generalTracksPt09 = cutsRecoTracks_cfi.cutsRecoTracks.clone(ptMin=0.9) # and then the selectors -_taskForEachEra(_addSelectorsBySrc, modDict=globals(), - args=[["_generalTracksHp"]], - plainArgs=["Pt09", "generalTracksPt09"], - names="_selectorsPt09", task="_tracksValidationSelectorsPt09", - modifyTask=lambda task:task.add(generalTracksPt09)) +_sequenceForEachEra(_addSelectorsBySrc, modDict=globals(), + args=[["_generalTracksHp"]], + plainArgs=["Pt09", "generalTracksPt09"], + names="_selectorsPt09", sequence="_tracksValidationSelectorsPt09", + modifySequence=lambda seq:seq.insert(0, generalTracksPt09)) # select tracks from the PV from CommonTools.RecoAlgos.TrackWithVertexRefSelector_cfi import trackWithVertexRefSelector as _trackWithVertexRefSelector @@ -324,20 +317,20 @@ def _getMVASelectors(postfix): rhoVtx = 1e10, # intentionally no dxy cut ) # and then the selectors -_taskForEachEra(_addSelectorsBySrc, modDict=globals(), +_sequenceForEachEra(_addSelectorsBySrc, modDict=globals(), args=[["_generalTracksHp"]], plainArgs=["FromPV", "generalTracksFromPV"], - names="_selectorsFromPV", task="_tracksValidationSelectorsFromPV", - modifyTask=lambda task: task.add(generalTracksFromPV)) + names="_selectorsFromPV", sequence="_tracksValidationSelectorsFromPV", + modifySequence=lambda seq: seq.insert(0, generalTracksFromPV)) # select tracks with pT > 0.9 GeV from the PV generalTracksFromPVPt09 = generalTracksPt09.clone(src="generalTracksFromPV") # and then the selectors -_taskForEachEra(_addSelectorsBySrc, modDict=globals(), - args=[["_generalTracksHp"]], - plainArgs=["FromPVPt09", "generalTracksFromPVPt09"], - names="_selectorsFromPVPt09", task="_tracksValidationSelectorsFromPVPt09", - modifyTask=lambda task: 
task.add(generalTracksFromPVPt09)) +_sequenceForEachEra(_addSelectorsBySrc, modDict=globals(), + args=[["_generalTracksHp"]], + plainArgs=["FromPVPt09", "generalTracksFromPVPt09"], + names="_selectorsFromPVPt09", sequence="_tracksValidationSelectorsFromPVPt09", + modifySequence=lambda seq: seq.insert(0, generalTracksFromPVPt09)) ## Select conversion TrackingParticles, and define the corresponding associator trackingParticlesConversion = _trackingParticleConversionRefSelector.clone() @@ -353,15 +346,6 @@ def _getMVASelectors(postfix): ptMin = 0, ) -#ByChi2 association (for jetCore usage, not used by default) -MTVTrackAssociationByChi2 = trackingParticleRecoTrackAsssociation.clone( - associator = cms.InputTag('trackAssociatorByChi2') -) - -# Select jets for JetCore tracking -highPtJets = cms.EDFilter("CandPtrSelector", src = cms.InputTag("ak4CaloJets"), cut = cms.string("pt()>1000")) -highPtJetsForTrk = highPtJetsForTrk = highPtJets.clone(src = "ak4CaloJetsForTrk") - # Select B-hadron TPs trackingParticlesBHadron = _trackingParticleBHadronRefSelector.clone() @@ -369,13 +353,12 @@ def _getMVASelectors(postfix): trackValidator = Validation.RecoTrack.MultiTrackValidator_cfi.multiTrackValidator.clone( useLogPt = cms.untracked.bool(True), dodEdxPlots = True, - # associators=cms.untracked.VInputTag('MTVTrackAssociationByChi2'), #uncomment for byChi2 assoc. for jetcore studies (1/5) doPVAssociationPlots = True #,minpT = cms.double(-1) #,maxpT = cms.double(3) #,nintpT = cms.int32(40) ) -fastSim.toModify(trackValidator, +fastSim.toModify(trackValidator, dodEdxPlots = False) for _eraName, _postfix, _era in _relevantEras: @@ -393,7 +376,6 @@ def _getMVASelectors(postfix): locals()["_generalTracksHp"+_postfix], "generalTracksPt09", "cutsRecoTracksBtvLike", - "cutsRecoTracksJetCoreRegionalStepByOriginalAlgo", ] ) _setForEra(trackValidator.histoProducerAlgoBlock, _eraName, _era, seedingLayerSets=locals()["_seedingLayerSets"+_postfix]) @@ -414,67 +396,6 @@ def _getMVASelectors(postfix): doResolutionPlotsForLabels = ["disabled"], # resolutions are same as in trackValidator, no need to repeat here ) -## Select signal TrackingParticles, and do the corresponding associations -trackingParticlesEtaGreater2p7 = _trackingParticleRefSelector.clone( - signalOnly = cms.bool(False), - tip = 1e5, - lip = 1e5, - minRapidity = -2.7, - maxRapidity = 2.7, - invertRapidityCut = cms.bool(True), - ptMin = 0, -) - - -# select tracks with |eta| > 2.7 -generalTracksEtaGreater2p7 = cutsRecoTracks_cfi.cutsRecoTracks.clone( - minRapidity = cms.double(-2.7), - maxRapidity = cms.double( 2.7), - invertRapidityCut = cms.bool(True) -) - -_taskForEachEra(_addSelectorsBySrc, modDict=globals(), - args=[["_generalTracksHp"]], - plainArgs=["EtaGreater2p7", "generalTracksEtaGreater2p7"], - names="_selectorsEtaGreater2p7", task="_tracksValidationSelectorsEtaGreater2p7", - modifyTask=lambda task: task.add(generalTracksEtaGreater2p7)) - -# for high-eta (phase2 : |eta| > 2.7) -trackValidatorTPEtaGreater2p7 = trackValidator.clone( - dirName = "Tracking/TrackTPEtaGreater2p7/", - label_tp_effic = "trackingParticlesEtaGreater2p7", - label_tp_fake = "trackingParticlesEtaGreater2p7", - label_tp_effic_refvector = True, - label_tp_fake_refvector = True, - dodEdxPlots = False, -# doPVAssociationPlots = False, - minRapidityTP = -2.7, - maxRapidityTP = 2.7, - invertRapidityCutTP = True, -# ptMaxTP = 0.9, # set maximum pT globally - histoProducerAlgoBlock = dict( - TpSelectorForEfficiencyVsPt = 
dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), # enough to set min pT here - TpSelectorForEfficiencyVsEta = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), # enough to set min pT here - TpSelectorForEfficiencyVsPhi = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), - TpSelectorForEfficiencyVsVTXR = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), - TpSelectorForEfficiencyVsVTXZ = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), - generalTpSelector = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), -# minEta = -4.5, -# maxEta = 4.5, -# nintEta = 90, - # minPt = 0.01, - ), - doSimPlots = True, # ####same as in trackValidator, no need to repeat here - doRecoTrackPlots = True, # ####fake rates are same as in trackValidator, no need to repeat here - doResolutionPlotsForLabels = ["disabled"] # resolutions are same as in trackValidator, no need to repeat here -) -for _eraName, _postfix, _era in _relevantEras: - _setForEra(trackValidatorTPEtaGreater2p7, _eraName, _era, - label = ["generalTracksEtaGreater2p7"] + locals()["_selectorsEtaGreater2p7"+_postfix] + - locals()["_selectorsByAlgo"+_postfix] + locals()["_selectorsByAlgoHp"+_postfix], - doResolutionPlotsForLabels = ["generalTracksEtaGreater2p7"] + locals()["_selectorsEtaGreater2p7"+_postfix] - ) - # For efficiency of signal TPs vs. signal tracks, and fake rate of # signal tracks vs. signal TPs trackValidatorFromPV = trackValidator.clone( @@ -526,8 +447,6 @@ def _getMVASelectors(postfix): _setForEra(trackValidatorAllTPEffic, _eraName, _era, label = ["generalTracks", locals()["_generalTracksHp"+_postfix]]) # Built tracks, in the standard sequence mainly for monitoring the track selection MVA -tpClusterProducerPreSplitting = tpClusterProducer.clone(pixelClusterSrc = "siPixelClustersPreSplitting") -quickTrackAssociatorByHitsPreSplitting = quickTrackAssociatorByHits.clone(cluster2TPSrc = "tpClusterProducerPreSplitting") _trackValidatorSeedingBuilding = trackValidator.clone( # common for built tracks and seeds (in trackingOnly) associators = ["quickTrackAssociatorByHits"], UseAssociators = True, @@ -539,21 +458,12 @@ def _getMVASelectors(postfix): trackValidatorBuilding = _trackValidatorSeedingBuilding.clone( dirName = "Tracking/TrackBuilding/", doMVAPlots = True, - doResolutionPlotsForLabels = ['jetCoreRegionalStepTracks'], - # associators = ["trackAssociatorByChi2"], #uncomment for byChi2 assoc. for jetcore studies (2/5) - # UseAssociators = True, #uncomment for byChi2 assoc. 
for jetcore studies (3/5) -) -trackValidatorBuildingPreSplitting = trackValidatorBuilding.clone( - associators = ["quickTrackAssociatorByHitsPreSplitting"], - doMVAPlots = False, - doSummaryPlots = False, ) for _eraName, _postfix, _era in _relevantErasAndFastSim: _setForEra(trackValidatorBuilding, _eraName, _era, label = locals()["_trackProducers"+_postfix]) fastSim.toModify(trackValidatorBuilding, doMVAPlots=False) for _eraName, _postfix, _era in _relevantEras: _setForEra(trackValidatorBuilding, _eraName, _era, mvaLabels = locals()["_mvaSelectors"+_postfix]) - _setForEra(trackValidatorBuildingPreSplitting, _eraName, _era, label = locals()["_trackProducersPreSplitting"+_postfix]) # For conversions @@ -624,59 +534,31 @@ def _uniqueFirstLayers(layerList): # the track selectors -tracksValidationSelectors = cms.Task( - tracksValidationSelectorsByAlgo, - tracksValidationSelectorsByAlgoHp, - tracksValidationSelectorsByOriginalAlgo, - cutsRecoTracksBtvLike, - ak4JetTracksAssociatorExplicitAll, +tracksValidationSelectors = cms.Sequence( + tracksValidationSelectorsByAlgo + + tracksValidationSelectorsByAlgoHp + + tracksValidationSelectorsByOriginalAlgo + + cutsRecoTracksBtvLike + + ak4JetTracksAssociatorExplicitAll + cutsRecoTracksAK4PFJets ) -phase2_tracker.toModify(tracksValidationSelectors, lambda x: x.add(generalTracksEtaGreater2p7)) -phase2_tracker.toModify(tracksValidationSelectors, lambda x: x.add(cutsRecoTracksEtaGreater2p7Hp)) - -# Validation iterative steps -_taskForEachEra(_addSelectorsByAlgo, modDict=globals(), - args=["_algos"], - names="_selectorsByAlgo", task="_tracksEtaGreater2p7ValidationSelectorsByAlgo" - ) - -# high purity -_taskForEachEra(_addSelectorsByHp, modDict=globals(), - args=["_algos"], - names="_selectorsByAlgoHp", task="_tracksEtaGreater2p7ValidationSelectorsByAlgoHp" - ) - -for _eraName, _postfix, _era in _relevantEras: - selectors = locals()["_selectorsByAlgoHp"+_postfix] - locals()["_generalTracksHp"+_postfix] = selectors[0] - locals()["_selectorsByAlgoHp"+_postfix] = selectors[1:] - -phase2_tracker.toModify(tracksValidationSelectors, lambda x: x.add(tracksEtaGreater2p7ValidationSelectorsByAlgo)) -phase2_tracker.toModify(tracksValidationSelectors, lambda x: x.add(tracksEtaGreater2p7ValidationSelectorsByAlgoHp)) - -tracksValidationTruth = cms.Task( - tpClusterProducer, - tpClusterProducerPreSplitting, - # trackAssociatorByChi2, #uncomment for byChi2 assoc. for jetcore studies (4/5) - # MTVTrackAssociationByChi2, #uncomment for byChi2 assoc. 
for jetcore studies (5/5) - quickTrackAssociatorByHits, - quickTrackAssociatorByHitsPreSplitting, - trackingParticleRecoTrackAsssociation, - VertexAssociatorByPositionAndTracks, +tracksValidationTruth = cms.Sequence( + tpClusterProducer + + quickTrackAssociatorByHits + + trackingParticleRecoTrackAsssociation + + VertexAssociatorByPositionAndTracks + trackingParticleNumberOfLayersProducer ) fastSim.toModify(tracksValidationTruth, lambda x: x.remove(tpClusterProducer)) -tracksPreValidation = cms.Task( - highPtJetsForTrk, - tracksValidationSelectors, - tracksValidationSelectorsPt09, - tracksValidationSelectorsFromPV, - tracksValidationSelectorsFromPVPt09, - tracksValidationTruth, - trackingParticlesSignal, - trackingParticlesElectron, +tracksPreValidation = cms.Sequence( + tracksValidationSelectors + + tracksValidationSelectorsPt09 + + tracksValidationSelectorsFromPV + + tracksValidationSelectorsFromPVPt09 + + tracksValidationTruth + + cms.ignore(trackingParticlesSignal) + + cms.ignore(trackingParticlesElectron) + trackingParticlesConversion ) fastSim.toReplaceWith(tracksPreValidation, tracksPreValidation.copyAndExclude([ @@ -684,34 +566,18 @@ def _uniqueFirstLayers(layerList): trackingParticlesConversion, ])) - - tracksValidation = cms.Sequence( + tracksPreValidation + trackValidator + trackValidatorTPPtLess09 + trackValidatorFromPV + trackValidatorFromPVAllTP + trackValidatorAllTPEffic + trackValidatorBuilding + - trackValidatorBuildingPreSplitting + trackValidatorConversion + - trackValidatorGsfTracks, - tracksPreValidation + trackValidatorGsfTracks ) - -from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker -#tracksValidationPhase2 = cms.Sequence(tracksValidation+trackValidatorTPEtaGreater2p7) # it does not work -tracksPreValidationPhase2 = tracksPreValidation.copy() -tracksPreValidationPhase2.add(trackingParticlesEtaGreater2p7) -phase2_tracker.toReplaceWith(tracksPreValidation, tracksPreValidationPhase2) - -tracksValidationPhase2 = tracksValidation.copy() -tracksValidationPhase2+=trackValidatorTPEtaGreater2p7 -phase2_tracker.toReplaceWith(tracksValidation, tracksValidationPhase2) - - fastSim.toReplaceWith(tracksValidation, tracksValidation.copyAndExclude([ - trackValidatorBuildingPreSplitting, trackValidatorConversion, trackValidatorGsfTracks, ])) @@ -719,80 +585,60 @@ def _uniqueFirstLayers(layerList): ### Then define stuff for standalone mode (i.e. 
MTV with RECO+DIGI input) # Select by originalAlgo and algoMask -_taskForEachEra(_addSelectorsByOriginalAlgoMask, modDict = globals(), - args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["ByAlgoMask", "algorithmMaskContains"], - names = "_selectorsByAlgoMask", task = "_tracksValidationSelectorsByAlgoMaskStandalone") +_sequenceForEachEra(_addSelectorsByOriginalAlgoMask, modDict = globals(), + args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["ByAlgoMask", "algorithmMaskContains"], + names = "_selectorsByAlgoMask", sequence = "_tracksValidationSelectorsByAlgoMaskStandalone") # Select pT>0.9 by iteration # Need to avoid generalTracks+HP because those are already included in the standard validator -_taskForEachEra(_addSelectorsBySrc, modDict = globals(), - args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["Pt09", "generalTracksPt09"], - names = "_selectorsPt09Standalone", task = "_tracksValidationSelectorsPt09Standalone") +_sequenceForEachEra(_addSelectorsBySrc, modDict = globals(), + args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["Pt09", "generalTracksPt09"], + names = "_selectorsPt09Standalone", sequence = "_tracksValidationSelectorsPt09Standalone") # Select fromPV by iteration # Need to avoid generalTracks+HP because those are already included in the standard validator -_taskForEachEra(_addSelectorsBySrc, modDict = globals(), - args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["FromPV", "generalTracksFromPV"], - names = "_selectorsFromPVStandalone", task = "_tracksValidationSelectorsFromPVStandalone") +_sequenceForEachEra(_addSelectorsBySrc, modDict = globals(), + args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["FromPV", "generalTracksFromPV"], + names = "_selectorsFromPVStandalone", sequence = "_tracksValidationSelectorsFromPVStandalone") # Select pt>0.9 and fromPV by iteration # Need to avoid generalTracks+HP because those are already included in the standard validator -_taskForEachEra(_addSelectorsBySrc, modDict = globals(), - args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["FromPVPt09", "generalTracksFromPVPt09"], - names = "_selectorsFromPVPt09Standalone", task = "_tracksValidationSelectorsFromPVPt09Standalone") +_sequenceForEachEra(_addSelectorsBySrc, modDict = globals(), + args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["FromPVPt09", "generalTracksFromPVPt09"], + names = "_selectorsFromPVPt09Standalone", sequence = "_tracksValidationSelectorsFromPVPt09Standalone") # MTV instances -trackValidatorStandalone = trackValidator.clone( - cores = "highPtJets" -) -trackValidatorTPPtLess09Standalone = trackValidatorTPPtLess09.clone( - cores = "highPtJets" -) +trackValidatorStandalone = trackValidator.clone() +trackValidatorTPPtLess09Standalone = trackValidatorTPPtLess09.clone() for _eraName, _postfix, _era in _relevantEras: _setForEra(trackValidatorStandalone, _eraName, _era, label = trackValidator.label + locals()["_selectorsByAlgoMask"+_postfix] + locals()["_selectorsPt09Standalone"+_postfix]) _setForEra(trackValidatorTPPtLess09Standalone, _eraName, _era, label = trackValidatorTPPtLess09.label + locals()["_selectorsByAlgoMask"+_postfix] + locals()["_selectorsPt09Standalone"+_postfix]) -trackValidatorFromPVStandalone = trackValidatorFromPV.clone( - cores = "highPtJets" -) +trackValidatorFromPVStandalone = trackValidatorFromPV.clone() for _eraName, _postfix, _era in _relevantEras: _setForEra(trackValidatorFromPVStandalone, _eraName, _era, label = trackValidatorFromPV.label + locals()["_selectorsFromPVStandalone"+_postfix] + 
locals()["_selectorsFromPVPt09Standalone"+_postfix]) # do resolutions as in the standard version trackValidatorFromPVAllTPStandalone = trackValidatorFromPVAllTP.clone( - label = trackValidatorFromPVStandalone.label.value(), - cores = "highPtJets" - + label = trackValidatorFromPVStandalone.label.value() ) trackValidatorAllTPEfficStandalone = trackValidatorAllTPEffic.clone( - label = [ x for x in trackValidator.label.value() if x not in ["cutsRecoTracksBtvLike", "cutsRecoTracksAK4PFJets"] and "Pt09" not in x], - cores = "highPtJets" -) - -trackValidatorConversionStandalone = trackValidatorConversion.clone( - label = [x for x in trackValidatorConversion.label if x != "convStepTracks"], - cores = "highPtJets" + label = [ x for x in trackValidator.label.value() if x not in ["cutsRecoTracksBtvLike", "cutsRecoTracksAK4PFJets"] and "Pt09" not in x] ) -trackValidatorBHadronStandalone = trackValidatorBHadron.clone( - label = [x for x in trackValidatorStandalone.label if "Pt09" not in x], - cores = "highPtJets" -) +trackValidatorConversionStandalone = trackValidatorConversion.clone( label = [x for x in trackValidatorConversion.label if x != "convStepTracks"]) -trackValidatorGsfTracksStandalone = trackValidatorGsfTracks.clone( - cores = "highPtJets" -) +trackValidatorBHadronStandalone = trackValidatorBHadron.clone(label = [x for x in trackValidatorStandalone.label if "Pt09" not in x]) # sequences tracksPreValidationStandalone = tracksPreValidation.copy() -tracksPreValidationStandalone.add(trackingParticlesBHadron) -tracksPreValidationStandalone.replace(highPtJetsForTrk,highPtJets) +tracksPreValidationStandalone += trackingParticlesBHadron fastSim.toReplaceWith(tracksPreValidationStandalone, tracksPreValidation) -tracksValidationSelectorsStandalone = cms.Task( - tracksValidationSelectorsByAlgoMaskStandalone, - tracksValidationSelectorsPt09Standalone, - tracksValidationSelectorsFromPVStandalone, +tracksValidationSelectorsStandalone = cms.Sequence( + tracksValidationSelectorsByAlgoMaskStandalone + + tracksValidationSelectorsPt09Standalone + + tracksValidationSelectorsFromPVStandalone + tracksValidationSelectorsFromPVPt09Standalone ) @@ -805,101 +651,76 @@ def _uniqueFirstLayers(layerList): trackValidatorFromPVAllTPStandalone + trackValidatorAllTPEfficStandalone + trackValidatorConversionStandalone + - trackValidatorGsfTracksStandalone + + trackValidatorGsfTracks + trackValidatorBHadronStandalone ) - -_trackValidatorsBasePhase2 = _trackValidatorsBase.copy() -_trackValidatorsBasePhase2+=trackValidatorTPEtaGreater2p7 -phase2_tracker.toReplaceWith(_trackValidatorsBase, _trackValidatorsBasePhase2) - trackValidatorsStandalone = _trackValidatorsBase.copy() fastSim.toModify(trackValidatorsStandalone, lambda x: x.remove(trackValidatorConversionStandalone) ) tracksValidationStandalone = cms.Sequence( ak4PFL1FastL2L3CorrectorChain + - trackValidatorsStandalone, - tracksPreValidationStandalone, - tracksValidationSelectorsStandalone + tracksPreValidationStandalone + + tracksValidationSelectorsStandalone + + trackValidatorsStandalone ) ### TrackingOnly mode (i.e. MTV with DIGI input + tracking-only reconstruction) # selectors tracksValidationSelectorsTrackingOnly = tracksValidationSelectors.copyAndExclude([ak4JetTracksAssociatorExplicitAll,cutsRecoTracksAK4PFJets]) # selectors using track information only (i.e. 
no PF) -_taskForEachEra(_addSeedToTrackProducers, args=["_seedProducers"], names="_seedSelectors", task="_tracksValidationSeedSelectorsTrackingOnly", includeFastSim=True, modDict=globals()) -_taskForEachEra(_addSeedToTrackProducers, args=["_seedProducersPreSplitting"], names="_seedSelectorsPreSplitting", task="_tracksValidationSeedSelectorsPreSplittingTrackingOnly", modDict=globals()) -tracksValidationSeedSelectorsTrackingOnly.add(tracksValidationSeedSelectorsPreSplittingTrackingOnly) +_sequenceForEachEra(_addSeedToTrackProducers, args=["_seedProducers"], names="_seedSelectors", sequence="_tracksValidationSeedSelectorsTrackingOnly", includeFastSim=True, modDict=globals()) # MTV instances -trackValidatorTrackingOnly = trackValidatorStandalone.clone( - label = [ x for x in trackValidatorStandalone.label if x != "cutsRecoTracksAK4PFJets"], - cores = "highPtJetsForTrk" - ) +trackValidatorTrackingOnly = trackValidatorStandalone.clone(label = [ x for x in trackValidatorStandalone.label if x != "cutsRecoTracksAK4PFJets"] ) trackValidatorSeedingTrackingOnly = _trackValidatorSeedingBuilding.clone( dirName = "Tracking/TrackSeeding/", label = _seedSelectors, doSeedPlots = True, - doResolutionPlotsForLabels = [ "seedTracksjetCoreRegionalStepSeeds",] -) -trackValidatorSeedingPreSplittingTrackingOnly = trackValidatorSeedingTrackingOnly.clone( - associators = ["quickTrackAssociatorByHitsPreSplitting"], - label = _seedSelectorsPreSplitting, - doSummaryPlots = False, - ) for _eraName, _postfix, _era in _relevantErasAndFastSim: _setForEra(trackValidatorSeedingTrackingOnly, _eraName, _era, label = locals()["_seedSelectors"+_postfix]) -for _eraName, _postfix, _era in _relevantEras: - _setForEra(trackValidatorSeedingPreSplittingTrackingOnly, _eraName, _era, label = locals()["_seedSelectorsPreSplitting"+_postfix]) trackValidatorConversionTrackingOnly = trackValidatorConversion.clone(label = [x for x in trackValidatorConversion.label if x not in ["ckfInOutTracksFromConversions", "ckfOutInTracksFromConversions"]]) trackValidatorBHadronTrackingOnly = trackValidatorBHadron.clone(label = [x for x in trackValidatorTrackingOnly.label if "Pt09" not in x]) -trackValidatorTPPtLess09TrackingOnly = trackValidatorTPPtLess09Standalone.clone(cores = "highPtJetsForTrk") -trackValidatorFromPVTrackingOnly = trackValidatorFromPVStandalone.clone(cores = "highPtJetsForTrk") -trackValidatorFromPVAllTPTrackingOnly = trackValidatorFromPVAllTPStandalone.clone(cores = "highPtJetsForTrk") -trackValidatorAllTPEfficTrackingOnly = trackValidatorAllTPEfficStandalone.clone(cores = "highPtJetsForTrk") # sequences tracksPreValidationTrackingOnly = tracksPreValidationStandalone.copy() tracksPreValidationTrackingOnly.replace(tracksValidationSelectors, tracksValidationSelectorsTrackingOnly) -tracksPreValidationTrackingOnly.replace(highPtJets,highPtJetsForTrk) trackValidatorsTrackingOnly = _trackValidatorsBase.copy() trackValidatorsTrackingOnly.replace(trackValidatorStandalone, trackValidatorTrackingOnly) -trackValidatorsTrackingOnly.replace(trackValidatorTPPtLess09Standalone,trackValidatorTPPtLess09TrackingOnly) -trackValidatorsTrackingOnly.replace(trackValidatorFromPVStandalone,trackValidatorFromPVTrackingOnly) -trackValidatorsTrackingOnly.replace(trackValidatorFromPVAllTPStandalone,trackValidatorFromPVAllTPTrackingOnly) -trackValidatorsTrackingOnly.replace(trackValidatorAllTPEfficStandalone,trackValidatorAllTPEfficTrackingOnly) trackValidatorsTrackingOnly += trackValidatorSeedingTrackingOnly -trackValidatorsTrackingOnly += 
trackValidatorSeedingPreSplittingTrackingOnly trackValidatorsTrackingOnly += trackValidatorBuilding -trackValidatorsTrackingOnly += trackValidatorBuildingPreSplitting trackValidatorsTrackingOnly.replace(trackValidatorConversionStandalone, trackValidatorConversionTrackingOnly) -trackValidatorsTrackingOnly.remove(trackValidatorGsfTracksStandalone) +trackValidatorsTrackingOnly.remove(trackValidatorGsfTracks) trackValidatorsTrackingOnly.replace(trackValidatorBHadronStandalone, trackValidatorBHadronTrackingOnly) -fastSim.toReplaceWith(trackValidatorsTrackingOnly, trackValidatorsTrackingOnly.copyAndExclude([ - trackValidatorBuildingPreSplitting, - trackValidatorSeedingPreSplittingTrackingOnly, - trackValidatorConversionTrackingOnly, - trackValidatorBHadronTrackingOnly -])) +fastSim.toModify(trackValidatorsTrackingOnly, lambda x: x.remove(trackValidatorConversionTrackingOnly)) +fastSim.toModify(trackValidatorsTrackingOnly, lambda x: x.remove(trackValidatorBHadronTrackingOnly)) + tracksValidationTrackingOnly = cms.Sequence( - trackValidatorsTrackingOnly, - tracksPreValidationTrackingOnly, - tracksValidationSelectorsStandalone, - tracksValidationSeedSelectorsTrackingOnly + tracksPreValidationTrackingOnly + + tracksValidationSelectorsStandalone + + tracksValidationSeedSelectorsTrackingOnly + + trackValidatorsTrackingOnly ) ### Pixel tracking only mode (placeholder for now) +tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone( + pixelClusterSrc = "siPixelClustersPreSplitting" +) +gpu.toModify(tpClusterProducerPixelTrackingOnly, pixelClusterSrc = "siPixelDigis") + +quickTrackAssociatorByHitsPixelTrackingOnly = quickTrackAssociatorByHits.clone( + cluster2TPSrc = "tpClusterProducerPixelTrackingOnly" +) trackingParticlePixelTrackAsssociation = trackingParticleRecoTrackAsssociation.clone( label_tr = "pixelTracks", - associator = "quickTrackAssociatorByHitsPreSplitting", + associator = "quickTrackAssociatorByHitsPixelTrackingOnly", ) PixelVertexAssociatorByPositionAndTracks = VertexAssociatorByPositionAndTracks.clone( trackAssociation = "trackingParticlePixelTrackAsssociation" @@ -914,15 +735,16 @@ def _uniqueFirstLayers(layerList): label_vertex = "pixelVertices", vertexAssociator = "PixelVertexAssociatorByPositionAndTracks", dodEdxPlots = False, - cores = cms.InputTag(""), ) tracksValidationTruthPixelTrackingOnly = tracksValidationTruth.copy() +tracksValidationTruthPixelTrackingOnly.replace(tpClusterProducer, tpClusterProducerPixelTrackingOnly) +tracksValidationTruthPixelTrackingOnly.replace(quickTrackAssociatorByHits, quickTrackAssociatorByHitsPixelTrackingOnly) tracksValidationTruthPixelTrackingOnly.replace(trackingParticleRecoTrackAsssociation, trackingParticlePixelTrackAsssociation) tracksValidationTruthPixelTrackingOnly.replace(VertexAssociatorByPositionAndTracks, PixelVertexAssociatorByPositionAndTracks) tracksValidationPixelTrackingOnly = cms.Sequence( - trackValidatorPixelTrackingOnly, - tracksValidationTruthPixelTrackingOnly + tracksValidationTruthPixelTrackingOnly + + trackValidatorPixelTrackingOnly ) @@ -932,15 +754,16 @@ def _uniqueFirstLayers(layerList): ) tracksValidationLite = cms.Sequence( cutsRecoTracksHp + - trackValidatorLite, - tracksValidationTruth + tracksValidationTruth + + trackValidatorLite ) ## customization for timing from Configuration.Eras.Modifier_phase2_timing_layer_cff import phase2_timing_layer phase2_timing_layer.toModify( generalTracksFromPV, - timesTag = cms.InputTag('tofPID:t0'), - timeResosTag = cms.InputTag('tofPID:sigmat0'), + vertexTag = 
cms.InputTag('offlinePrimaryVertices4D'), + timesTag = cms.InputTag('trackTimeValueMapProducer:generalTracksConfigurableFlatResolutionModel'), + timeResosTag = cms.InputTag('trackTimeValueMapProducer:generalTracksConfigurableFlatResolutionModelResolution'), nSigmaDtVertex = cms.double(3) ) phase2_timing_layer.toModify( trackValidatorStandalone, label_vertex = cms.untracked.InputTag('offlinePrimaryVertices4D') ) From a10a72dc7c6b88a6a00863a7ef61650804f14878 Mon Sep 17 00:00:00 2001 From: Marco Rovere Date: Tue, 27 Mar 2018 07:53:22 +0200 Subject: [PATCH 002/102] Implement Riemann fit for pixel tracks (cms-patatrack#34) Matrix operations are based on Eigen. A first GPU version, running Eigen together with CUDA, is available in the test directory but currently disabled. --- .../PixelTrackFitting/BuildFile.xml | 65 +- .../PixelTrackFitting/interface/RiemannFit.h | 972 ++++++++++++++++++ .../python/PixelTracks_cff.py | 29 +- .../PixelTrackFitting/test/BuildFile.xml | 37 +- .../test/PixelTrackRiemannFit.cc | 324 ++++++ .../PixelTrackFitting/test/testEigenGPU.cu | 209 ++++ .../test/testEigenGPUNoFit.cu | 169 +++ .../PixelTrackFitting/test/test_common.h | 53 + 8 files changed, 1806 insertions(+), 52 deletions(-) create mode 100644 RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/test_common.h diff --git a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml index e6fc938dc25a7..3300d67809f33 100644 --- a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml @@ -1,36 +1,33 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h new file mode 100644 index 0000000000000..d545f78274819 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -0,0 +1,972 @@ +#ifndef RECOPIXELVERTEXING_PIXELTRACKFITTING_RIEMANNFIT_H +#define RECOPIXELVERTEXING_PIXELTRACKFITTING_RIEMANNFIT_H + +#include +#include +#include + +#ifdef __CUDACC__ +#define CUDA_HOSTDEV __host__ __device__ +#else +#define CUDA_HOSTDEV +#endif + +namespace Rfit { + +using namespace Eigen; + +constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) +constexpr unsigned int max_nop = 8; //!< In order to avoid use of dynamic memory + +using MatrixNd = Eigen::Matrix; +using ArrayNd = Eigen::Array; +using Matrix2Nd = Eigen::Matrix; +using Matrix3Nd = Eigen::Matrix; +using Matrix2xNd = Eigen::Matrix; +using Array2xNd = Eigen::Array; +using Matrix3xNd = Eigen::Matrix; +using MatrixNx3d = Eigen::Matrix; +using MatrixNx5d = Eigen::Matrix; +using VectorNd = Eigen::Matrix; +using Vector2Nd = Eigen::Matrix; +using Vector3Nd = Eigen::Matrix; +using RowVectorNd = Eigen::Matrix; +using RowVector2Nd = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; +using Matrix6d = Eigen::Matrix; +using Vector5d = Eigen::Matrix; +using u_int = unsigned int; + +struct circle_fit { + Vector3d par; //!< parameter: (X0,Y0,R) + Matrix3d cov; + /*!< covariance matrix: \n + |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n 
+ |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n + |cov(X0, R)|cov(Y0, R)|cov( R, R)| + */ + int q; //!< particle charge + double chi2; +}; + +struct line_fit { + Vector2d par; //!<(cotan(theta),Zip) + Matrix2d cov; + /*!< + |cov(c_t,c_t)|cov(Zip,c_t)| \n + |cov(c_t,Zip)|cov(Zip,Zip)| + */ + double chi2; +}; + +struct helix_fit { + Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) + Matrix5d cov; + /*!< ()->cov() \n + |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n + |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n + |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n + |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n + |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| + */ + int q; //!< particle charge + double chi2_circle; + double chi2_line; + Vector4d fast_fit; + VectorXd time; // TO FIX just for profiling +}; + + +template +CUDA_HOSTDEV void printIt(C * m) { + for (u_int r = 0; r < m->rows(); ++r) { + for (u_int c = 0; c < m->cols(); ++c) { + printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r,c)); + } + } +} + + +/*! + \brief raise to square. +*/ +CUDA_HOSTDEV inline double sqr(const double a) { return a * a; } + +/*! + \brief Compute cross product of two 2D vector (assuming z component 0), + returning z component of the result. + + \param a first 2D vector in the product. + \param b second 2D vector in the product. + + \return z component of the cross product. +*/ + +CUDA_HOSTDEV inline double cross2D(const Vector2d& a, const Vector2d& b) { + return a.x() * b.y() - a.y() * b.x(); +} + +/*! + \brief Compute the covariance matrix (in radial coordinates) of points in + the transverse plane due to multiple Coulomb scattering. + + \param p2D 2D points in the transverse plane. + \param fast_fit fast_fit Vector4d result of the previous pre-fit + structured in this form:(X0, Y0, R, Tan(Theta))). + \param B magnetic field use to compute p + + \return scatter_cov_rad errors due to multiple scattering. + + \warning input points must be ordered radially from the detector center + (from inner layer to outer ones; points on the same layer must ordered too). + \bug currently works only for points in the barrel. + + \details Only the tangential component is computed (the radial one is + negligible). + + */ +// X in input TO FIX +CUDA_HOSTDEV MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, const Vector4d& fast_fit, VectorNd const & rad, double B) { + u_int n = p2D.cols(); + double X = 0.04; + double theta = atan(fast_fit(3)); + double radlen_eff = X * sqrt(fast_fit(3) * fast_fit(3) + 1); + double p_t = fast_fit(2) * B; + double p_2 = p_t * p_t * (1. + 1./(fast_fit(3)*fast_fit(3))); + + MatrixNd scatter_cov_rad = MatrixXd::Zero(n, n); + const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff ; + for (u_int k = 0; k < n; ++k) { + for (u_int l = k; l < n; ++l) { + for (u_int i = 0; i < std::min(k, l); ++i) { + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(sin(theta)); + scatter_cov_rad(l, k) = scatter_cov_rad(k, l); + } + } + } + return scatter_cov_rad; +} + +/*! + \brief Transform covariance matrix from radial (only tangential component) + to Cartesian coordinates (only transverse plane component). + + \param p2D 2D points in the transverse plane. + \param cov_rad covariance matrix in radial coordinate. + + \return cov_cart covariance matrix in Cartesian coordinates. 
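
As a rough standalone illustration of the tangential-to-Cartesian rotation described above (not part of this patch), the single-point case of cov_radtocart() can be sketched with Eigen as follows; the hit position and tangential variance are invented values:

#include <Eigen/Dense>
#include <cstdio>

int main() {
  const double x = 3., y = 4., sig2_t = 1.e-4;  // hypothetical hit (x, y) and tangential variance
  const double r2 = x * x + y * y;
  // same index pattern as cov_radtocart() for i == j: rotate a purely tangential
  // error into (xx, xy, yy) Cartesian components using sin(phi) = y/r, cos(phi) = x/r
  Eigen::Matrix2d cov_cart;
  cov_cart <<  sig2_t * y * y / r2, -sig2_t * x * y / r2,
              -sig2_t * x * y / r2,  sig2_t * x * x / r2;
  std::printf("cov_xx = %g, cov_xy = %g, cov_yy = %g\n",
              cov_cart(0, 0), cov_cart(0, 1), cov_cart(1, 1));
  return 0;
}
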
+*/ + +CUDA_HOSTDEV inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, + const MatrixNd& cov_rad, + const VectorNd &rad) { + u_int n = p2D.cols(); + Matrix2Nd cov_cart = MatrixXd::Zero(2 * n, 2 * n); + VectorNd rad_inv = rad.cwiseInverse(); + for (u_int i = 0; i < n; ++i) { + for (u_int j = i; j < n; ++j) { + cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + + cov_cart(j, i) = cov_cart(i, j); + cov_cart(j + n, i + n) = cov_cart(i + n, j + n); + cov_cart(j + n, i) = cov_cart(i, j + n); + cov_cart(j, i + n) = cov_cart(i + n, j); + } + } + return cov_cart; +} + +/*! + \brief Transform covariance matrix from Cartesian coordinates (only + transverse plane component) to radial coordinates (both radial and + tangential component but only diagonal terms, correlation between different + point are not managed). + + \param p2D 2D points in transverse plane. + \param cov_cart covariance matrix in Cartesian coordinates. + + \return cov_rad covariance matrix in raidal coordinate. + + \warning correlation between different point are not computed. +*/ +CUDA_HOSTDEV MatrixNd cov_carttorad(const Matrix2xNd& p2D, + const Matrix2Nd& cov_cart, + const VectorNd& rad) { + u_int n = p2D.cols(); + MatrixNd cov_rad = MatrixXd::Zero(n, n); + const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); + for (u_int i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i, i) = cov_cart(i, i); + else { + cov_rad(i, i) = + rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - + 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); + } + } + return cov_rad; +} + +/*! + \brief Transform covariance matrix from Cartesian coordinates (only + transverse plane component) to coordinates system orthogonal to the + pre-fitted circle in each point. + Further information in attached documentation. + + \param p2D 2D points in transverse plane. + \param cov_cart covariance matrix in Cartesian coordinates. + \param fast_fit fast_fit Vector4d result of the previous pre-fit + structured in this form:(X0, Y0, R, tan(theta))). + + \return cov_rad covariance matrix in the pre-fitted circle's + orthogonal system. + +*/ + +CUDA_HOSTDEV MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, + const Vector4d& fast_fit, + const VectorNd& rad) { + u_int n = p2D.cols(); + MatrixNd cov_rad = MatrixXd::Zero(n, n); + for (u_int i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i, i) = cov_cart(i, i); // TO FIX + else { + Vector2d a = p2D.col(i); + Vector2d b = p2D.col(i) - fast_fit.head(2); + const double x2 = a.dot(b); + const double y2 = cross2D(a, b); + const double tan_c = - y2/x2; + const double tan_c2 = sqr(tan_c); + cov_rad(i, i) = + 1. / (1. + tan_c2) * + (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); + } + } + return cov_rad; +} + +/*! + \brief Compute the points' weights' vector for the circle fit when multiple + scattering is managed. + Further information in attached documentation. + + \param cov_rad_inv covariance matrix inverse in radial coordinated + (or, beter, pre-fitted circle's orthogonal system). 
+ + \return weight VectorNd points' weights' vector. + + \bug I'm not sure this is the right way to compute the weights for non + diagonal cov matrix. Further investigation needed. +*/ + +CUDA_HOSTDEV inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { + return cov_rad_inv.colwise().sum().transpose(); +} + +/*! + \brief Compute the points' weights' vector for the line fit (ODR). + Results from a pre-fit is needed in order to take the orthogonal (to the + line) component of the errors. + + \param x_err2 squared errors in the x axis. + \param y_err2 squared errors in the y axis. + \param tan_theta tangent of theta (angle between y axis and line). + + \return weight points' weights' vector for the line fit (ODR). +*/ + +CUDA_HOSTDEV inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y_err2, const double& tan_theta) { + return (1. + sqr(tan_theta)) * 1. / (x_err2 + y_err2 * sqr(tan_theta)); +} + +/*! + \brief Find particle q considering the sign of cross product between + particles velocity (estimated by the first 2 hits) and the vector radius + between the first hit and the center of the fitted circle. + + \param p2D 2D points in transverse plane. + \param par_uvr result of the circle fit in this form: (X0,Y0,R). + + \return q int 1 or -1. +*/ + +CUDA_HOSTDEV inline int Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) { + return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - + (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > + 0) + ? -1 + : 1; +} + +/*! + \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,p_t) and + consequently covariance matrix. + + \param circle_uvr parameter (X0,Y0,R), covariance matrix to + be transformed and particle charge. + \param B magnetic field in Gev/cm/c unit. + \param error flag for errors computation. +*/ + +CUDA_HOSTDEV void par_uvrtopak(circle_fit& circle, const double B, const bool& error) { + Vector3d par_pak; + const double temp0 = circle.par.head(2).squaredNorm(); + const double temp1 = sqrt(temp0); + par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), + circle.q * (temp1 - circle.par(2)), circle.par(2) * B; + if (error) { + const double temp2 = sqr(circle.par(0)) * 1. / temp0; + const double temp3 = 1. / temp1 * circle.q; + Matrix3d J4; + J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., + circle.par(0) * temp3, circle.par(1) * temp3, -circle.q, 0., 0., B; + circle.cov = J4 * circle.cov * J4.transpose(); + } + circle.par = par_pak; +} + +/*! + \brief Compute the error propagation to obtain the square errors in the + x axis for the line fit. If errors have not been computed in the circle fit + than an'approximation is made. + Further information in attached documentation. + + \param V hits' covariance matrix. + \param circle result of the previous circle fit (only the covariance matrix + is needed) TO FIX + \param J Jacobian of the transformation producing x values. + \param error flag for error computation. + + \return x_err2 squared errors in the x axis. 
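
As a rough numerical illustration of the (X0, Y0, R) -> (phi, Tip, p_t) transformation performed by par_uvrtopak() above, a standalone sketch (not part of the patch; the circle parameters, charge and field constant below are invented):

#include <cmath>
#include <cstdio>

int main() {
  const double X0 = 1.2, Y0 = -0.8, R = 87.;  // hypothetical fitted circle [cm]
  const int q = 1;                            // charge as returned by Charge()
  const double B = 0.0114;                    // assumed field constant in GeV/cm/c units (about 3.8 T)
  const double phi = std::atan2(q * X0, -q * Y0);             // track azimuth at closest approach
  const double tip = q * (std::sqrt(X0 * X0 + Y0 * Y0) - R);  // transverse impact parameter
  const double p_t = R * B;                                   // transverse momentum
  std::printf("phi = %f, Tip = %f cm, p_t = %f GeV\n", phi, tip, p_t);
  return 0;
}
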
+*/ + +CUDA_HOSTDEV VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, + const bool& error, u_int n) { + VectorNd x_err2(n); + for (u_int i = 0; i < n; ++i) { + Matrix5d Cov = MatrixXd::Zero(5, 5); + if (error) Cov.block(0, 0, 3, 3) = circle.cov; + Cov(3, 3) = V(i, i); + Cov(4, 4) = V(i + n, i + n); + Cov(3, 4) = Cov(4, 3) = V(i, i + n); + Eigen::Matrix tmp; + tmp = J.row(i) * Cov * J.row(i).transpose().eval(); + x_err2(i) = tmp(0,0); + } + return x_err2; +} + +/*! + \brief Compute the eigenvector associated to the minimum eigenvalue. + + \param A the Matrix you want to know eigenvector and eigenvalue. + \param chi2 the double were the chi2-related quantity will be stored. + + \return the eigenvector associated to the minimum eigenvalue. + + \warning double precision is needed for a correct assessment of chi2. + + \details The minimus eigenvalue is related to chi2. + We exploit the fact that the matrix is symmetrical and small (2x2 for line + fit and 3x3 for circle fit), so the SelfAdjointEigenSolver from Eigen + library is used, with the computedDirect method (available only for 2x2 + and 3x3 Matrix) wich computes eigendecomposition of given matrix using a + fast closed-form algorithm. + For this optimization the matrix type must be known at compiling time. + +*/ + +CUDA_HOSTDEV Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { + SelfAdjointEigenSolver solver(3); + solver.computeDirect(A); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index); +} + +/*! + \brief A faster version of min_eigen3D() where double precision is not + needed. + + \param A the Matrix you want to know eigenvector and eigenvalue. + \param chi2 the double were the chi2-related quantity will be stored + + \return the eigenvector associated to the minimum eigenvalue. + + \detail The computedDirect() method of SelfAdjointEigenSolver for 3x3 Matrix + indeed, use trigonometry function (it solves a third degree equation) which + speed up in single precision. +*/ + +CUDA_HOSTDEV Vector3d min_eigen3D_fast(const Matrix3d& A) { + SelfAdjointEigenSolver solver(3); + solver.computeDirect(A.cast()); + int min_index; + solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index).cast(); +} + +/*! + \brief 2D version of min_eigen3D(). + + \param A the Matrix you want to know eigenvector and eigenvalue. + \param chi2 the double were the chi2-related quantity will be stored + + \return the eigenvector associated to the minimum eigenvalue. + + \detail The computedDirect() method of SelfAdjointEigenSolver for 2x2 Matrix + do not use special math function (just sqrt) therefore it doesn't speed up + significantly in single precision. +*/ + +CUDA_HOSTDEV Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { + SelfAdjointEigenSolver solver(2); + solver.computeDirect(A); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index); +} + +/*! + \brief A very fast helix fit: it fits a circle by three points (first, middle + and last point) and a line by two points (first and last). + + \param hits points to be fitted + + \return result in this form: (X0,Y0,R,tan(theta)). + + \warning points must be passed ordered (from internal layer to external) in + order to maximize accuracy and do not mistake tan(theta) sign. 
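
To illustrate the eigenvalue extraction used by min_eigen3D() above, a minimal standalone Eigen sketch (not part of the patch; the symmetric matrix is arbitrary test input):

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix3d A;
  A << 4., 1., 0.,
       1., 3., 1.,
       0., 1., 2.;
  Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> solver;
  solver.computeDirect(A);  // closed-form path, available for 2x2 and 3x3 matrices
  int min_index;
  const double min_eigenvalue = solver.eigenvalues().minCoeff(&min_index);
  std::cout << "smallest eigenvalue: " << min_eigenvalue << "\n"
            << "its eigenvector: " << solver.eigenvectors().col(min_index).transpose() << std::endl;
  return 0;
}
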
+ + \details This fast fit is used as pre-fit which is needed for: + - weights estimation and chi2 computation in line fit (fundamental); + - weights estimation and chi2 computation in circle fit (useful); + - computation of error due to multiple scattering. +*/ + +CUDA_HOSTDEV Vector4d Fast_fit(const Matrix3xNd& hits) { + Vector4d result; + u_int n = hits.cols(); // get the number of hits + + // CIRCLE FIT + // Make segments between middle-to-first(b) and last-to-first(c) hits + const Vector2d b = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Vector2d c = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); + // Compute their lengths + const double b2 = b.squaredNorm(); + const double c2 = c.squaredNorm(); + double X0; + double Y0; + // The algebra has been verified (MR). The usual approach has been followed: + // * use an orthogonal reference frame passing from the first point. + // * build the segments (chords) + // * build orthogonal lines through mid points + // * make a system and solve for X0 and Y0. + // * add the initial point + if (abs(b.x()) > abs(b.y())) { //!< in case b.x is 0 (2 hits with same x) + const double k = c.x() / b.x(); + const double div = 2. * (k * b.y() - c.y()); + // if aligned TO FIX + Y0 = (k * b2 - c2) / div; + X0 = b2 / (2 * b.x()) - b.y() / b.x() * Y0; + } else { + const double k = c.y() / b.y(); + const double div = 2. * (k * b.x() - c.x()); + // if aligned TO FIX + X0 = (k * b2 - c2) / div; + Y0 = b2 / (2 * b.y()) - b.x() / b.y() * X0; + } + + result(0) = X0 + hits(0, 0); + result(1) = Y0 + hits(1, 0); + result(2) = sqrt(sqr(X0) + sqr(Y0)); + + // LINE FIT + const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) + const double dr = result(2) * atan2(cross2D(d, e), d.dot(e)); + // Simple difference in Z between last and first hit + const double dz = hits(2, n - 1) - hits(2, 0); + + result(3) = (dr / dz); + + return result; +} + +/*! + \brief Fit a generic number of 2D points with a circle using Riemann-Chernov + algorithm. Covariance matrix of fitted parameter is optionally computed. + Multiple scattering (currently only in barrel layer) is optionally handled. + + \param hits2D 2D points to be fitted. + \param hits_cov2D covariance matrix of 2D points. + \param fast_fit pre-fit result in this form: (X0,Y0,R,tan(theta)). + (tan(theta) is not used). + \param B magnetic field + \param error flag for error computation. + \param scattering flag for multiple scattering + + \return circle circle_fit: + -par parameter of the fitted circle in this form (X0,Y0,R); \n + -cov covariance matrix of the fitted parameter (not initialized if + error = false); \n + -q charge of the particle; \n + -chi2. + + \warning hits must be passed ordered from inner to outer layer (double hits + on the same layer must be ordered too) so that multiple scattering is + treated properly. + \warning Multiple scattering for barrel is still not tested. + \warning Multiple scattering for endcap hits is not handled (yet). Do not + fit endcap hits with scattering = true ! + + \bug for small pt (<0.3 Gev/c) chi2 could be slightly underestimated. + \bug further investigation needed for error propagation with multiple + scattering. 
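
A minimal usage sketch of the pre-fit defined above (illustrative only, not part of the patch; the include path follows this patch, the hit coordinates are invented and, as required, ordered from the innermost to the outermost layer):

#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h"
#include <iostream>

int main() {
  Rfit::Matrix3xNd hits(3, 4);  // one column per hit: (x, y, z), here in cm
  hits << 1.98, 4.19,  7.35, 10.86,
          2.18, 4.88,  8.22, 12.43,
          2.46, 6.99, 11.80, 17.09;
  const Eigen::Vector4d fast_fit = Rfit::Fast_fit(hits);
  std::cout << "X0 = " << fast_fit(0) << ", Y0 = " << fast_fit(1)
            << ", R = " << fast_fit(2) << ", tan(theta) = " << fast_fit(3) << std::endl;
  return 0;
}
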
+*/ + +CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hits_cov2D, + const Vector4d& fast_fit, VectorNd const & rad, + const double B, + const bool& error = true, + const bool& scattering = false) { + // INITIALIZATION + Matrix2Nd V = hits_cov2D; + u_int n = hits2D.cols(); + + // WEIGHT COMPUTATION + VectorNd weight; + MatrixNd G; + double renorm; + { + MatrixNd cov_rad; + cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad); + // cov_rad = cov_carttorad(hits2D, V); + + if (scattering) { + MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); + V += cov_radtocart(hits2D, scatter_cov_rad, rad); + cov_rad += scatter_cov_rad; + G = cov_rad.inverse(); + renorm = G.sum(); + G *= 1. / renorm; + weight = Weight_circle(G); + } else { + weight = cov_rad.diagonal().cwiseInverse(); + renorm = weight.sum(); + weight *= 1. / renorm; + } + } + + // SPACE TRANSFORMATION + + // center + const Vector2d h_ = hits2D.rowwise().mean(); // centroid + Matrix3xNd p3D(3, n); + p3D.block(0, 0, 2, n) = hits2D.colwise() - h_; + Vector2Nd mc(2 * n); // centered hits, used in error computation + mc << p3D.row(0).transpose(), p3D.row(1).transpose(); + + // scale + const double q = mc.squaredNorm(); + const double s = sqrt(n * 1. / q); // scaling factor + p3D *= s; + + // project on paraboloid + p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); + + // COST FUNCTION + + // compute + Matrix3d A = Matrix3d::Zero(); + const Vector3d r0 = p3D * weight; // center of gravity + const Matrix3xNd X = p3D.colwise() - r0; + if (scattering) + A = X * G * X.transpose(); + else { + for (u_int i = 0; i < n; ++i) A += weight(i) * (X.col(i) * X.col(i).transpose()); + } + + // minimize + double chi2; + Vector3d v = min_eigen3D(A, chi2); + v *= (v(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 + // This hack to be able to run on GPU where the automatic assignment to a + // double from the vector multiplication is not working. + Matrix cm; + cm.noalias() = -v.transpose() * r0; + const double c = cm(0,0); + + // COMPUTE CIRCLE PARAMETER + + // auxiliary quantities + const double h = sqrt(1. - sqr(v(2)) - 4. * c * v(2)); + const double v2x2_inv = 1. / (2. * v(2)); + const double s_inv = 1. / s; + Vector3d par_uvr_; // used in error propagation + par_uvr_ << -v(0) * v2x2_inv, -v(1) * v2x2_inv, h * v2x2_inv; + + circle_fit circle; + circle.par << par_uvr_(0) * s_inv + h_(0), par_uvr_(1) * s_inv + h_(1), par_uvr_(2) * s_inv; + circle.q = Charge(hits2D, circle.par); + circle.chi2 = abs(chi2) * renorm * 1. / sqr(2 * v(2) * par_uvr_(2) * s); + + // ERROR PROPAGATION + if (error) { + ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points + { + const Matrix2Nd Vcs = sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * + (2. * V.squaredNorm() + 4. * mc.transpose() * V * mc) * + mc * mc.transpose(); + Vcs_[0][0] = Vcs.block(0, 0, n, n); + Vcs_[0][1] = Vcs.block(0, n, n, n); + Vcs_[1][1] = Vcs.block(n, n, n, n); + Vcs_[1][0] = Vcs_[0][1].transpose(); + } + + MatrixNd C[3][3]; // cov matrix of 3D transformed points + { + const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); + const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); + const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); + const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); + const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); + const ArrayNd t10 = t01.transpose(); + C[0][0] = Vcs_[0][0]; + C[0][1] = Vcs_[0][1]; + C[0][2] = 2. * (Vcs_[0][0] * t0 + Vcs_[0][1] * t1); + C[1][1] = Vcs_[1][1]; + C[1][2] = 2. 
* (Vcs_[1][0] * t0 + Vcs_[1][1] * t1); + C[2][2] = 2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + + Vcs_[1][1] * Vcs_[1][1]) + + 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11); + } + + Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) + for (u_int i = 0; i < 3; ++i) { + for (u_int j = i; j < 3; ++j) { + C0(i, j) = weight.transpose() * C[i][j] * weight; + C0(j, i) = C0(i, j); + } + } + + const MatrixNd W = weight * weight.transpose(); + const MatrixNd H = MatrixXd::Identity(n, n).rowwise() - weight.transpose(); + const MatrixNx3d s_v = H * p3D.transpose(); + + MatrixNd D_[3][3]; // cov(s_v) + { + D_[0][0] = (H * C[0][0] * H.transpose()).cwiseProduct(W); + D_[0][1] = (H * C[0][1] * H.transpose()).cwiseProduct(W); + D_[0][2] = (H * C[0][2] * H.transpose()).cwiseProduct(W); + D_[1][1] = (H * C[1][1] * H.transpose()).cwiseProduct(W); + D_[1][2] = (H * C[1][2] * H.transpose()).cwiseProduct(W); + D_[2][2] = (H * C[2][2] * H.transpose()).cwiseProduct(W); + D_[1][0] = D_[0][1].transpose(); + D_[2][0] = D_[0][2].transpose(); + D_[2][1] = D_[1][2].transpose(); + } + + constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; + + Matrix6d E; // cov matrix of the 6 independent elements of A + for (u_int a = 0; a < 6; ++a) { + const u_int i = nu[a][0], j = nu[a][1]; + for (u_int b = a; b < 6; ++b) { + const u_int k = nu[b][0], l = nu[b][1]; + VectorNd t0(n); + VectorNd t1(n); + if (l == k) { + t0 = 2. * D_[j][l] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = 2. * D_[i][l] * s_v.col(l); + } else { + t0 = D_[j][l] * s_v.col(k) + D_[j][k] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = D_[i][l] * s_v.col(k) + D_[i][k] * s_v.col(l); + } + + if (i == j) + E(a, b) = 0. + s_v.col(i).transpose() * (t0 + t1); + else + E(a, b) = 0. + (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + if (b != a) E(b, a) = E(a, b); + } + } + + Matrix J2; // Jacobian of min_eigen() (numerically computed) + for (u_int a = 0; a < 6; ++a) { + const u_int i = nu[a][0], j = nu[a][1]; + Matrix3d Delta = Matrix3d::Zero(); + Delta(i, j) = Delta(j, i) = abs(A(i, j) * d); + J2.col(a) = min_eigen3D_fast(A + Delta); + const int sign = (J2.col(a)(2) > 0) ? 1 : -1; + J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); + } + + Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) + { + Matrix3d t0 = J2 * E * J2.transpose(); + Vector3d t1 = -t0 * r0; + Cvc.block(0, 0, 3, 3) = t0; + Cvc.block(0, 3, 3, 1) = t1; + Cvc.block(3, 0, 1, 3) = t1.transpose(); + Cvc(3, 3) = + (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + } + + Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + { + const double t = 1. / h; + J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, + 0, 0, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; + } + + const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) + + Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) + + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); + + circle.cov = cov_uvr; + } + + + return circle; +} + +/*! + \brief Fit of helix parameter cotan(theta)) and Zip by projection on the + pre-fitted cylinder and line fit on its surface. + + \param hits hits coordinates. + \param hits_cov covariance matrix of the hits. + \param circle cylinder parameter, their covariance (if computed, otherwise + uninitialized) and particle charge. 
+ \param fast_fit result of the previous fast fit in this form: + (X0,Y0,R,cotan(theta))). + \param error flag for error computation. + + \return line line_fit: + -par parameter of the line in this form: (cotan(theta)), Zip); \n + -cov covariance matrix of the fitted parameter; \n + -chi2. + + \warning correlation between R and z are neglected, this could be relevant + if geometry detector provides sloped modules in the R/z plane. + + \bug chi2 and errors could be slightly underestimated for small eta (<0.2) + when pt is small (<0.3 Gev/c). + + \todo multiple scattering treatment. + + \details Line fit is made by orthogonal distance regression where + correlation between coordinates in the transverse plane (x,y) and z are + neglected (for a barrel + endcap geometry this is a very good + approximation). + Covariance matrix of the fitted parameter is optionally computed. + Multiple scattering is not handled (yet). + A fast pre-fit is performed in order to evaluate weights and to compute + errors. +*/ + +CUDA_HOSTDEV line_fit Line_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const circle_fit& circle, + const Vector4d& fast_fit, const bool& error = true) { + u_int n = hits.cols(); + // PROJECTION ON THE CILINDER + Matrix2xNd p2D(2, n); + MatrixNx5d Jx(n, 5); + + // x & associated Jacobian + // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf + // Slide 11 + // a ==> -o i.e. the origin of the circle in XY plane, negative + // b ==> p i.e. distances of the points wrt the origin of the circle. + const Vector2d o(circle.par(0), circle.par(1)); + for (u_int i = 0; i < n; ++i) { // x + Vector2d p = hits.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + // atan2(cross, dot) give back the angle in the transverse plane so tha the final equation reads: + // x_i = -q*R*theta (theta = angle returned by atan2) + const double atan2_ = -circle.q * atan2(cross, dot); + p2D(0, i) = atan2_ * circle.par(2); + + // associated Jacobian, used in weights and errors computation + const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); + double d_X0 = 0, d_Y0 = 0, d_R = 0.; // good approximation for big pt and eta + if (error) { + d_X0 = - temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); + d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); + d_R = atan2_; + } + const double d_x = temp0 * (o(1) * dot + o(0) * cross); + const double d_y = temp0 * (-o(0) * dot + o(1) * cross); + Jx.row(i) << d_X0, d_Y0, d_R, d_x, d_y; + } + // Math of d_{X0,Y0,R,x,y} all verified by hand + + // y + p2D.row(1) = hits.row(2); + + + // WEIGHT COMPUTATION + VectorNd x_err2 = X_err2(hits_cov, circle, Jx, error, n); + VectorNd y_err2 = hits_cov.block(2 * n, 2 * n, n, n).diagonal(); + + const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); + const VectorNd weight = err2_inv * 1. / err2_inv.sum(); + // COST FUNCTION + + // compute + // r0 represents the weighted mean of "x" and "y". + const Vector2d r0 = p2D * weight; + // This is the X vector that will be used to build the + // scatter matrix S = X^T * X + const Matrix2xNd X = p2D.colwise() - r0; + Matrix2d A = Matrix2d::Zero(); + for (u_int i = 0; i < n; ++i) { + A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); + } + // minimize + double chi2; + Vector2d v = min_eigen2D(A, chi2); + // n *= (chi2>0) ? 
1 : -1; //TO FIX + // This hack to be able to run on GPU where the automatic assignment to a + // double from the vector multiplication is not working. + Matrix cm; + cm.noalias() = -v.transpose() * r0; + const double c = cm(0,0); + + // COMPUTE LINE PARAMETER + line_fit line; + line.par << -v(0) / v(1), // cotan(theta)) + -c * sqrt(sqr(v(0)) + sqr(v(1))) * 1. / v(1); // Zip + line.chi2 = abs(chi2); + + // ERROR PROPAGATION + if (error) { + const double v0_2 = sqr(v(0)); + const double v1_2 = sqr(v(1)); + + Matrix3d C; // cov(v,c) + { + double norm_chernov = 0.; + for (u_int i = 0; i < n; ++i) + norm_chernov += err2_inv(i) * (v(0) * p2D(0, i) + v(1) * p2D(1, i) + c) + * (v(0) * p2D(0, i) + v(1) * p2D(1, i) + c); + norm_chernov /= float(n); + // Indeed it should read: + // * compute the average error in the orthogonal direction: err2_inv.cwiseInverse().sum()/sqr(n) + // * normalize the A(0,0)+A(1,1) dividing by err2_inv.sum(), since those have been weighted + const double norm = (err2_inv.cwiseInverse().sum())*err2_inv.sum()*1./sqr(n); + const double sig2 = 1./(A(0,0) + A(1,1))*norm; +// const double sig2 = 1. / (A(0, 0) + A(1, 1)); + C(0, 0) = sig2 * v1_2; + C(1, 1) = sig2 * v0_2; + C(0, 1) = C(1, 0) = -sig2 * v(0) * v(1); + const VectorNd weight_2 = (weight).array().square(); + const Vector2d C0(weight_2.dot(x_err2), weight_2.dot(y_err2)); + C.block(0, 2, 2, 1) = C.block(2, 0, 1, 2).transpose() = -C.block(0, 0, 2, 2) * r0; + Matrix tmp = (r0.transpose() * C.block(0, 0, 2, 2) * r0); + C(2, 2) = v0_2 * C0(0) + v1_2 * C0(1) + C0(0) * C(0, 0) + C0(1) * C(1, 1) + tmp(0,0); + } + + Matrix J; // Jacobian of (v,c) -> (cotan(theta)),Zip) + { + const double t0 = 1. / v(1); + const double t1 = sqr(t0); + const double sqrt_ = sqrt(v1_2 + v0_2); + const double t2 = 1. / sqrt_; + J << -t0, v(0) * t1, 0, -c * v(0) * t0 * t2, v0_2 * c * t1 * t2, -sqrt_ * t0; + } + Matrix JT = J.transpose().eval(); + line.cov.noalias() = J * C * JT; + } + + return line; +} + +/*! + \brief Helix fit by three step: + -fast pre-fit (see Fast_fit() for further info); \n + -circle fit of hits projected in the transverse plane by Riemann-Chernov + algorithm (see Circle_fit() for further info); \n + -line fit of hits projected on cylinder surface by orthogonal distance + regression (see Line_fit for further info). \n + Points must be passed ordered (from inner to outer layer). + + \param hits Matrix3xNd hits coordinates in this form: \n + |x0|x1|x2|...|xn| \n + |y0|y1|y2|...|yn| \n + |z0|z1|z2|...|zn| + + \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n + + |(x0,x0)|(x1,x0)|(x2,x0)|.|(y0,x0)|(y1,x0)|(y2,x0)|.|(z0,x0)|(z1,x0)|(z2,x0)| \n + |(x0,x1)|(x1,x1)|(x2,x1)|.|(y0,x1)|(y1,x1)|(y2,x1)|.|(z0,x1)|(z1,x1)|(z2,x1)| \n + |(x0,x2)|(x1,x2)|(x2,x2)|.|(y0,x2)|(y1,x2)|(y2,x2)|.|(z0,x2)|(z1,x2)|(z2,x2)| \n + . . . . . . . . . . . \n + |(x0,y0)|(x1,y0)|(x2,y0)|.|(y0,y0)|(y1,y0)|(y2,x0)|.|(z0,y0)|(z1,y0)|(z2,y0)| \n + |(x0,y1)|(x1,y1)|(x2,y1)|.|(y0,y1)|(y1,y1)|(y2,x1)|.|(z0,y1)|(z1,y1)|(z2,y1)| \n + |(x0,y2)|(x1,y2)|(x2,y2)|.|(y0,y2)|(y1,y2)|(y2,x2)|.|(z0,y2)|(z1,y2)|(z2,y2)| \n + . . . . . . . . . . . \n + |(x0,z0)|(x1,z0)|(x2,z0)|.|(y0,z0)|(y1,z0)|(y2,z0)|.|(z0,z0)|(z1,z0)|(z2,z0)| \n + |(x0,z1)|(x1,z1)|(x2,z1)|.|(y0,z1)|(y1,z1)|(y2,z1)|.|(z0,z1)|(z1,z1)|(z2,z1)| \n + |(x0,z2)|(x1,z2)|(x2,z2)|.|(y0,z2)|(y1,z2)|(y2,z2)|.|(z0,z2)|(z1,z2)|(z2,z2)| + + \param B magnetic field in the center of the detector in Gev/cm/c + unit, in order to perform pt calculation. + \param error flag for error computation. 
+ \param scattering flag for multiple scattering treatment. + (see Circle_fit() documentation for further info). + + \warning see Circle_fit(), Line_fit() and Fast_fit() warnings. + + \bug see Circle_fit(), Line_fit() and Fast_fit() bugs. +*/ + +helix_fit Helix_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const double B, + const bool& error = true, const bool& scattering = false) { + u_int n = hits.cols(); + VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + + // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. + const Vector4d fast_fit = Fast_fit(hits); + + circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), hits_cov.block(0, 0, 2 * n, 2 * n), + fast_fit, rad, B, error, scattering); + + const line_fit line = Line_fit(hits, hits_cov, circle, fast_fit, error); + + par_uvrtopak(circle, B, error); + + helix_fit helix; + helix.par << circle.par, line.par; + if (error) { + helix.cov = MatrixXd::Zero(5, 5); + helix.cov.block(0, 0, 3, 3) = circle.cov; + helix.cov.block(3, 3, 2, 2) = line.cov; + } + helix.q = circle.q; + helix.chi2_circle = circle.chi2; + helix.chi2_line = line.chi2; + + return helix; +} + +} // namespace Rfit + + +#endif + diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index 4334d724358f3..44bfb888df98c 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -1,5 +1,6 @@ import FWCore.ParameterSet.Config as cms +from RecoLocalTracker.SiPixelRecHits.PixelCPEParmError_cfi import * from RecoLocalTracker.SiStripRecHitConverter.StripCPEfromTrackAngle_cfi import * from RecoLocalTracker.SiStripRecHitConverter.SiStripRecHitMatcher_cfi import * from RecoTracker.TransientTrackingRecHit.TransientTrackingRecHitBuilder_cfi import * @@ -11,6 +12,7 @@ from RecoTracker.TkSeedingLayers.PixelLayerTriplets_cfi import * from RecoTracker.TkSeedingLayers.TTRHBuilderWithoutAngle4PixelTriplets_cfi import * from RecoPixelVertexing.PixelTrackFitting.pixelFitterByHelixProjections_cfi import pixelFitterByHelixProjections +from RecoPixelVertexing.PixelTrackFitting.pixelFitterByRiemannParaboloid_cfi import pixelFitterByRiemannParaboloid from RecoPixelVertexing.PixelTrackFitting.pixelTrackFilterByKinematics_cfi import pixelTrackFilterByKinematics from RecoPixelVertexing.PixelTrackFitting.pixelTrackCleanerBySharedHits_cfi import pixelTrackCleanerBySharedHits from RecoPixelVertexing.PixelTrackFitting.pixelTracks_cfi import pixelTracks as _pixelTracks @@ -63,17 +65,22 @@ ) trackingLowPU.toModify(pixelTracks, SeedingHitSets = "pixelTracksHitTriplets") -pixelTracksTask = cms.Task( - pixelTracksTrackingRegions, - pixelFitterByHelixProjections, - pixelTrackFilterByKinematics, - pixelTracksSeedLayers, - pixelTracksHitDoublets, - pixelTracksHitQuadruplets, +pixelTracksSequence = cms.Sequence( + pixelTracksTrackingRegions + + pixelFitterByHelixProjections + + pixelTrackFilterByKinematics + + pixelTracksSeedLayers + + pixelTracksHitDoublets + + pixelTracksHitQuadruplets + pixelTracks ) -_pixelTracksTask_lowPU = pixelTracksTask.copy() -_pixelTracksTask_lowPU.replace(pixelTracksHitQuadruplets, pixelTracksHitTriplets) -trackingLowPU.toReplaceWith(pixelTracksTask, _pixelTracksTask_lowPU) +_pixelTracksSequence_lowPU = pixelTracksSequence.copy() +_pixelTracksSequence_lowPU.replace(pixelTracksHitQuadruplets, pixelTracksHitTriplets) +trackingLowPU.toReplaceWith(pixelTracksSequence, 
_pixelTracksSequence_lowPU) -pixelTracksSequence = cms.Sequence(pixelTracksTask) +# Use Riemann fit and substitute previous Fitter producer with the Riemann one +from Configuration.ProcessModifiers.riemannFit_cff import riemannFit +riemannFit.toModify(pixelTracks, Fitter = "pixelFitterByRiemannParaboloid") +_pixelTracksSequence_riemannFit = pixelTracksSequence.copy() +_pixelTracksSequence_riemannFit.replace(pixelFitterByHelixProjections, pixelFitterByRiemannParaboloid) +riemannFit.toReplaceWith(pixelTracksSequence, _pixelTracksSequence_riemannFit) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index 44820da381dd1..8c61394fec6ec 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -1,8 +1,31 @@ - - - - - - - + + + + + + + + + + + + + + + + + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc new file mode 100644 index 0000000000000..9f60b2f431e96 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc @@ -0,0 +1,324 @@ +#define _USE_MATH_DEFINES + +#include +#include +#include +#include +#include // unique_ptr + +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" + +using namespace std; +using namespace Eigen; +using namespace Rfit; +using std::unique_ptr; + +namespace Rfit { +using Vector3i = Eigen::Matrix; +using Vector4i = Eigen::Matrix; +using Vector6d = Eigen::Matrix; +using Vector8d = Eigen::Matrix; +}; // namespace Rfit + +struct hits_gen { + Matrix3xNd hits; + Matrix3Nd hits_cov; + Vector5d true_par; +}; + +struct geometry { + Vector8d barrel; + Vector4i barrel_2; + Vector8d R_err; + Vector8d Rp_err; + Vector8d z_err; + Vector6d hand; + Vector3i hand_2; + Vector6d xy_err; + Vector6d zh_err; + double z_max; + double r_max; +}; + +void test_helix_fit(); + +constexpr int c_speed = 299792458; +constexpr double pi = M_PI; +default_random_engine generator(1); + +void smearing(const Vector5d& err, const bool& isbarrel, double& x, double& y, double& z) { + normal_distribution dist_R(0., err[0]); + normal_distribution dist_Rp(0., err[1]); + normal_distribution dist_z(0., err[2]); + normal_distribution dist_xyh(0., err[3]); + normal_distribution dist_zh(0., err[4]); + if (isbarrel) { + double dev_Rp = dist_Rp(generator); + double dev_R = dist_R(generator); + double R = sqrt(Rfit::sqr(x) + Rfit::sqr(y)); + x += dev_Rp * +y / R + dev_R * -x / R; + y += dev_Rp * -x / R + dev_R * -y / R; + z += dist_z(generator); + } else { + x += dist_xyh(generator); + y += dist_xyh(generator); + z += dist_zh(generator); + } +} + +void Hits_cov(Matrix3Nd& V, const unsigned int& i, const unsigned int& n, const Matrix3xNd& hits, + const Vector5d& err, bool isbarrel) { + if (isbarrel) { + double R2 = Rfit::sqr(hits(0, i)) + Rfit::sqr(hits(1, i)); + V(i, i) = + (Rfit::sqr(err[1]) * Rfit::sqr(hits(1, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(0, i))) / + R2; + V(i + n, i + n) = + (Rfit::sqr(err[1]) * Rfit::sqr(hits(0, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(1, i))) / + R2; + V(i, i + n) = V(i + n, i) = + (Rfit::sqr(err[0]) - Rfit::sqr(err[1])) * hits(1, i) * hits(0, i) / R2; + V(i + 2 * n, i + 2 * n) = Rfit::sqr(err[2]); + } else { + V(i, i) = Rfit::sqr(err[3]); + V(i + n, i + n) = Rfit::sqr(err[3]); + V(i + 2 * n, i + 2 * n) = Rfit::sqr(err[4]); + } +} + +hits_gen Hits_gen(const unsigned int& n, const Matrix& 
gen_par) { + hits_gen gen; + gen.hits = MatrixXd::Zero(3, n); + gen.hits_cov = MatrixXd::Zero(3 * n, 3 * n); + // err /= 10000.; + constexpr double rad[8] = {2.95, 6.8, 10.9, 16., 3.1, 7., 11., 16.2}; + // constexpr double R_err[8] = {5./10000, 5./10000, 5./10000, 5./10000, 5./10000, + // 5./10000, 5./10000, 5./10000}; constexpr double Rp_err[8] = {35./10000, 18./10000, + // 15./10000, 34./10000, 35./10000, 18./10000, 15./10000, 34./10000}; constexpr double z_err[8] = + // {72./10000, 38./10000, 25./10000, 56./10000, 72./10000, 38./10000, 25./10000, 56./10000}; + constexpr double R_err[8] = {10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, + 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000}; + constexpr double Rp_err[8] = {35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000, + 35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000}; + constexpr double z_err[8] = {72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000, + 72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000}; + const double x2 = gen_par(0) + gen_par(4) * cos(gen_par(3) * pi / 180); + const double y2 = gen_par(1) + gen_par(4) * sin(gen_par(3) * pi / 180); + const double alpha = atan2(y2, x2); + + for (unsigned int i = 0; i < n; ++i) { + const double a = gen_par(4); + const double b = rad[i]; + const double c = sqrt(Rfit::sqr(x2) + Rfit::sqr(y2)); + const double beta = acos((Rfit::sqr(a) - Rfit::sqr(b) - Rfit::sqr(c)) / (-2. * b * c)); + const double gamma = alpha + beta; + gen.hits(0, i) = rad[i] * cos(gamma); + gen.hits(1, i) = rad[i] * sin(gamma); + gen.hits(2, i) = gen_par(2) + 1 / tan(gen_par(5) * pi / 180) * 2. * + asin(sqrt(Rfit::sqr((gen_par(0) - gen.hits(0, i))) + + Rfit::sqr((gen_par(1) - gen.hits(1, i)))) / + (2. * gen_par(4))) * + gen_par(4); + // isbarrel(i) = ?? + Vector5d err; + err << R_err[i], Rp_err[i], z_err[i], 0, 0; + smearing(err, true, gen.hits(0, i), gen.hits(1, i), gen.hits(2, i)); + Hits_cov(gen.hits_cov, i, n, gen.hits, err, true); + } + + return gen; +} + +Vector5d True_par(const Matrix& gen_par, const int& charge, const double& B_field) { + Vector5d true_par; + const double x0 = gen_par(0) + gen_par(4) * cos(gen_par(3) * pi / 180); + const double y0 = gen_par(1) + gen_par(4) * sin(gen_par(3) * pi / 180); + circle_fit circle; + circle.par << x0, y0, gen_par(4); + circle.q = 1; + Rfit::par_uvrtopak(circle, B_field, false); + true_par.block(0, 0, 3, 1) = circle.par; + true_par(3) = 1 / tan(gen_par(5) * pi / 180); + const int dir = ((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1)) * (gen_par(1) - y0) - + (gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)) * (gen_par(0) - x0) > + 0) + ? 
-1 + : 1; + true_par(4) = gen_par(2) + + 1 / tan(gen_par(5) * pi / 180) * dir * 2.f * + asin(sqrt(Rfit::sqr((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1))) + + Rfit::sqr((gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)))) / + (2.f * gen_par(4))) * + gen_par(4); + return true_par; +} + +Matrix New_par(const Matrix& gen_par, const int& charge, + const double& B_field) { + Matrix new_par; + new_par.block(0, 0, 3, 1) = gen_par.block(0, 0, 3, 1); + new_par(3) = gen_par(3) - charge * 90; + new_par(4) = gen_par(4) / B_field; +// new_par(5) = atan(sinh(gen_par(5))) * 180 / pi; + new_par(5) = 2.*atan(exp(-gen_par(5))) * 180 / pi; + return new_par; +} + +void test_helix_fit() { + int n_; + int iteration; + int debug2 = 0; + bool return_err; + const double B_field = 3.8 * c_speed / pow(10, 9) / 100; + Matrix gen_par; + Vector5d true_par; + Vector5d err; +// while (1) { + generator.seed(1); + int debug = 0; + debug2 = 0; + std::cout << std::setprecision(6); + cout << "_________________________________________________________________________\n"; + cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration return_err debug" << endl; +// cin >> n_ >> gen_par(0) >> gen_par(1) >> gen_par(2) >> gen_par(3) >> gen_par(4) >> gen_par(5) >> +// iteration >> return_err >> debug2; + n_ = 4; + gen_par(0) = -0.1; // x + gen_par(1) = 0.1; // y + gen_par(2) = -1.; // z + gen_par(3) = 45.; // phi + gen_par(4) = 10.; // R (p_t) + gen_par(5) = 1.; // eta + iteration = 1; + return_err = 1; + debug2 = 1; + + iteration *= 10; + gen_par = New_par(gen_par, 1, B_field); + true_par = True_par(gen_par, 1, B_field); + Matrix3xNd hits; + Matrix3Nd hits_cov; + unique_ptr helix(new helix_fit[iteration]); +// helix_fit* helix = new helix_fit[iteration]; + Matrix score(41, iteration); + + for (int i = 0; i < iteration; i++) { + if (debug2 == 1 && i == (iteration - 1)) { + debug = 1; + } + hits_gen gen; + gen = Hits_gen(n_, gen_par); +// gen.hits = MatrixXd::Zero(3, 4); +// gen.hits_cov = MatrixXd::Zero(3 * 4, 3 * 4); +// gen.hits.col(0) << 1.82917642593, 2.0411875248, 7.18495464325; +// gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; +// gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; +// gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; + helix[i] = Rfit::Helix_fit(gen.hits, gen.hits_cov, B_field, return_err, false); + + if (debug) + cout << std::setprecision(10) + << "phi: " << helix[i].par(0) << " +/- " << sqrt(helix[i].cov(0, 0)) << " vs " + << true_par(0) << endl + << "Tip: " << helix[i].par(1) << " +/- " << sqrt(helix[i].cov(1, 1)) << " vs " + << true_par(1) << endl + << "p_t: " << helix[i].par(2) << " +/- " << sqrt(helix[i].cov(2, 2)) << " vs " + << true_par(2) << endl + << "theta:" << helix[i].par(3) << " +/- " << sqrt(helix[i].cov(3, 3)) << " vs " + << true_par(3) << endl + << "Zip: " << helix[i].par(4) << " +/- " << sqrt(helix[i].cov(4, 4)) << " vs " + << true_par(4) << endl + << "charge:" << helix[i].q << " vs 1" << endl + << "covariance matrix:" << endl + << helix[i].cov << endl + << "Initial hits:\n" << gen.hits << endl + << "Initial Covariance:\n" << gen.hits_cov << endl; + } + + for (int x = 0; x < iteration; x++) { + // Compute PULLS information + score(0, x) = (helix[x].par(0) - true_par(0)) / sqrt(helix[x].cov(0, 0)); + score(1, x) = (helix[x].par(1) - true_par(1)) / sqrt(helix[x].cov(1, 1)); + score(2, x) = (helix[x].par(2) - true_par(2)) / sqrt(helix[x].cov(2, 2)); + score(3, x) = (helix[x].par(3) - true_par(3)) / sqrt(helix[x].cov(3, 3)); + score(4, x) 
= (helix[x].par(4) - true_par(4)) / sqrt(helix[x].cov(4, 4)); + score(5, x) = + (helix[x].par(0) - true_par(0)) * (helix[x].par(1) - true_par(1)) / (helix[x].cov(0, 1)); + score(6, x) = + (helix[x].par(0) - true_par(0)) * (helix[x].par(2) - true_par(2)) / (helix[x].cov(0, 2)); + score(7, x) = + (helix[x].par(1) - true_par(1)) * (helix[x].par(2) - true_par(2)) / (helix[x].cov(1, 2)); + score(8, x) = + (helix[x].par(3) - true_par(3)) * (helix[x].par(4) - true_par(4)) / (helix[x].cov(3, 4)); + score(9, x) = helix[x].chi2_circle; + score(25, x) = helix[x].chi2_line; + score(10, x) = sqrt(helix[x].cov(0, 0)) / helix[x].par(0) * 100; + score(13, x) = sqrt(helix[x].cov(3, 3)) / helix[x].par(3) * 100; + score(14, x) = sqrt(helix[x].cov(4, 4)) / helix[x].par(4) * 100; + score(15, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(3) - true_par(3)) / + sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(3, 3)); + score(16, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(3) - true_par(3)) / + sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(3, 3)); + score(17, x) = (helix[x].par(2) - true_par(2)) * (helix[x].par(3) - true_par(3)) / + sqrt(helix[x].cov(2, 2)) / sqrt(helix[x].cov(3, 3)); + score(18, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(4) - true_par(4)) / + sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(4, 4)); + score(19, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(4) - true_par(4)) / + sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(4, 4)); + score(20, x) = (helix[x].par(2) - true_par(2)) * (helix[x].par(4) - true_par(4)) / + sqrt(helix[x].cov(2, 2)) / sqrt(helix[x].cov(4, 4)); + score(21, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(1) - true_par(1)) / + sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(1, 1)); + score(22, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(2) - true_par(2)) / + sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(2, 2)); + score(23, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(2) - true_par(2)) / + sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(2, 2)); + score(24, x) = (helix[x].par(3) - true_par(3)) * (helix[x].par(4) - true_par(4)) / + sqrt(helix[x].cov(3, 3)) / sqrt(helix[x].cov(4, 4)); + } + + double phi_ = score.row(0).mean(); + double a_ = score.row(1).mean(); + double pt_ = score.row(2).mean(); + double coT_ = score.row(3).mean(); + double Zip_ = score.row(4).mean(); + Matrix5d correlation; + correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), + score.row(20).mean(), score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), + score.row(19).mean(), score.row(22).mean(), score.row(23).mean(), 1., score.row(17).mean(), + score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), score.row(17).mean(), 1., + score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), + score.row(24).mean(), 1.; + + cout << "\nPULLS:\n" + << "phi: " << phi_ << " " + << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " + << abs(score.row(10).mean()) << "%\n" + << "a0 : " << a_ << " " + << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " + << abs(score.row(11).mean()) << "%\n" + << "pt : " << pt_ << " " + << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " + << abs(score.row(12).mean()) << "%\n" + << "coT: " << coT_ << " " + << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " + << abs(score.row(13).mean()) << "%\n" + << "Zip: " << Zip_ << " " + << sqrt((score.row(4).array() - 
Zip_).square().sum() / (iteration - 1)) << " " + << abs(score.row(14).mean()) << "%\n\n" + << "cov(phi,a0)_: " << score.row(5).mean() << "\n" + << "cov(phi,pt)_: " << score.row(6).mean() << "\n" + << "cov(a0,pt)_: " << score.row(7).mean() << "\n" + << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" + << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" + << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" + << "correlation matrix:\n" + << correlation << "\n\n" + << endl; +// } +} + +int main() { + test_helix_fit(); + return 0; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu new file mode 100644 index 0000000000000..814e278690957 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -0,0 +1,209 @@ +#include "test_common.h" +#include + +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#include +#include + +using namespace Eigen; + +__global__ void kernelFullFit(Rfit::Matrix3xNd * hits, Rfit::Matrix3Nd * hits_cov, + double B, Rfit::circle_fit * circle_fit_resultsGPU, Rfit::line_fit * line_fit_resultsGPU) { + Vector4d fast_fit = Rfit::Fast_fit(*hits); + + u_int n = hits->cols(); + Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); + + (*circle_fit_resultsGPU) = + Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), + fast_fit, rad, B, false, false); + + (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, true); + + return; +} + +__global__ void kernelFastFit(Rfit::Matrix3xNd * hits, Vector4d * results) { + (*results) = Rfit::Fast_fit(*hits); +} + +__global__ void kernelCircleFit(Rfit::Matrix3xNd * hits, + Rfit::Matrix3Nd * hits_cov, Vector4d * fast_fit_input, double B, + Rfit::circle_fit * circle_fit_resultsGPU) { + u_int n = hits->cols(); + Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); + + if (!NODEBUG) { + printf("fast_fit_input(0): %f\n", (*fast_fit_input)(0)); + printf("fast_fit_input(1): %f\n", (*fast_fit_input)(1)); + printf("fast_fit_input(2): %f\n", (*fast_fit_input)(2)); + printf("fast_fit_input(3): %f\n", (*fast_fit_input)(3)); + printf("rad(0,0): %f\n", rad(0,0)); + printf("rad(1,1): %f\n", rad(1,1)); + printf("rad(2,2): %f\n", rad(2,2)); + printf("hits_cov(0,0): %f\n", (*hits_cov)(0,0)); + printf("hits_cov(1,1): %f\n", (*hits_cov)(1,1)); + printf("hits_cov(2,2): %f\n", (*hits_cov)(2,2)); + printf("hits_cov(11,11): %f\n", (*hits_cov)(11,11)); + printf("B: %f\n", B); + } + (*circle_fit_resultsGPU) = + Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), + *fast_fit_input, rad, B, false, false); +} + +__global__ void kernelLineFit(Rfit::Matrix3xNd * hits, + Rfit::Matrix3Nd * hits_cov, + Rfit::circle_fit * circle_fit, + Vector4d * fast_fit, + Rfit::line_fit * line_fit) { + (*line_fit) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit, *fast_fit, true); +} + +void fillHitsAndHitsCov(Rfit::Matrix3xNd & hits, Rfit::Matrix3Nd & hits_cov) { + hits << 1.98645, 4.72598, 7.65632, 11.3151, + 2.18002, 4.88864, 7.75845, 11.3134, + 2.46338, 6.99838, 11.808, 17.793; + hits_cov(0,0) = 7.14652e-06; + hits_cov(1,1) = 2.15789e-06; + hits_cov(2,2) = 1.63328e-06; + hits_cov(3,3) = 6.27919e-06; + hits_cov(4,4) = 6.10348e-06; + hits_cov(5,5) = 2.08211e-06; + hits_cov(6,6) = 1.61672e-06; + hits_cov(7,7) = 6.28081e-06; + hits_cov(8,8) = 5.184e-05; + hits_cov(9,9) = 1.444e-05; + hits_cov(10,10) = 6.25e-06; + hits_cov(11,11) 
= 3.136e-05; + hits_cov(0,4) = hits_cov(4,0) = -5.60077e-06; + hits_cov(1,5) = hits_cov(5,1) = -1.11936e-06; + hits_cov(2,6) = hits_cov(6,2) = -6.24945e-07; + hits_cov(3,7) = hits_cov(7,3) = -5.28e-06; +} + +void testFit() { + constexpr double B = 0.0113921; + Rfit::Matrix3xNd hits(3,4); + Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); + Rfit::Matrix3xNd * hitsGPU = new Rfit::Matrix3xNd(3,4); + Rfit::Matrix3Nd * hits_covGPU = nullptr; + Vector4d * fast_fit_resultsGPU = new Vector4d(); + Vector4d * fast_fit_resultsGPUret = new Vector4d(); + Rfit::circle_fit * circle_fit_resultsGPU = new Rfit::circle_fit(); + Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); + + fillHitsAndHitsCov(hits, hits_cov); + + // FAST_FIT_CPU + Vector4d fast_fit_results = Rfit::Fast_fit(hits); + if (!NODEBUG) { + std::cout << "Generated hits:\n" << hits << std::endl; + } + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; + + // FAST_FIT GPU + cudaMalloc((void**)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4))); + cudaMalloc((void**)&fast_fit_resultsGPU, sizeof(Vector4d)); + cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice); + + kernelFastFit<<<1, 1>>>(hitsGPU, fast_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, sizeof(Vector4d), cudaMemcpyDeviceToHost); + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << *fast_fit_resultsGPUret << std::endl; + assert(isEqualFuzzy(fast_fit_results, (*fast_fit_resultsGPUret))); + + // CIRCLE_FIT CPU + u_int n = hits.cols(); + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + + Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), + hits_cov.block(0, 0, 2 * n, 2 * n), + fast_fit_results, rad, B, false, false); + std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; + + // CIRCLE_FIT GPU + cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12))); + cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit)); + cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice); + + kernelCircleFit<<<1,1>>>(hitsGPU, hits_covGPU, + fast_fit_resultsGPU, B, circle_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, + sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); + std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); + + // LINE_FIT CPU + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, fast_fit_results, true); + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; + + // LINE_FIT GPU + Rfit::line_fit * line_fit_resultsGPU = nullptr; + Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); + + cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit)); + + kernelLineFit<<<1,1>>>(hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); + std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); +} + +void testFitOneGo() { + constexpr double B = 0.0113921; + Rfit::Matrix3xNd 
hits(3,4); + Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); + + fillHitsAndHitsCov(hits, hits_cov); + + // FAST_FIT_CPU + Vector4d fast_fit_results = Rfit::Fast_fit(hits); + // CIRCLE_FIT CPU + u_int n = hits.cols(); + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + + Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), + hits_cov.block(0, 0, 2 * n, 2 * n), + fast_fit_results, rad, B, false, false); + // LINE_FIT CPU + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, fast_fit_results, true); + + // FIT GPU + Rfit::Matrix3xNd * hitsGPU = new Rfit::Matrix3xNd(3,4); + Rfit::Matrix3Nd * hits_covGPU = nullptr; + Rfit::line_fit * line_fit_resultsGPU = nullptr; + Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); + Rfit::circle_fit * circle_fit_resultsGPU = new Rfit::circle_fit(); + Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); + + cudaMalloc((void**)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4))); + cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12))); + cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit)); + cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit)); + cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice); + cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice); + + kernelFullFit<<<1, 1>>>(hitsGPU, hits_covGPU, B, circle_fit_resultsGPU, line_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, + sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); + cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); + + std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; + std::cout << "Fitted values (LineFit): GPU\n" << line_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); +} + +int main (int argc, char * argv[]) { + testFit(); + testFitOneGo(); + return 0; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu new file mode 100644 index 0000000000000..210b10cd14ed1 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -0,0 +1,169 @@ +#include "test_common.h" +#include + +#include +#include + + +using namespace Eigen; + +__host__ __device__ void eigenValues(Matrix3d * m, Eigen::SelfAdjointEigenSolver::RealVectorType * ret) { + if (!NODEBUG) { + printf("Matrix(0,0): %f\n", (*m)(0,0)); + printf("Matrix(1,1): %f\n", (*m)(1,1)); + printf("Matrix(2,2): %f\n", (*m)(2,2)); + } + SelfAdjointEigenSolver es; + es.computeDirect(*m); + (*ret) = es.eigenvalues(); + return; +} + +__global__ void kernel(Matrix3d * m, Eigen::SelfAdjointEigenSolver::RealVectorType * ret) { + eigenValues(m, ret); +} + +__global__ void kernelInverse(Matrix3d * in, Matrix3d * out) { +// (*out) = in->inverse(); +} + +template +__global__ void kernelMultiply(M1 * J, + M2 * C, + M3 * result) { +// Map res(result->data()); + if (!NODEBUG) + printf("*** GPU IN ***\n"); + printIt(J); + printIt(C); +// res.noalias() = (*J) * (*C); +// printIt(&res); + (*result) = (*J) * (*C); + if (!NODEBUG) + printf("*** GPU OUT ***\n"); + return; +} + +template +void testMultiply() { + std::cout << "TEST MULTIPLY" << std::endl; + 
std::cout << "Product of type " << row1 << "x" << col1 + << " * " << row2 << "x" << col2 << std::endl; + Eigen::Matrix J; + fillMatrix(J); + Eigen::Matrix C; + fillMatrix(C); + Eigen::Matrix multiply_result = J * C; + if (!NODEBUG) { + std::cout << "Input J:" << std::endl; printIt(&J); + std::cout << "Input C:" << std::endl; printIt(&C); + std::cout << "Output:" << std::endl; + printIt(&multiply_result); + } + // GPU + Eigen::Matrix *JGPU = nullptr; + Eigen::Matrix *CGPU = nullptr; + Eigen::Matrix *multiply_resultGPU = nullptr; + Eigen::Matrix *multiply_resultGPUret = new Eigen::Matrix(); + + cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix)); + cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix)); + cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix)); + cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); + cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); + cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); + + kernelMultiply<<<1,1>>>(JGPU, CGPU, multiply_resultGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(multiply_resultGPUret, multiply_resultGPU, + sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost); + printIt(multiply_resultGPUret); + assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret))); +} + +void testInverse() { + std::cout << "TEST INVERSE" << std::endl; + Matrix3d m = Matrix3d::Random(); + Matrix3d *mGPU = nullptr; + Matrix3d *mGPUret = nullptr; + Matrix3d *mCPUret = new Matrix3d(); + + if (!NODEBUG) { + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; + } + cudaMalloc((void **)&mGPU, sizeof(Matrix3d)); + cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)); + cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); + + kernelInverse<<<1,1>>>(mGPU, mGPUret); + cudaDeviceSynchronize(); + + cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost); + if (!NODEBUG) + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +} + +void testEigenvalues() { + std::cout << "TEST EIGENVALUES" << std::endl; + Matrix3d m = Matrix3d::Random(); + Matrix3d mt = m.transpose(); + m += mt; + Matrix3d * m_gpu = nullptr; + Matrix3d * mgpudebug = new Matrix3d(); + Eigen::SelfAdjointEigenSolver::RealVectorType *ret = new Eigen::SelfAdjointEigenSolver::RealVectorType; + Eigen::SelfAdjointEigenSolver::RealVectorType *ret1 = new Eigen::SelfAdjointEigenSolver::RealVectorType; + Eigen::SelfAdjointEigenSolver::RealVectorType *ret_gpu = nullptr; + eigenValues(&m, ret); + if (!NODEBUG) { + std::cout << "Generated Matrix M 3x3:\n" << m << std::endl; + std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl; + std::cout << "*************************\n\n" << std::endl; + } + cudaMalloc((void **)&m_gpu, sizeof(Matrix3d)); + cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType)); + cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); + + kernel<<<1,1>>>(m_gpu, ret_gpu); + cudaDeviceSynchronize(); + + cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost); + cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost); + if (!NODEBUG) { + std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; + std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; + std::cout << "*************************\n\n" << std::endl; + } + 
assert(isEqualFuzzy(*ret, *ret1)); +} + + +int main (int argc, char * argv[]) { + + testEigenvalues(); + testInverse(); + testMultiply<1, 2, 2, 1>(); + testMultiply<1, 2, 2, 2>(); + testMultiply<1, 2, 2, 3>(); + testMultiply<1, 2, 2, 4>(); + testMultiply<1, 2, 2, 5>(); + testMultiply<2, 1, 1, 2>(); + testMultiply<2, 1, 1, 3>(); + testMultiply<2, 1, 1, 4>(); + testMultiply<2, 1, 1, 5>(); + testMultiply<2, 2, 2, 2>(); + testMultiply<2, 3, 3, 1>(); + testMultiply<2, 3, 3, 2>(); + testMultiply<2, 3, 3, 4>(); + testMultiply<2, 3, 3, 5>(); + testMultiply<3, 2, 2, 3>(); + testMultiply<2, 3, 3, 3>(); // DOES NOT COMPILE W/O PATCHING EIGEN + testMultiply<3, 3, 3, 3>(); + testMultiply<8, 8, 8, 8>(); + testMultiply<3, 4, 4, 3>(); + testMultiply<2, 4, 4, 2>(); + testMultiply<3, 4, 4, 2>(); // DOES NOT COMPILE W/O PATCHING EIGEN + + return 0; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h new file mode 100644 index 0000000000000..0290ee2db641b --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h @@ -0,0 +1,53 @@ +#ifndef RecoPixelVertexing__PixelTrackFitting__test_common_h +#define RecoPixelVertexing__PixelTrackFitting__test_common_h + +#include +#include +#include + +#define NODEBUG 1 + +template +__host__ __device__ void printIt(C * m) { + if (!NODEBUG) { + printf("\nMatrix %dx%d\n", (int)m->rows(), (int)m->cols()); + for (u_int r = 0; r < m->rows(); ++r) { + for (u_int c = 0; c < m->cols(); ++c) { + printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r,c)); + } + } + } +} + +template +bool isEqualFuzzy(C a, C b) { + constexpr double epsilon = 1e-6; + for (unsigned int i = 0; i < a.rows(); ++i) { + for (unsigned int j = 0; j < a.cols(); ++j) { + assert(std::abs(a(i,j)-b(i,j)) + < std::min(std::abs(a(i,j)), std::abs(b(i,j)))*epsilon); + } + } + return true; +} + +bool isEqualFuzzy(double a, double b) { + constexpr double epsilon = 1e-6; + return std::abs(a-b) < std::min(std::abs(a), std::abs(b))*epsilon; +} + +template +void fillMatrix(T & t) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0.0, 2.0); + for (int row = 0; row < t.rows(); ++row) { + for (int col = 0; col < t.cols(); ++col) { + t(row, col) = dis(gen); + } + } + return; +} + + +#endif From 7dc668231d4c3acbcdff0cfd1ea2c8399a88e18c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 4 Jun 2018 11:26:15 +0200 Subject: [PATCH 003/102] Implement a Heterogeneous version of Raw2Cluster and RecHit (cms-patatrack#62) - reorganize `SiPixelRawToDigi` as `SiPixelRawToDigiHeterogeneous` using `HeterogeneousEDProducer` - output a `HeterogeneousEvent` - use `PixelThresholdClusterizer` - add `SiPixelDigiHeterogeneousConverter` - make cabling and gain transfers asynchronous - reorganize `SiPixelRecHits` as `SiPixelRecHitHeterogeneous` - move `PixelThresholdClusterizer` (back?) 
to interface+src in order to use it outside of RecoLocalTracker/SiPixelClusterizer - replace __host__ __device__ with constexpr to avoid weird compilation failures - split clusters to their own converter --- DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py | 3 --- Validation/RecoTrack/python/TrackValidation_cff.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py index 711d757c94311..c91dc2b2730de 100644 --- a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py +++ b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py @@ -1,5 +1,4 @@ import FWCore.ParameterSet.Config as cms -from Configuration.ProcessModifiers.gpu_cff import gpu import DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi pixelTracksMonitoring = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone() @@ -21,5 +20,3 @@ pixelTracksMonitoring.doPlotsVsGoodPVtx = True pixelTracksMonitoring.doPlotsVsLUMI = True pixelTracksMonitoring.doPlotsVsBX = True - -gpu.toModify(pixelTracksMonitoring, pixelCluster4lumi = "siPixelDigis") diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index 87b1706b026aa..550aa57bf2669 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -21,7 +21,6 @@ import RecoTracker.IterativeTracking.iterativeTkConfig as _cfg import RecoTracker.IterativeTracking.iterativeTkUtils as _utils from Configuration.Eras.Modifier_fastSim_cff import fastSim -from Configuration.ProcessModifiers.gpu_cff import gpu ### First define the stuff for the standard validation sequence ## Track selectors @@ -713,7 +712,6 @@ def _uniqueFirstLayers(layerList): tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone( pixelClusterSrc = "siPixelClustersPreSplitting" ) -gpu.toModify(tpClusterProducerPixelTrackingOnly, pixelClusterSrc = "siPixelDigis") quickTrackAssociatorByHitsPixelTrackingOnly = quickTrackAssociatorByHits.clone( cluster2TPSrc = "tpClusterProducerPixelTrackingOnly" From b117d17b7858a993408fbd9f7cdff8f007e8bb67 Mon Sep 17 00:00:00 2001 From: Felice Date: Thu, 14 Jun 2018 19:08:28 +0200 Subject: [PATCH 004/102] Heterogeneous Cellular Automaton for pixel tracks Port the Cellular Automaton (back) to GPUs and CUDA, using the `HeterogeneousEDProducer` approach: - do memory allocations in the framework begin stream - run the memory copies and kernels asynchronously, in a dedicated CUDA stream per framework stream Use the new GPU::VecArray for holding repeated data structures. By default, run on the GPU in all gpu-enable workflows (e.g. 10824.8). 
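A minimal sketch of the per-stream asynchronous pattern described above, under the assumption of a single hit buffer and a single kernel; the names ScratchBuffers, produceDoublets, beginStream and produce are illustrative only and are not the actual HeterogeneousEDProducer or CAHitQuadrupletGeneratorGPU interfaces. Scratch memory is allocated once when the framework stream starts, and each event then only enqueues copies and kernels on that stream's dedicated cudaStream_t.

#include <cuda_runtime.h>

struct ScratchBuffers {              // hypothetical per-framework-stream state
  float* hits_d = nullptr;
  int* ntuplets_d = nullptr;
  cudaStream_t stream = nullptr;
};

__global__ void produceDoublets(const float* hits, int nHits, int* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < nHits) out[i] = i;         // placeholder for the real CA kernels
}

void beginStream(ScratchBuffers& s, int maxHits) {
  // allocations happen once, in the framework "begin stream" transition
  cudaStreamCreate(&s.stream);
  cudaMalloc(&s.hits_d, maxHits * sizeof(float));
  cudaMalloc(&s.ntuplets_d, maxHits * sizeof(int));
}

void produce(ScratchBuffers& s, const float* hits_h, int nHits, int* ntuplets_h) {
  // per-event work: asynchronous H2D copy, kernel launch, asynchronous D2H copy,
  // all enqueued on the CUDA stream dedicated to this framework stream
  // (hits_h and ntuplets_h should be pinned host memory for truly async copies)
  cudaMemcpyAsync(s.hits_d, hits_h, nHits * sizeof(float), cudaMemcpyHostToDevice, s.stream);
  produceDoublets<<<(nHits + 255) / 256, 256, 0, s.stream>>>(s.hits_d, nHits, s.ntuplets_d);
  cudaMemcpyAsync(ntuplets_h, s.ntuplets_d, nHits * sizeof(int), cudaMemcpyDeviceToHost, s.stream);
  cudaStreamSynchronize(s.stream);   // the real producer defers this to the framework instead
}

int main() {
  ScratchBuffers s;
  beginStream(s, 1024);
  float hits_h[4] = {1.f, 2.f, 3.f, 4.f};
  int ntuplets_h[4];
  produce(s, hits_h, 4, ntuplets_h);
  return 0;
}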
--- .../PixelTriplets/plugins/BuildFile.xml | 25 +- .../PixelTriplets/plugins/GPUCACell.h | 273 ++++++++++++++++++ .../python/caHitQuadrupletEDProducer_cfi.py | 8 + 3 files changed, 297 insertions(+), 9 deletions(-) create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h create mode 100644 RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index f76451675de59..b0bca04309c4c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,10 +1,17 @@ - - - - - - + + + + + + + + + + + + + + + - - - + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h new file mode 100644 index 0000000000000..14d8ee833ce71 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -0,0 +1,273 @@ +// +// Author: Felice Pantaleo, CERN +// +#ifndef GPU_CACELL_H_ +#define GPU_CACELL_H_ + +#include "GPUHitsAndDoublets.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include +struct Quadruplet { + int2 layerPairsAndCellId[3]; +}; + +class GPUCACell { +public: + __host__ __device__ GPUCACell() {} + + __host__ __device__ void init(const GPULayerDoublets *doublets, + const GPULayerHits *hitsOnLayer, + int layerPairId, int doubletId, int innerHitId, + int outerHitId, float regionX, float regionY) { + + theInnerHitId = innerHitId; + theOuterHitId = outerHitId; + theDoubletId = doubletId; + theLayerPairId = layerPairId; + + auto innerLayerId = doublets->innerLayerId; + auto outerLayerId = doublets->outerLayerId; + + theInnerX = hitsOnLayer[innerLayerId].x[innerHitId]; + theOuterX = hitsOnLayer[outerLayerId].x[outerHitId]; + + theInnerY = hitsOnLayer[innerLayerId].y[innerHitId]; + theOuterY = hitsOnLayer[outerLayerId].y[outerHitId]; + + theInnerZ = hitsOnLayer[innerLayerId].z[innerHitId]; + theOuterZ = hitsOnLayer[outerLayerId].z[outerHitId]; + theInnerR = hypot(theInnerX - regionX, theInnerY - regionY); + theOuterR = hypot(theOuterX - regionX, theOuterY - regionY); + theOuterNeighbors.reset(); + } + + constexpr float get_inner_x() const { return theInnerX; } + constexpr float get_outer_x() const { return theOuterX; } + constexpr float get_inner_y() const { return theInnerY; } + constexpr float get_outer_y() const { return theOuterY; } + constexpr float get_inner_z() const { return theInnerZ; } + constexpr float get_outer_z() const { return theOuterZ; } + constexpr float get_inner_r() const { return theInnerR; } + constexpr float get_outer_r() const { return theOuterR; } + constexpr unsigned int get_inner_hit_id() const { + return theInnerHitId; + } + constexpr unsigned int get_outer_hit_id() const { + return theOuterHitId; + } + + constexpr void print_cell() const { + + printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " + "%d, innerradius %f, outerRadius %f \n", + theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId, + theInnerR, theOuterR); + } + + + + __host__ __device__ bool check_alignment_and_tag( + const GPUCACell *cells, unsigned int innerCellId, const float ptmin, + const float region_origin_x, const float region_origin_y, + const float region_origin_radius, const float thetaCut, + const float phiCut, const float hardPtCut) { + auto ro = get_outer_r(); + auto zo = 
get_outer_z(); + const auto &otherCell = cells[innerCellId]; + + auto r1 = otherCell.get_inner_r(); + auto z1 = otherCell.get_inner_z(); + bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut); + return (aligned && + haveSimilarCurvature(cells, innerCellId, ptmin, region_origin_x, + region_origin_y, region_origin_radius, phiCut, + hardPtCut)); + } + + + constexpr bool areAlignedRZ(float r1, float z1, float ro, float zo, + const float ptmin, + const float thetaCut) const { + float radius_diff = std::abs(r1 - ro); + float distance_13_squared = + radius_diff * radius_diff + (z1 - zo) * (z1 - zo); + + float pMin = + ptmin * std::sqrt(distance_13_squared); // this needs to be divided by + // radius_diff later + + float tan_12_13_half_mul_distance_13_squared = + fabs(z1 * (get_inner_r() - ro) + get_inner_z() * (ro - r1) + zo * (r1 - get_inner_r())); + return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; + } + + constexpr bool + haveSimilarCurvature(const GPUCACell *cells, unsigned int innerCellId, + const float ptmin, const float region_origin_x, + const float region_origin_y, + const float region_origin_radius, const float phiCut, + const float hardPtCut) const { + + const auto &otherCell = cells[innerCellId]; + + auto x1 = otherCell.get_inner_x(); + auto y1 = otherCell.get_inner_y(); + + auto x2 = get_inner_x(); + auto y2 = get_inner_y(); + + auto x3 = get_outer_x(); + auto y3 = get_outer_y(); + + float distance_13_squared = (x1 - x3) * (x1 - x3) + (y1 - y3) * (y1 - y3); + float tan_12_13_half_mul_distance_13_squared = + fabs(y1 * (x2 - x3) + y2 * (x3 - x1) + y3 * (x1 - x2)); + // high pt : just straight + if (tan_12_13_half_mul_distance_13_squared * ptmin <= + 1.0e-4f * distance_13_squared) { + + float distance_3_beamspot_squared = + (x3 - region_origin_x) * (x3 - region_origin_x) + + (y3 - region_origin_y) * (y3 - region_origin_y); + + float dot_bs3_13 = ((x1 - x3) * (region_origin_x - x3) + + (y1 - y3) * (region_origin_y - y3)); + float proj_bs3_on_13_squared = + dot_bs3_13 * dot_bs3_13 / distance_13_squared; + + float distance_13_beamspot_squared = + distance_3_beamspot_squared - proj_bs3_on_13_squared; + + return distance_13_beamspot_squared < + (region_origin_radius + phiCut) * (region_origin_radius + phiCut); + } + + // 87 cm/GeV = 1/(3.8T * 0.3) + + // take less than radius given by the hardPtCut and reject everything below + float minRadius = hardPtCut * 87.f; // FIXME move out and use real MagField + + auto det = (x1 - x2) * (y2 - y3) - (x2 - x3) * (y1 - y2); + + auto offset = x2 * x2 + y2 * y2; + + auto bc = (x1 * x1 + y1 * y1 - offset) * 0.5f; + + auto cd = (offset - x3 * x3 - y3 * y3) * 0.5f; + + auto idet = 1.f / det; + + auto x_center = (bc * (y2 - y3) - cd * (y1 - y2)) * idet; + auto y_center = (cd * (x1 - x2) - bc * (x2 - x3)) * idet; + + auto radius = std::sqrt((x2 - x_center) * (x2 - x_center) + + (y2 - y_center) * (y2 - y_center)); + + if (radius < minRadius) + return false; // hard cut on pt + + auto centers_distance_squared = + (x_center - region_origin_x) * (x_center - region_origin_x) + + (y_center - region_origin_y) * (y_center - region_origin_y); + auto region_origin_radius_plus_tolerance = region_origin_radius + phiCut; + auto minimumOfIntersectionRange = + (radius - region_origin_radius_plus_tolerance) * + (radius - region_origin_radius_plus_tolerance); + + if (centers_distance_squared >= minimumOfIntersectionRange) { + auto maximumOfIntersectionRange = + (radius + region_origin_radius_plus_tolerance) * + 
(radius + region_origin_radius_plus_tolerance); + return centers_distance_squared <= maximumOfIntersectionRange; + } + + return false; + } + + // trying to free the track building process from hardcoded layers, leaving + // the visit of the graph based on the neighborhood connections between cells. + #if defined(__NVCC__) || defined(__CUDACC__) + + __device__ inline void find_ntuplets( + const GPUCACell *cells, + GPU::SimpleVector *foundNtuplets, + GPU::VecArray &tmpNtuplet, + const unsigned int minHitsPerNtuplet) const { + + // the building process for a track ends if: + // it has no right neighbor + // it has no compatible neighbor + // the ntuplets is then saved if the number of hits it contains is greater + // than a threshold + + + if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { + Quadruplet tmpQuadruplet; + for (unsigned int i = 0; i < minHitsPerNtuplet - 1; ++i) { + tmpQuadruplet.layerPairsAndCellId[i].x = cells[tmpNtuplet[i]].theLayerPairId; + tmpQuadruplet.layerPairsAndCellId[i].y = tmpNtuplet[i]; + } + foundNtuplets->push_back(tmpQuadruplet); + } + else { + for (int j = 0; j < theOuterNeighbors.size(); ++j) { + auto otherCell = theOuterNeighbors[j]; + tmpNtuplet.push_back_unsafe(otherCell); + cells[otherCell].find_ntuplets(cells, foundNtuplets, tmpNtuplet, + minHitsPerNtuplet); + tmpNtuplet.pop_back(); + } + } + } + +#endif + template + __host__ inline void find_ntuplets_host( + const GPUCACell *cells, + GPU::VecArray *foundNtuplets, + GPU::VecArray &tmpNtuplet, + const unsigned int minHitsPerNtuplet) const { + + Quadruplet tmpQuadruplet; + if (tmpNtuplet.size() >= minHitsPerNtuplet - 1) { + for (int i = 0; i < minHitsPerNtuplet - 1; ++i) { + tmpQuadruplet.layerPairsAndCellId[i].x = + cells[tmpNtuplet[i]].theLayerPairId; + + tmpQuadruplet.layerPairsAndCellId[i].y = tmpNtuplet[i]; + } + foundNtuplets->push_back(tmpQuadruplet); + + } + + else { + for (int j = 0; j < theOuterNeighbors.size(); ++j) { + auto otherCell = theOuterNeighbors[j]; + tmpNtuplet.push_back_unsafe(otherCell); + cells[otherCell].find_ntuplets_host(cells, foundNtuplets, tmpNtuplet, + minHitsPerNtuplet); + + tmpNtuplet.pop_back(); + } + } + } + GPU::VecArray< unsigned int, 40> theOuterNeighbors; + + int theDoubletId; + int theLayerPairId; + +private: + unsigned int theInnerHitId; + unsigned int theOuterHitId; + float theInnerX; + float theOuterX; + float theInnerY; + float theOuterY; + float theInnerZ; + float theOuterZ; + float theInnerR; + float theOuterR; +}; + +#endif /*CACELL_H_ */ diff --git a/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py b/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py new file mode 100644 index 0000000000000..8497eba9f759f --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py @@ -0,0 +1,8 @@ +import FWCore.ParameterSet.Config as cms +from RecoPixelVertexing.PixelTriplets.caHitQuadrupletDefaultEDProducer_cfi import caHitQuadrupletDefaultEDProducer as _caHitQuadrupletDefaultEDProducer + +caHitQuadrupletEDProducer = _caHitQuadrupletDefaultEDProducer.clone() + +from Configuration.ProcessModifiers.gpu_cff import gpu +from RecoPixelVertexing.PixelTriplets.caHitQuadrupletHeterogeneousEDProducer_cfi import caHitQuadrupletHeterogeneousEDProducer as _caHitQuadrupletHeterogeneousEDProducer +gpu.toReplaceWith(caHitQuadrupletEDProducer, _caHitQuadrupletHeterogeneousEDProducer) From ba46ad58f24a6078d1240ed3291c7f08f2072a24 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 18 Jun 2018 
14:38:25 +0200
Subject: [PATCH 005/102] Clean up `CAHitNtupletHeterogeneousEDProducer` (cms-patatrack#83)

Apply some cleanup to the code and formatting of `CAHitNtupletHeterogeneousEDProducer` and
`CAHitQuadrupletGeneratorGPU`, as suggested by @makortel during the review of #48:
- clean up the `BuildFile.xml`;
- remove unused data members and arguments from function calls;
- percolate the CUDA stream instead of storing it as a data member.

Also:
- add `cudaCheck` calls around memory allocations and copies;
- reduce the number of memory allocations used to set up the GPU state.

---
 .../PixelTriplets/plugins/BuildFile.xml | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
index b0bca04309c4c..8956caca42899 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
@@ -1,17 +1,16 @@
- - - - - - - - - - - - - - - + + + + + + + + + + + + + + - +

From b26350967f6631b28568ccbb847318974f63e854 Mon Sep 17 00:00:00 2001
From: Marco Rovere
Date: Mon, 18 Jun 2018 15:35:18 +0200
Subject: [PATCH 006/102] Port the Riemann fit to CUDA (cms-patatrack#60)

- the CPU Riemann fit works using all combinations of the two booleans `useErrors` and `useMultipleScattering`;
- the standalone version of the GPU Riemann fit has been updated to explore all combinations of the two booleans above: all of them work and produce identical results up to 1e-5 precision (the default 1e-6 fails when enabling multiple scattering, most likely due to matrix inversions);
- the GPU version of the Riemann fit within CMSSW works, with one fit assigned to each thread, 32 threads/warp, all dynamically computed.

Things that need a "hack":
- limit the "dynamic" size of Eigen matrices to at most 4x4, which is just fine for quadruplets; using anything wider causes errors which I *believe* are related to the stack size of threads on the GPU;
- cast the matrices to be inverted to 4x4 (this was done before the previous point; it will be reverted to check whether it is still needed, but I believe it is): this was done in order to "specialize" the `invert()` call to something that is "natively" supported by Eigen on GPU (which also brought in a few `__host__` `__device__` annotations here and there in Eigen);
- fix the alignment of the `struct` holding the results of the fit, since its size was different on GPU and CPU, causing an annoying off-by-one effect.
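As an illustration of the points above (not part of this patch), here is a minimal, self-contained sketch of the device-side pattern: a result struct with an explicit 16-byte alignment so that host and device agree on its layout, a kernel that inverts a fixed-size Eigen `Matrix4d` (the "natively" supported specialization mentioned above), and CUDA allocations and copies wrapped in an error check. The `check` macro is a hypothetical stand-in for the `cudaCheck` helper referred to above; the sketch assumes compilation with nvcc and an Eigen version whose fixed-size matrices are usable in device code.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <Eigen/Dense>

// hypothetical stand-in for the cudaCheck utility: abort on any CUDA runtime error
#define check(expr) \
  do { \
    cudaError_t err_ = (expr); \
    if (err_ != cudaSuccess) { \
      std::printf("CUDA error \"%s\" at %s:%d\n", cudaGetErrorString(err_), __FILE__, __LINE__); \
      std::exit(EXIT_FAILURE); \
    } \
  } while (0)

// explicit alignment, same idea as the aligned(16) attribute added to the fit results:
// host and device must agree on the size and layout of the struct they exchange
struct alignas(16) FitResult {
  Eigen::Matrix<double, 5, 1> par;
  double chi2;
  int q;
};
static_assert(sizeof(FitResult) % 16 == 0, "FitResult must have the same padded size everywhere");

// invert a fixed-size 4x4 matrix on the device: no dynamic memory is involved,
// and the 4x4 specialization of inverse() is available in device code
__global__ void kernelInvert4x4(const Eigen::Matrix4d* in, Eigen::Matrix4d* out) {
  *out = in->inverse();
}

int main() {
  // a well-conditioned (diagonally dominant) test matrix
  Eigen::Matrix4d m = Eigen::Matrix4d::Random() + 5. * Eigen::Matrix4d::Identity();
  Eigen::Matrix4d m_inv;

  Eigen::Matrix4d* m_d = nullptr;
  Eigen::Matrix4d* m_inv_d = nullptr;
  check(cudaMalloc((void**)&m_d, sizeof(Eigen::Matrix4d)));
  check(cudaMalloc((void**)&m_inv_d, sizeof(Eigen::Matrix4d)));
  check(cudaMemcpy(m_d, &m, sizeof(Eigen::Matrix4d), cudaMemcpyHostToDevice));

  kernelInvert4x4<<<1, 1>>>(m_d, m_inv_d);
  check(cudaGetLastError());
  check(cudaDeviceSynchronize());

  check(cudaMemcpy(&m_inv, m_inv_d, sizeof(Eigen::Matrix4d), cudaMemcpyDeviceToHost));
  std::printf("|m * m_inv - I| = %g\n", (m * m_inv - Eigen::Matrix4d::Identity()).norm());

  check(cudaFree(m_d));
  check(cudaFree(m_inv_d));
  return 0;
}

Keeping every matrix at a fixed, compile-time size is what avoids dynamic allocations inside the kernel; this is also why `max_nop` is reduced to 4 in the diff below.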
--- .../PixelTrackFitting/interface/RiemannFit.h | 243 +++++++++++++++--- .../plugins/PixelTrackProducer.cc | 66 +++-- .../plugins/PixelTrackProducer.h | 13 +- .../python/PixelTracks_cff.py | 2 + .../PixelTrackFitting/test/BuildFile.xml | 2 - .../PixelTrackFitting/test/testEigenGPU.cu | 103 ++++++-- .../test/testEigenGPUNoFit.cu | 45 +++- .../PixelTrackFitting/test/test_common.h | 6 +- 8 files changed, 369 insertions(+), 111 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index d545f78274819..ed1efc8a3240b 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -11,12 +11,14 @@ #define CUDA_HOSTDEV #endif +#define DEBUG 0 + namespace Rfit { using namespace Eigen; constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) -constexpr unsigned int max_nop = 8; //!< In order to avoid use of dynamic memory +constexpr unsigned int max_nop = 4; //!< In order to avoid use of dynamic memory using MatrixNd = Eigen::Matrix; using ArrayNd = Eigen::Array; @@ -45,7 +47,7 @@ struct circle_fit { |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n |cov(X0, R)|cov(Y0, R)|cov( R, R)| */ - int q; //!< particle charge + int64_t q; //!< particle charge double chi2; }; @@ -69,19 +71,21 @@ struct helix_fit { |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| */ - int q; //!< particle charge double chi2_circle; double chi2_line; Vector4d fast_fit; - VectorXd time; // TO FIX just for profiling -}; + int64_t q; //!< particle charge + // VectorXd time; // TO FIX just for profiling +} __attribute__ ((aligned(16)) ); template -CUDA_HOSTDEV void printIt(C * m) { +CUDA_HOSTDEV void printIt(C * m, const char * prefix = "", bool debug=false) { for (u_int r = 0; r < m->rows(); ++r) { for (u_int c = 0; c < m->cols(); ++c) { - printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r,c)); + if (debug) { + printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r,c)); + } } } } @@ -126,7 +130,10 @@ CUDA_HOSTDEV inline double cross2D(const Vector2d& a, const Vector2d& b) { */ // X in input TO FIX -CUDA_HOSTDEV MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, const Vector4d& fast_fit, VectorNd const & rad, double B) { +CUDA_HOSTDEV inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, + const Vector4d& fast_fit, + VectorNd const & rad, + double B) { u_int n = p2D.cols(); double X = 0.04; double theta = atan(fast_fit(3)); @@ -144,6 +151,7 @@ CUDA_HOSTDEV MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, const Vector4d& fas } } } + Rfit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: ", DEBUG); return scatter_cov_rad; } @@ -160,9 +168,14 @@ CUDA_HOSTDEV MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, const Vector4d& fas CUDA_HOSTDEV inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, const MatrixNd& cov_rad, const VectorNd &rad) { + if (DEBUG) { + printf("Address of p2D: %p\n", &p2D); + } + printIt(&p2D, "cov_radtocart - p2D:"); u_int n = p2D.cols(); Matrix2Nd cov_cart = MatrixXd::Zero(2 * n, 2 * n); VectorNd rad_inv = rad.cwiseInverse(); + printIt(&rad_inv, "cov_radtocart - rad_inv:"); for (u_int i = 0; i < n; ++i) { for (u_int j = i; j < n; ++j) { cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); @@ -192,7 +205,7 @@ CUDA_HOSTDEV inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, \warning correlation between different point are not computed. 
*/ -CUDA_HOSTDEV MatrixNd cov_carttorad(const Matrix2xNd& p2D, +CUDA_HOSTDEV inline MatrixNd cov_carttorad(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, const VectorNd& rad) { u_int n = p2D.cols(); @@ -227,9 +240,9 @@ CUDA_HOSTDEV MatrixNd cov_carttorad(const Matrix2xNd& p2D, */ -CUDA_HOSTDEV MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, - const Vector4d& fast_fit, - const VectorNd& rad) { +CUDA_HOSTDEV inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, + const Vector4d& fast_fit, + const VectorNd& rad) { u_int n = p2D.cols(); MatrixNd cov_rad = MatrixXd::Zero(n, n); for (u_int i = 0; i < n; ++i) { @@ -296,7 +309,7 @@ CUDA_HOSTDEV inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y \return q int 1 or -1. */ -CUDA_HOSTDEV inline int Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) { +CUDA_HOSTDEV inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) { return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > 0) @@ -314,7 +327,7 @@ CUDA_HOSTDEV inline int Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) { \param error flag for errors computation. */ -CUDA_HOSTDEV void par_uvrtopak(circle_fit& circle, const double B, const bool& error) { +CUDA_HOSTDEV inline void par_uvrtopak(circle_fit& circle, const double B, const bool& error) { Vector3d par_pak; const double temp0 = circle.par.head(2).squaredNorm(); const double temp1 = sqrt(temp0); @@ -346,7 +359,7 @@ CUDA_HOSTDEV void par_uvrtopak(circle_fit& circle, const double B, const bool& e \return x_err2 squared errors in the x axis. */ -CUDA_HOSTDEV VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, +CUDA_HOSTDEV inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, const bool& error, u_int n) { VectorNd x_err2(n); for (u_int i = 0; i < n; ++i) { @@ -382,11 +395,17 @@ CUDA_HOSTDEV VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const */ -CUDA_HOSTDEV Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { +CUDA_HOSTDEV inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { + if (DEBUG) { + printf("min_eigen3D - enter\n"); + } SelfAdjointEigenSolver solver(3); solver.computeDirect(A); int min_index; chi2 = solver.eigenvalues().minCoeff(&min_index); + if (DEBUG) { + printf("min_eigen3D - exit\n"); + } return solver.eigenvectors().col(min_index); } @@ -404,7 +423,7 @@ CUDA_HOSTDEV Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { speed up in single precision. */ -CUDA_HOSTDEV Vector3d min_eigen3D_fast(const Matrix3d& A) { +CUDA_HOSTDEV inline Vector3d min_eigen3D_fast(const Matrix3d& A) { SelfAdjointEigenSolver solver(3); solver.computeDirect(A.cast()); int min_index; @@ -425,7 +444,7 @@ CUDA_HOSTDEV Vector3d min_eigen3D_fast(const Matrix3d& A) { significantly in single precision. */ -CUDA_HOSTDEV Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { +CUDA_HOSTDEV inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { SelfAdjointEigenSolver solver(2); solver.computeDirect(A); int min_index; @@ -450,14 +469,17 @@ CUDA_HOSTDEV Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { - computation of error due to multiple scattering. 
*/ -CUDA_HOSTDEV Vector4d Fast_fit(const Matrix3xNd& hits) { +CUDA_HOSTDEV inline Vector4d Fast_fit(const Matrix3xNd& hits) { Vector4d result; u_int n = hits.cols(); // get the number of hits + printIt(&hits, "Fast_fit - hits: ", DEBUG); // CIRCLE FIT // Make segments between middle-to-first(b) and last-to-first(c) hits const Vector2d b = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); const Vector2d c = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); + printIt(&b, "Fast_fit - b: ", DEBUG); + printIt(&c, "Fast_fit - c: ", DEBUG); // Compute their lengths const double b2 = b.squaredNorm(); const double c2 = c.squaredNorm(); @@ -486,10 +508,13 @@ CUDA_HOSTDEV Vector4d Fast_fit(const Matrix3xNd& hits) { result(0) = X0 + hits(0, 0); result(1) = Y0 + hits(1, 0); result(2) = sqrt(sqr(X0) + sqr(Y0)); + printIt(&result, "Fast_fit - result: ", DEBUG); // LINE FIT const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + printIt(&e, "Fast_fit - e: ", DEBUG); + printIt(&d, "Fast_fit - d: ", DEBUG); // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) const double dr = result(2) * atan2(cross2D(d, e), d.dot(e)); // Simple difference in Z between last and first hit @@ -497,6 +522,10 @@ CUDA_HOSTDEV Vector4d Fast_fit(const Matrix3xNd& hits) { result(3) = (dr / dz); + if (DEBUG) { + printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), + result(1), result(2), result(3)); + } return result; } @@ -532,15 +561,25 @@ CUDA_HOSTDEV Vector4d Fast_fit(const Matrix3xNd& hits) { scattering. */ -CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hits_cov2D, - const Vector4d& fast_fit, VectorNd const & rad, +CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, + const Matrix2Nd & hits_cov2D, + const Vector4d & fast_fit, + const VectorNd & rad, const double B, - const bool& error = true, - const bool& scattering = false) { + const bool error = true, + const bool scattering = false) { + if (true) { + printf("circle_fit - enter\n"); + } // INITIALIZATION Matrix2Nd V = hits_cov2D; u_int n = hits2D.cols(); + printIt(&hits2D, "circle_fit - hits2D:", DEBUG); + printIt(&hits_cov2D, "circle_fit - hits_cov2D:", DEBUG); + if (DEBUG) { + printf("circle_fit - WEIGHT COMPUTATION\n"); + } // WEIGHT COMPUTATION VectorNd weight; MatrixNd G; @@ -548,15 +587,28 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi { MatrixNd cov_rad; cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad); + printIt(&cov_rad, "circle_fit - cov_rad:", DEBUG); // cov_rad = cov_carttorad(hits2D, V); if (scattering) { MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); + printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:", DEBUG); + printIt(&hits2D, "circle_fit - hits2D bis:", DEBUG); + if (DEBUG) { + printf("Address of hits2D: a) %p\n", &hits2D); + } V += cov_radtocart(hits2D, scatter_cov_rad, rad); + printIt(&V, "circle_fit - V:", DEBUG); cov_rad += scatter_cov_rad; - G = cov_rad.inverse(); - renorm = G.sum(); - G *= 1. / renorm; + printIt(&cov_rad, "circle_fit - cov_rad:", DEBUG); + Matrix4d cov_rad4 = cov_rad; + Matrix4d G4; + G4 = cov_rad4.inverse(); + printIt(&G4, "circle_fit - G4:", DEBUG); + renorm = G4.sum(); + G4 *= 1. 
/ renorm; + printIt(&G4, "circle_fit - G4:", DEBUG); + G = G4; weight = Weight_circle(G); } else { weight = cov_rad.diagonal().cwiseInverse(); @@ -564,15 +616,25 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi weight *= 1. / renorm; } } + printIt(&weight, "circle_fit - weight:", DEBUG); + if (DEBUG) { + printf("circle_fit - SPACE TRANSFORMATION\n"); + } // SPACE TRANSFORMATION // center + if (DEBUG) { + printf("Address of hits2D: b) %p\n", &hits2D); + } const Vector2d h_ = hits2D.rowwise().mean(); // centroid + printIt(&h_, "circle_fit - h_:", DEBUG); Matrix3xNd p3D(3, n); p3D.block(0, 0, 2, n) = hits2D.colwise() - h_; + printIt(&p3D, "circle_fit - p3D: a)", DEBUG); Vector2Nd mc(2 * n); // centered hits, used in error computation mc << p3D.row(0).transpose(), p3D.row(1).transpose(); + printIt(&mc, "circle_fit - mc(centered hits):", DEBUG); // scale const double q = mc.squaredNorm(); @@ -581,7 +643,11 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi // project on paraboloid p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); + printIt(&p3D, "circle_fit - p3D: b)", DEBUG); + if (DEBUG) { + printf("circle_fit - COST FUNCTION\n"); + } // COST FUNCTION // compute @@ -593,17 +659,39 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi else { for (u_int i = 0; i < n; ++i) A += weight(i) * (X.col(i) * X.col(i).transpose()); } + printIt(&A, "circle_fit - A:", DEBUG); + if (DEBUG) { + printf("circle_fit - MINIMIZE\n"); + } // minimize double chi2; Vector3d v = min_eigen3D(A, chi2); + if (DEBUG) { + printf("circle_fit - AFTER MIN_EIGEN\n"); + } + printIt(&v, "v BEFORE INVERSION", DEBUG); v *= (v(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 + printIt(&v, "v AFTER INVERSION", DEBUG); // This hack to be able to run on GPU where the automatic assignment to a // double from the vector multiplication is not working. + if (DEBUG) { + printf("circle_fit - AFTER MIN_EIGEN 1\n"); + } Matrix cm; - cm.noalias() = -v.transpose() * r0; + if (DEBUG) { + printf("circle_fit - AFTER MIN_EIGEN 2\n"); + } + cm = -v.transpose() * r0; + if (DEBUG) { + printf("circle_fit - AFTER MIN_EIGEN 3\n"); + } const double c = cm(0,0); +// const double c = -v.transpose() * r0; + if (DEBUG) { + printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); + } // COMPUTE CIRCLE PARAMETER // auxiliary quantities @@ -617,18 +705,40 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi circle.par << par_uvr_(0) * s_inv + h_(0), par_uvr_(1) * s_inv + h_(1), par_uvr_(2) * s_inv; circle.q = Charge(hits2D, circle.par); circle.chi2 = abs(chi2) * renorm * 1. / sqr(2 * v(2) * par_uvr_(2) * s); + printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:", DEBUG); + printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:", DEBUG); + if (DEBUG) { + printf("circle_fit - CIRCLE CHARGE: %ld\n", circle.q); + } + if (DEBUG) { + printf("circle_fit - ERROR PROPAGATION\n"); + } // ERROR PROPAGATION if (error) { + if (DEBUG) { + printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); + } ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points + if (DEBUG) { + printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); + } { + Matrix cm; + Matrix cm2; + cm = mc.transpose() * V * mc; +// cm2 = mc * mc.transpose(); + const double c = cm(0,0); +// const double c2 = cm2(0,0); const Matrix2Nd Vcs = sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * - (2. * V.squaredNorm() + 4. * mc.transpose() * V * mc) * + (2. * V.squaredNorm() + 4. 
* c) * // mc.transpose() * V * mc) * mc * mc.transpose(); + printIt(&Vcs, "circle_fit - Vcs:", DEBUG); Vcs_[0][0] = Vcs.block(0, 0, n, n); Vcs_[0][1] = Vcs.block(0, n, n, n); Vcs_[1][1] = Vcs.block(n, n, n, n); Vcs_[1][0] = Vcs_[0][1].transpose(); + printIt(&Vcs, "circle_fit - Vcs:", DEBUG); } MatrixNd C[3][3]; // cov matrix of 3D transformed points @@ -648,18 +758,26 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi Vcs_[1][1] * Vcs_[1][1]) + 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11); } + printIt(&C[0][0], "circle_fit - C[0][0]:", DEBUG); Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) for (u_int i = 0; i < 3; ++i) { for (u_int j = i; j < 3; ++j) { - C0(i, j) = weight.transpose() * C[i][j] * weight; + Matrix tmp; + tmp = weight.transpose() * C[i][j] * weight; + const double c = tmp(0,0); + C0(i, j) = c; //weight.transpose() * C[i][j] * weight; C0(j, i) = C0(i, j); } } + printIt(&C0, "circle_fit - C0:", DEBUG); const MatrixNd W = weight * weight.transpose(); const MatrixNd H = MatrixXd::Identity(n, n).rowwise() - weight.transpose(); const MatrixNx3d s_v = H * p3D.transpose(); + printIt(&W, "circle_fit - W:", DEBUG); + printIt(&H, "circle_fit - H:", DEBUG); + printIt(&s_v, "circle_fit - s_v:", DEBUG); MatrixNd D_[3][3]; // cov(s_v) { @@ -673,6 +791,7 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi D_[2][0] = D_[0][2].transpose(); D_[2][1] = D_[1][2].transpose(); } + printIt(&D_[0][0], "circle_fit - D_[0][0]:", DEBUG); constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; @@ -697,13 +816,21 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi t1 = D_[i][l] * s_v.col(k) + D_[i][k] * s_v.col(l); } - if (i == j) - E(a, b) = 0. + s_v.col(i).transpose() * (t0 + t1); - else - E(a, b) = 0. + (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + if (i == j) { + Matrix cm; + cm = s_v.col(i).transpose() * (t0 + t1); + const double c = cm(0,0); + E(a, b) = 0. + c; + } else { + Matrix cm; + cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + const double c = cm(0,0); + E(a, b) = 0. + c;//(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + } if (b != a) E(b, a) = E(a, b); } } + printIt(&E, "circle_fit - E:", DEBUG); Matrix J2; // Jacobian of min_eigen() (numerically computed) for (u_int a = 0; a < 6; ++a) { @@ -714,6 +841,7 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi const int sign = (J2.col(a)(2) > 0) ? 
1 : -1; J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); } + printIt(&J2, "circle_fit - J2:", DEBUG); Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) { @@ -722,9 +850,17 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi Cvc.block(0, 0, 3, 3) = t0; Cvc.block(0, 3, 3, 1) = t1; Cvc.block(3, 0, 1, 3) = t1.transpose(); - Cvc(3, 3) = - (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + Matrix cm1; +// Matrix cm2; + Matrix cm3; + cm1 = (v.transpose() * C0 * v); +// cm2 = (C0.cwiseProduct(t0)).sum(); + cm3 = (r0.transpose() * t0 * r0); + const double c = cm1(0,0) + (C0.cwiseProduct(t0)).sum() + cm3(0,0); + Cvc(3, 3) = c; + // (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); } + printIt(&Cvc, "circle_fit - Cvc:", DEBUG); Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) { @@ -732,8 +868,10 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, 0, 0, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; } + printIt(&J3, "circle_fit - J3:", DEBUG); const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) + printIt(&Jq, "circle_fit - Jq:", DEBUG); Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); @@ -741,7 +879,10 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi circle.cov = cov_uvr; } - + printIt(&circle.cov, "Circle cov:", DEBUG); + if (DEBUG) { + printf("circle_fit - exit\n"); + } return circle; } @@ -780,13 +921,19 @@ CUDA_HOSTDEV circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd& hi errors. */ -CUDA_HOSTDEV line_fit Line_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const circle_fit& circle, - const Vector4d& fast_fit, const bool& error = true) { +CUDA_HOSTDEV inline line_fit Line_fit(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const circle_fit& circle, + const Vector4d& fast_fit, + const bool error = true) { u_int n = hits.cols(); // PROJECTION ON THE CILINDER Matrix2xNd p2D(2, n); MatrixNx5d Jx(n, 5); + printIt(&hits, "Line_fit points: ", DEBUG); + printIt(&hits_cov, "Line_fit covs: ", DEBUG); + // x & associated Jacobian // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf // Slide 11 @@ -826,6 +973,13 @@ CUDA_HOSTDEV line_fit Line_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); const VectorNd weight = err2_inv * 1. / err2_inv.sum(); + + printIt(&x_err2, "Line_fit - x_err2: ", DEBUG); + printIt(&y_err2, "Line_fit - y_err2: ", DEBUG); + printIt(&err2_inv, "Line_fit - err2_inv: ", DEBUG); + printIt(&weight, "Line_fit - weight: ", DEBUG); + + // COST FUNCTION // compute @@ -838,14 +992,19 @@ CUDA_HOSTDEV line_fit Line_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov for (u_int i = 0; i < n; ++i) { A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); } + + printIt(&A, "Line_fit - A: ", DEBUG); + // minimize double chi2; Vector2d v = min_eigen2D(A, chi2); + printIt(&v, "Line_fit - v: ", DEBUG); + // n *= (chi2>0) ? 1 : -1; //TO FIX // This hack to be able to run on GPU where the automatic assignment to a // double from the vector multiplication is not working. 
Matrix cm; - cm.noalias() = -v.transpose() * r0; + cm = -v.transpose() * r0; const double c = cm(0,0); // COMPUTE LINE PARAMETER @@ -853,6 +1012,7 @@ CUDA_HOSTDEV line_fit Line_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov line.par << -v(0) / v(1), // cotan(theta)) -c * sqrt(sqr(v(0)) + sqr(v(1))) * 1. / v(1); // Zip line.chi2 = abs(chi2); + printIt(&(line.par), "Line_fit - line.par: ", DEBUG); // ERROR PROPAGATION if (error) { @@ -891,9 +1051,10 @@ CUDA_HOSTDEV line_fit Line_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov J << -t0, v(0) * t1, 0, -c * v(0) * t0 * t2, v0_2 * c * t1 * t2, -sqrt_ * t0; } Matrix JT = J.transpose().eval(); - line.cov.noalias() = J * C * JT; + line.cov = J * C * JT; } + printIt(&line.cov, "Line cov:", DEBUG); return line; } @@ -936,7 +1097,7 @@ CUDA_HOSTDEV line_fit Line_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov \bug see Circle_fit(), Line_fit() and Fast_fit() bugs. */ -helix_fit Helix_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const double B, +inline helix_fit Helix_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const double B, const bool& error = true, const bool& scattering = false) { u_int n = hits.cols(); VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc index bd390f5f65352..eadfda8cb6a26 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc @@ -22,30 +22,39 @@ using namespace pixeltrackfitting; using edm::ParameterSet; -PixelTrackProducer::PixelTrackProducer(const ParameterSet& cfg) : theReconstruction(cfg, consumesCollector()) { - edm::LogInfo("PixelTrackProducer") << " construction..."; +PixelTrackProducer::PixelTrackProducer(const ParameterSet& cfg) + : runOnGPU_(cfg.getParameter("runOnGPU")), + theReconstruction(cfg, consumesCollector()), + theGPUReconstruction(cfg, consumesCollector()) +{ + edm::LogInfo("PixelTrackProducer")<<" construction..."; produces(); produces(); produces(); } -PixelTrackProducer::~PixelTrackProducer() {} +PixelTrackProducer::~PixelTrackProducer() { } void PixelTrackProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("passLabel", "pixelTracks"); // What is this? It is not used anywhere in this code. + desc.add("passLabel", "pixelTracks"); // What is this? It is not used anywhere in this code. 
+ desc.add("runOnGPU", false); PixelTrackReconstruction::fillDescriptions(desc); descriptions.add("pixelTracks", desc); } -void PixelTrackProducer::produce(edm::Event& ev, const edm::EventSetup& es) { - LogDebug("PixelTrackProducer, produce") << "event# :" << ev.id(); +void PixelTrackProducer::produce(edm::Event& ev, const edm::EventSetup& es) +{ + LogDebug("PixelTrackProducer, produce")<<"event# :"< httopo; es.get().get(httopo); @@ -53,54 +62,59 @@ void PixelTrackProducer::produce(edm::Event& ev, const edm::EventSetup& es) { store(ev, tracks, *httopo); } -void PixelTrackProducer::store(edm::Event& ev, const TracksWithTTRHs& tracksWithHits, const TrackerTopology& ttopo) { +void PixelTrackProducer::store(edm::Event& ev, const TracksWithTTRHs& tracksWithHits, const TrackerTopology& ttopo) +{ auto tracks = std::make_unique(); auto recHits = std::make_unique(); auto trackExtras = std::make_unique(); int cc = 0, nTracks = tracksWithHits.size(); - for (int i = 0; i < nTracks; i++) { - reco::Track* track = tracksWithHits.at(i).first; + for (int i = 0; i < nTracks; i++) + { + reco::Track* track = tracksWithHits.at(i).first; const SeedingHitSet& hits = tracksWithHits.at(i).second; - for (unsigned int k = 0; k < hits.size(); k++) { - TrackingRecHit* hit = hits[k]->hit()->clone(); + for (unsigned int k = 0; k < hits.size(); k++) + { + TrackingRecHit *hit = hits[k]->hit()->clone(); track->appendHitPattern(*hit, ttopo); recHits->push_back(hit); } tracks->push_back(*track); delete track; + } - LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event" - << "\n"; - edm::OrphanHandle ohRH = ev.put(std::move(recHits)); + LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event" << "\n"; + edm::OrphanHandle ohRH = ev.put(std::move(recHits)); edm::RefProd hitCollProd(ohRH); - for (int k = 0; k < nTracks; k++) { + for (int k = 0; k < nTracks; k++) + { reco::TrackExtra theTrackExtra{}; //fill the TrackExtra with TrackingRecHitRef unsigned int nHits = tracks->at(k).numberOfValidHits(); theTrackExtra.setHits(hitCollProd, cc, nHits); - cc += nHits; - AlgebraicVector5 v = AlgebraicVector5(0, 0, 0, 0, 0); - reco::TrackExtra::TrajParams trajParams(nHits, LocalTrajectoryParameters(v, 1.)); - reco::TrackExtra::Chi2sFive chi2s(nHits, 0); - theTrackExtra.setTrajParams(std::move(trajParams), std::move(chi2s)); + cc +=nHits; + AlgebraicVector5 v = AlgebraicVector5(0,0,0,0,0); + reco::TrackExtra::TrajParams trajParams(nHits,LocalTrajectoryParameters(v,1.)); + reco::TrackExtra::Chi2sFive chi2s(nHits,0); + theTrackExtra.setTrajParams(std::move(trajParams),std::move(chi2s)); trackExtras->push_back(theTrackExtra); } - LogDebug("TrackProducer") << "put the collection of TrackExtra in the event" - << "\n"; + LogDebug("TrackProducer") << "put the collection of TrackExtra in the event" << "\n"; edm::OrphanHandle ohTE = ev.put(std::move(trackExtras)); - for (int k = 0; k < nTracks; k++) { - const reco::TrackExtraRef theTrackExtraRef(ohTE, k); + for (int k = 0; k < nTracks; k++) + { + const reco::TrackExtraRef theTrackExtraRef(ohTE,k); (tracks->at(k)).setExtra(theTrackExtraRef); } ev.put(std::move(tracks)); + } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h index d756a9cf963f5..0803131715af6 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h @@ -4,16 +4,13 @@ #include 
"FWCore/Framework/interface/stream/EDProducer.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackReconstruction.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackReconstructionGPU.h" -namespace edm { - class Event; - class EventSetup; - class ParameterSet; - class ConfigurationDescriptions; -} // namespace edm +namespace edm { class Event; class EventSetup; class ParameterSet; class ConfigurationDescriptions; } class TrackerTopology; -class PixelTrackProducer : public edm::stream::EDProducer<> { +class PixelTrackProducer : public edm::stream::EDProducer<> { + public: explicit PixelTrackProducer(const edm::ParameterSet& conf); @@ -25,6 +22,8 @@ class PixelTrackProducer : public edm::stream::EDProducer<> { private: void store(edm::Event& ev, const pixeltrackfitting::TracksWithTTRHs& selectedTracks, const TrackerTopology& ttopo); + bool runOnGPU_; PixelTrackReconstruction theReconstruction; + PixelTrackReconstructionGPU theGPUReconstruction; }; #endif diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index 44bfb888df98c..e1ccc16bf6430 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -80,7 +80,9 @@ # Use Riemann fit and substitute previous Fitter producer with the Riemann one from Configuration.ProcessModifiers.riemannFit_cff import riemannFit +from Configuration.ProcessModifiers.riemannFitGPU_cff import riemannFitGPU riemannFit.toModify(pixelTracks, Fitter = "pixelFitterByRiemannParaboloid") +riemannFitGPU.toModify(pixelTracks, runOnGPU = True) _pixelTracksSequence_riemannFit = pixelTracksSequence.copy() _pixelTracksSequence_riemannFit.replace(pixelFitterByHelixProjections, pixelFitterByRiemannParaboloid) riemannFit.toReplaceWith(pixelTracksSequence, _pixelTracksSequence_riemannFit) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index 8c61394fec6ec..d6beb57b862b8 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -11,7 +11,6 @@ - diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index 814e278690957..6cf8a3664f903 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -2,23 +2,51 @@ #include #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include #include using namespace Eigen; -__global__ void kernelFullFit(Rfit::Matrix3xNd * hits, Rfit::Matrix3Nd * hits_cov, - double B, Rfit::circle_fit * circle_fit_resultsGPU, Rfit::line_fit * line_fit_resultsGPU) { +__global__ void kernelFullFit(Rfit::Matrix3xNd * hits, + Rfit::Matrix3Nd * hits_cov, + double B, + bool errors, + bool scattering, + Rfit::circle_fit * circle_fit_resultsGPU, + Rfit::line_fit * line_fit_resultsGPU) { Vector4d fast_fit = Rfit::Fast_fit(*hits); u_int n = hits->cols(); Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); + Rfit::Matrix2xNd hits2D_local = (hits->block(0,0,2,n)).eval(); + Rfit::Matrix2Nd hits_cov2D_local = (hits_cov->block(0, 0, 2 * n, 2 * n)).eval(); + Rfit::printIt(&hits2D_local, "kernelFullFit 
- hits2D_local: ", false); + Rfit::printIt(&hits_cov2D_local, "kernelFullFit - hits_cov2D_local: ", false); + /* + printf("kernelFullFit - hits address: %p\n", hits); + printf("kernelFullFit - hits_cov address: %p\n", hits_cov); + printf("kernelFullFit - hits_cov2D address: %p\n", &hits2D_local); + printf("kernelFullFit - hits_cov2D_local address: %p\n", &hits_cov2D_local); + */ + /* At some point I gave up and locally construct block on the stack, so that + the next invocation to Rfit::Circle_fit works properly. Failing to do so + implied basically an empty collection of hits and covariances. That could + have been partially fixed if values of the passed in matrices would have + been printed on screen since that, maybe, triggered internally the real + creations of the blocks. To be understood and compared against the myriad + of compilation warnings we have. + */ (*circle_fit_resultsGPU) = Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), - fast_fit, rad, B, false, false); - - (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, true); + fast_fit, rad, B, errors, scattering); + /* + (*circle_fit_resultsGPU) = + Rfit::Circle_fit(hits2D_local, hits_cov2D_local, + fast_fit, rad, B, errors, scattering); + */ + (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, errors); return; } @@ -155,7 +183,7 @@ void testFit() { assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); } -void testFitOneGo() { +void testFitOneGo(bool errors, bool scattering, double epsilon=1e-6) { constexpr double B = 0.0113921; Rfit::Matrix3xNd hits(3,4); Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); @@ -170,40 +198,65 @@ void testFitOneGo() { Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), hits_cov.block(0, 0, 2 * n, 2 * n), - fast_fit_results, rad, B, false, false); + fast_fit_results, rad, B, errors, scattering); // LINE_FIT CPU - Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, fast_fit_results, true); + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, + fast_fit_results, errors); // FIT GPU - Rfit::Matrix3xNd * hitsGPU = new Rfit::Matrix3xNd(3,4); + std::cout << "GPU FIT" << std::endl; + Rfit::Matrix3xNd * hitsGPU = nullptr; // new Rfit::Matrix3xNd(3,4); Rfit::Matrix3Nd * hits_covGPU = nullptr; Rfit::line_fit * line_fit_resultsGPU = nullptr; Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); - Rfit::circle_fit * circle_fit_resultsGPU = new Rfit::circle_fit(); + Rfit::circle_fit * circle_fit_resultsGPU = nullptr; // new Rfit::circle_fit(); Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); - cudaMalloc((void**)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4))); - cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12))); - cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit)); - cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit)); - cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice); - cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4)))); + cudaCheck(cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12)))); + cudaCheck(cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit))); + cudaCheck(cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit))); + 
cudaCheck(cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice)); - kernelFullFit<<<1, 1>>>(hitsGPU, hits_covGPU, B, circle_fit_resultsGPU, line_fit_resultsGPU); - cudaDeviceSynchronize(); + kernelFullFit<<<1, 1>>>(hitsGPU, hits_covGPU, B, errors, scattering, + circle_fit_resultsGPU, line_fit_resultsGPU); + cudaCheck(cudaDeviceSynchronize()); - cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, - sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); - cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost)); + std::cout << "Fitted values (CircleFit) CPU:\n" << circle_fit_results.par << std::endl; + std::cout << "Fitted values (LineFit): CPU\n" << line_fit_results.par << std::endl; std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; std::cout << "Fitted values (LineFit): GPU\n" << line_fit_resultsGPUret->par << std::endl; - assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); - assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); + assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par, epsilon)); + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, epsilon)); + + cudaCheck(cudaFree(hitsGPU)); + cudaCheck(cudaFree(hits_covGPU)); + cudaCheck(cudaFree(line_fit_resultsGPU)); + cudaCheck(cudaFree(circle_fit_resultsGPU)); + delete line_fit_resultsGPUret; + delete circle_fit_resultsGPUret; + + cudaDeviceReset(); } int main (int argc, char * argv[]) { - testFit(); - testFitOneGo(); +// testFit(); + std::cout << "TEST FIT, NO ERRORS, NO SCATTERING" << std::endl; + testFitOneGo(false, false); + + // The default 1e-6 is failing.... 
+ std::cout << "TEST FIT, ERRORS, NO SCATTER" << std::endl; + testFitOneGo(true, false, 1e-5); + + std::cout << "TEST FIT, NO ERRORS, SCATTER" << std::endl; + testFitOneGo(false, true); + + std::cout << "TEST FIT, ERRORS AND SCATTER" << std::endl; + testFitOneGo(true, true, 1e-5); + return 0; } diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index 210b10cd14ed1..475762f546807 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -23,10 +23,15 @@ __global__ void kernel(Matrix3d * m, Eigen::SelfAdjointEigenSolver::Re eigenValues(m, ret); } -__global__ void kernelInverse(Matrix3d * in, Matrix3d * out) { -// (*out) = in->inverse(); +__global__ void kernelInverse3x3(Matrix3d * in, Matrix3d * out) { + (*out) = in->inverse(); } +__global__ void kernelInverse4x4(Matrix4d * in, Matrix4d * out) { + (*out) = in->inverse(); +} + + template __global__ void kernelMultiply(M1 * J, M2 * C, @@ -82,9 +87,10 @@ void testMultiply() { assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret))); } -void testInverse() { - std::cout << "TEST INVERSE" << std::endl; +void testInverse3x3() { + std::cout << "TEST INVERSE 3x3" << std::endl; Matrix3d m = Matrix3d::Random(); + Matrix3d m_inv = m.inverse(); Matrix3d *mGPU = nullptr; Matrix3d *mGPUret = nullptr; Matrix3d *mCPUret = new Matrix3d(); @@ -97,12 +103,38 @@ void testInverse() { cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)); cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); - kernelInverse<<<1,1>>>(mGPU, mGPUret); + kernelInverse3x3<<<1,1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost); if (!NODEBUG) std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; + assert(isEqualFuzzy(m_inv, *mCPUret)); +} + +void testInverse4x4() { + std::cout << "TEST INVERSE 4x4" << std::endl; + Matrix4d m = Matrix4d::Random(); + Matrix4d m_inv = m.inverse(); + Matrix4d *mGPU = nullptr; + Matrix4d *mGPUret = nullptr; + Matrix4d *mCPUret = new Matrix4d(); + + if (!NODEBUG) { + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; + } + cudaMalloc((void **)&mGPU, sizeof(Matrix4d)); + cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)); + cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice); + + kernelInverse4x4<<<1,1>>>(mGPU, mGPUret); + cudaDeviceSynchronize(); + + cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost); + if (!NODEBUG) + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; + assert(isEqualFuzzy(m_inv, *mCPUret)); } void testEigenvalues() { @@ -142,7 +174,8 @@ void testEigenvalues() { int main (int argc, char * argv[]) { testEigenvalues(); - testInverse(); + testInverse3x3(); + testInverse4x4(); testMultiply<1, 2, 2, 1>(); testMultiply<1, 2, 2, 2>(); testMultiply<1, 2, 2, 3>(); diff --git a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h index 0290ee2db641b..78ae4053b8429 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h +++ b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h @@ -20,8 +20,7 @@ __host__ __device__ void printIt(C * m) { } template -bool isEqualFuzzy(C a, C b) { - constexpr double epsilon = 1e-6; +bool isEqualFuzzy(C a, C b, double epsilon = 
1e-6) { for (unsigned int i = 0; i < a.rows(); ++i) { for (unsigned int j = 0; j < a.cols(); ++j) { assert(std::abs(a(i,j)-b(i,j)) @@ -31,8 +30,7 @@ bool isEqualFuzzy(C a, C b) { return true; } -bool isEqualFuzzy(double a, double b) { - constexpr double epsilon = 1e-6; +bool isEqualFuzzy(double a, double b, double epsilon=1e-6) { return std::abs(a-b) < std::min(std::abs(a), std::abs(b))*epsilon; } From a2e681dc0bed35a1729c32839590a7a4884271a6 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 4 Jul 2018 23:47:58 +0200 Subject: [PATCH 007/102] Synchronise with CMSSW_10_2_0_pre6 --- .../interface/CAHitQuadrupletGenerator.h | 194 +++++++++--------- 1 file changed, 96 insertions(+), 98 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h index 9d149533eefbc..721a5dfaef07e 100644 --- a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h +++ b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h @@ -16,120 +16,118 @@ #include "RecoTracker/TkHitPairs/interface/IntermediateHitDoublets.h" #include "RecoPixelVertexing/PixelTriplets/interface/OrderedHitSeeds.h" -#include "RecoPixelVertexing/PixelTriplets/src/CACut.h" class TrackingRegion; class SeedingLayerSetsHits; namespace edm { - class Event; - class EventSetup; - class ParameterSetDescription; -} // namespace edm + class Event; + class EventSetup; + class ParameterSetDescription; +} class CAHitQuadrupletGenerator { public: - typedef LayerHitMapCache LayerCacheType; + typedef LayerHitMapCache LayerCacheType; - static constexpr unsigned int minLayers = 4; - typedef OrderedHitSeeds ResultType; + static constexpr unsigned int minLayers = 4; + typedef OrderedHitSeeds ResultType; public: - CAHitQuadrupletGenerator(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC) - : CAHitQuadrupletGenerator(cfg, iC) {} - CAHitQuadrupletGenerator(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC); - ~CAHitQuadrupletGenerator() = default; + CAHitQuadrupletGenerator(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC): CAHitQuadrupletGenerator(cfg, iC) {} + CAHitQuadrupletGenerator(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC); - static void fillDescriptions(edm::ParameterSetDescription& desc); - static const char* fillDescriptionsLabel() { return "caHitQuadruplet"; } + ~CAHitQuadrupletGenerator() = default; - void initEvent(const edm::Event& ev, const edm::EventSetup& es); + static void fillDescriptions(edm::ParameterSetDescription& desc); + static const char *fillDescriptionsLabel() { return "caHitQuadrupletDefault"; } - void hitNtuplets(const IntermediateHitDoublets& regionDoublets, - std::vector& result, - const edm::EventSetup& es, - const SeedingLayerSetsHits& layers); + void initEvent(const edm::Event& ev, const edm::EventSetup& es); + + void hitNtuplets(const IntermediateHitDoublets& regionDoublets, + std::vector& result, + const edm::EventSetup& es, + const SeedingLayerSetsHits& layers); private: - LayerCacheType theLayerCache; - - std::unique_ptr theComparitor; - - class QuantityDependsPtEval { - public: - QuantityDependsPtEval(float v1, float v2, float c1, float c2) - : value1_(v1), value2_(v2), curvature1_(c1), curvature2_(c2) {} - - float value(float curvature) const { - if (value1_ == value2_) // not enabled - return value1_; - - if (curvature1_ < curvature) - return value1_; - if (curvature2_ < curvature && curvature <= curvature1_) - return value2_ + (curvature - 
curvature2_) / (curvature1_ - curvature2_) * (value1_ - value2_); - return value2_; - } - - private: - const float value1_; - const float value2_; - const float curvature1_; - const float curvature2_; - }; - - // Linear interpolation (in curvature) between value1 at pt1 and - // value2 at pt2. If disabled, value2 is given (the point is to - // allow larger/smaller values of the quantity at low pt, so it - // makes more sense to have the high-pt value as the default). - - class QuantityDependsPt { - public: - explicit QuantityDependsPt(const edm::ParameterSet& pset) - : value1_(pset.getParameter("value1")), - value2_(pset.getParameter("value2")), - pt1_(pset.getParameter("pt1")), - pt2_(pset.getParameter("pt2")), - enabled_(pset.getParameter("enabled")) { - if (enabled_ && pt1_ >= pt2_) - throw cms::Exception("Configuration") << "PixelQuadrupletGenerator::QuantityDependsPt: pt1 (" << pt1_ - << ") needs to be smaller than pt2 (" << pt2_ << ")"; - if (pt1_ <= 0) - throw cms::Exception("Configuration") - << "PixelQuadrupletGenerator::QuantityDependsPt: pt1 needs to be > 0; is " << pt1_; - if (pt2_ <= 0) - throw cms::Exception("Configuration") - << "PixelQuadrupletGenerator::QuantityDependsPt: pt2 needs to be > 0; is " << pt2_; - } - - QuantityDependsPtEval evaluator(const edm::EventSetup& es) const { - if (enabled_) { - return QuantityDependsPtEval(value1_, - value2_, - PixelRecoUtilities::curvature(1.f / pt1_, es), - PixelRecoUtilities::curvature(1.f / pt2_, es)); - } - return QuantityDependsPtEval(value2_, value2_, 0.f, 0.f); - } - - private: - const float value1_; - const float value2_; - const float pt1_; - const float pt2_; - const bool enabled_; - }; - - const float extraHitRPhitolerance; - - const QuantityDependsPt maxChi2; - const bool fitFastCircle; - const bool fitFastCircleChi2Cut; - const bool useBendingCorrection; - - CACut caThetaCut; - CACut caPhiCut; - const float caHardPtCut = 0.f; + LayerCacheType theLayerCache; + + std::unique_ptr theComparitor; + + class QuantityDependsPtEval { + public: + + QuantityDependsPtEval(float v1, float v2, float c1, float c2) : + value1_(v1), value2_(v2), curvature1_(c1), curvature2_(c2) { + } + + float value(float curvature) const { + if (value1_ == value2_) // not enabled + return value1_; + + if (curvature1_ < curvature) + return value1_; + if (curvature2_ < curvature && curvature <= curvature1_) + return value2_ + (curvature - curvature2_) / (curvature1_ - curvature2_) * (value1_ - value2_); + return value2_; + } + + private: + const float value1_; + const float value2_; + const float curvature1_; + const float curvature2_; + }; + + // Linear interpolation (in curvature) between value1 at pt1 and + // value2 at pt2. If disabled, value2 is given (the point is to + // allow larger/smaller values of the quantity at low pt, so it + // makes more sense to have the high-pt value as the default). 
+ + class QuantityDependsPt { + public: + + explicit QuantityDependsPt(const edm::ParameterSet& pset) : + value1_(pset.getParameter("value1")), + value2_(pset.getParameter("value2")), + pt1_(pset.getParameter("pt1")), + pt2_(pset.getParameter("pt2")), + enabled_(pset.getParameter("enabled")) { + if (enabled_ && pt1_ >= pt2_) + throw cms::Exception("Configuration") << "PixelQuadrupletGenerator::QuantityDependsPt: pt1 (" << pt1_ << ") needs to be smaller than pt2 (" << pt2_ << ")"; + if (pt1_ <= 0) + throw cms::Exception("Configuration") << "PixelQuadrupletGenerator::QuantityDependsPt: pt1 needs to be > 0; is " << pt1_; + if (pt2_ <= 0) + throw cms::Exception("Configuration") << "PixelQuadrupletGenerator::QuantityDependsPt: pt2 needs to be > 0; is " << pt2_; + } + + QuantityDependsPtEval evaluator(const edm::EventSetup& es) const { + if (enabled_) { + return QuantityDependsPtEval(value1_, value2_, + PixelRecoUtilities::curvature(1.f / pt1_, es), + PixelRecoUtilities::curvature(1.f / pt2_, es)); + } + return QuantityDependsPtEval(value2_, value2_, 0.f, 0.f); + } + + private: + const float value1_; + const float value2_; + const float pt1_; + const float pt2_; + const bool enabled_; + }; + + const float extraHitRPhitolerance; + + const QuantityDependsPt maxChi2; + const bool fitFastCircle; + const bool fitFastCircleChi2Cut; + const bool useBendingCorrection; + + const float caThetaCut = 0.00125f; + const float caPhiCut = 0.1f; + const float caHardPtCut = 0.f; }; #endif From bb60075c6cced901b49e28d3d0ed0679500851b0 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 30 Jul 2018 13:23:42 +0200 Subject: [PATCH 008/102] Customize function to provide a minimal configuration for profiling (cms-patatrack#106) Can be included with the following snippet in the configuration: from RecoPixelVertexing.Configuration.customizePixelTracksForProfiling import customizePixelTracksForProfiling process = customizePixelTracksForProfiling(process) Removes validation, DQM, and output modules. As suggested in #70 (comment), an `AsciiOutputModule` is used to require the `pixelTracks`. --- .../Configuration/python/customizePixelTracksForProfiling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 4713b64e5e48a..dc6524babec9d 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -7,7 +7,7 @@ def customizePixelTracksForProfiling(process): ), verbosity = cms.untracked.uint32(0), ) - + process.outPath = cms.EndPath(process.out) process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.outPath) From a7c22e4a3650ebe855594c3f98ba4d0c4696aee3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 31 Jul 2018 12:03:15 +0200 Subject: [PATCH 009/102] Heterogeneous ClusterTPAssociation (cms-patatrack#105) Implement a heterogeneous Cluster-to-TrackingParticle associator running on the GPU. 
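Most of the new code below is the `gpuPixelDoublets.h` header, which pairs hits whose packed phi values fall within `phiCut` of each other. As a side note (not part of this patch), here is a minimal host-side sketch of why the `int16_t` packing used there is convenient: the signed 16-bit difference of two packed values wraps exactly at the +-pi discontinuity, so the window test needs no special case at the boundary. `packPhi` below is a simplified stand-in for the `phi2short` utility used in the header; its exact scale factor is an assumption of this sketch.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

constexpr float kPi = 3.14159265358979f;

// pack phi in (-pi, pi] onto the int16_t range; simplified stand-in for phi2short
inline int16_t packPhi(float phi) {
  return static_cast<int16_t>(std::lround(phi / kPi * 32767.f));
}

// signed angular distance of two packed values: the conversion back to int16_t
// wraps modulo 2^16, which is exactly the wrap-around at the +-pi discontinuity
inline int16_t packedDeltaPhi(int16_t a, int16_t b) {
  return static_cast<int16_t>(a - b);
}

int main() {
  // two hits on opposite sides of the +-pi discontinuity, about 0.08 rad apart
  int16_t inner = packPhi(3.10f);
  int16_t outer = packPhi(-3.10f);
  int16_t cut   = packPhi(0.10f);  // a window of ~0.1 rad, like phiCut

  int16_t dphi = packedDeltaPhi(outer, inner);
  std::printf("packed dphi = %d (%.3f rad), cut = %d, inside window: %s\n",
              dphi, dphi * kPi / 32767.f, cut,
              std::abs(static_cast<int>(dphi)) < static_cast<int>(cut) ? "yes" : "no");
  return 0;
}

The `findPhiLimits` and `doubletsFromHisto` functions in the header rely on the same arithmetic, applied to hits that have been sorted (or binned into a histogram) by their packed phi.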
--- .../PixelTriplets/plugins/gpuPixelDoublets.h | 161 ++++++++++++++++++ .../RecoTrack/python/TrackValidation_cff.py | 18 +- 2 files changed, 175 insertions(+), 4 deletions(-) create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h new file mode 100644 index 0000000000000..f09ae6aba5efb --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -0,0 +1,161 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h +#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h + +#include +#include +#include +#include +#include +#include + +#include "DataFormats/Math/interface/approx_atan2.h" +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" + +namespace gpuPixelDoublets { + + __device__ + std::pair + findPhiLimits(int16_t phiMe, int16_t * iphi, uint16_t * index, uint16_t size, int16_t iphicut) { + + assert(iphicut>0); + + // find extreemes in top + int16_t minPhi = phiMe-iphicut; + int16_t maxPhi = phiMe+iphicut; + + // std::cout << "\n phi min/max " << phiMe << ' ' << minPhi << ' ' << maxPhi << std::endl; + + // guess and adjust + auto findLimit = [&](int16_t mPhi) { + int jm = float(0.5f*size)*(1.f+float(mPhi)/float(std::numeric_limits::max())); + // std::cout << "jm for " << mPhi << ' ' << jm << std::endl; + jm = std::min(size-1,std::max(0,jm)); + bool notDone=true; + while(jm>0 && mPhiiphi[index[++jm]]){} + jm = std::min(size-1,std::max(0,jm)); + return jm; + }; + + auto jmin = findLimit(minPhi); + auto jmax = findLimit(maxPhi); + + + /* + std::cout << "j min/max " << jmin << ' ' << jmax << std::endl; + std::cout << "found min/max " << iphi[index[jmin]] << ' ' << iphi[index[jmax]] << std::endl; + std::cout << "found min/max +1 " << iphi[index[jmin+1]] << ' ' << iphi[index[jmax+1]] << std::endl; + std::cout << "found min/max -1 " << iphi[index[jmin-1]] << ' ' << iphi[index[jmax-1]] << std::endl; + */ + + return std::make_pair(jmin,jmax); + } + + + __global__ + void getDoubletsFromSorted(int16_t * iphi, uint16_t * index, uint32_t * offsets, float phiCut) { + auto iphicut = phi2short(phiCut); + auto i = blockIdx.x*blockDim.x + threadIdx.x; + if (i>=offsets[9]) { + // get rid of last layer + return; + } + + assert(0==offsets[0]); + int top = (i>offsets[5]) ? 5: 0; + while (i>=offsets[++top]){}; + assert(top<10); + auto bottom = top-1; + if (bottom == 3 or bottom == 6) { + // do not have UP... (9 we got rid already) + return; + } + assert(i >= offsets[bottom]); + assert(i < offsets[top]); + + if (index[i]>= (offsets[top]-offsets[bottom])) { + printf("index problem: %d %d %d %d %d\n",i, offsets[top], offsets[bottom], offsets[top]-offsets[bottom], index[i]); + return; + } + + assert(index[i]::max()); + + auto jLimits = findPhiLimits(phiMe, iphi+offsets[top],index+offsets[top],size,iphicut); + + auto slidingWindow = [&](uint16_t mysize, uint16_t mymin,uint16_t mymax) { + auto topPhi = iphi+offsets[top]; + uint16_t imax = std::numeric_limits::max(); + uint16_t offset = (mymin>mymax) ? imax-(mysize-1) : 0; + int n=0; + for (uint16_t i = mymin+offset; i!=mymax; i++) { + assert(i<=imax); + uint16_t k = (i>mymax) ? 
i-offset : i; + assert(k=mymin || k2*iphicut && int16_t(phiMe-topPhi[k])>2*iphicut) + printf("deltaPhi problem: %d %d %d %d, deltas %d:%d cut %d\n",i,k,phiMe,topPhi[k],int16_t(topPhi[k]-phiMe),int16_t(phiMe-topPhi[k]),iphicut); + n++; + } + int tot = (mymin>mymax) ? (mysize-mymin)+mymax : mymax-mymin; + assert(n==tot); + }; + + slidingWindow(size,jLimits.first,jLimits.second); + } + + template + __device__ + void doubletsFromHisto(int16_t const * iphi, Hist const * hist, uint32_t const * offsets, float phiCut) { + auto iphicut = phi2short(phiCut); + auto i = blockIdx.x*blockDim.x + threadIdx.x; + if (i>=offsets[9]) { + // get rid of last layer + return; + } + + assert(0==offsets[0]); + int top = (i>offsets[5]) ? 5: 0; + while (i>=offsets[++top]){}; + assert(top<10); + auto bottom = top-1; + if (bottom==3 || bottom==6) { + // do not have UP... (9 we got rid already) + return; + } + assert(i>=offsets[bottom]); + assert(i iphicut) + continue; + ++tot; + } + } + if (0==hist[top].nspills) assert(tot>=nmin); + // look in spill bin as well.... + } + + __global__ + void getDoubletsFromHisto(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, float phiCut) { + auto const & hh = *hhp; + doubletsFromHisto(hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,phiCut); + } + +} // namespace end + +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index 550aa57bf2669..dff571862bb46 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -250,9 +250,9 @@ def _getMVASelectors(postfix): for _eraName, _postfix, _era in _relevantEras: locals()["_selectorsByAlgoAndHp"+_postfix] = locals()["_selectorsByAlgo"+_postfix] + locals()["_selectorsByAlgoHp"+_postfix] # For ByAlgoMask - locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix] = filter(lambda n: n not in ["generalTracks", "cutsRecoTracksHp"], locals()["_selectorsByAlgoAndHp"+_postfix]) + locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix] = [n for n in locals()["_selectorsByAlgoAndHp"+_postfix] if n not in ["generalTracks", "cutsRecoTracksHp"]] # For ByOriginalAlgo - locals()["_selectorsByAlgoAndHpNoGenTkDupMerge"+_postfix] = filter(lambda n: n not in ["cutsRecoTracksDuplicateMerge", "cutsRecoTracksDuplicateMergeHp"], locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix]) + locals()["_selectorsByAlgoAndHpNoGenTkDupMerge"+_postfix] = [n for n in locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix] if n not in ["cutsRecoTracksDuplicateMerge", "cutsRecoTracksDuplicateMergeHp"]] _sequenceForEachEra(_addSelectorsByOriginalAlgoMask, modDict = globals(), args = ["_selectorsByAlgoAndHpNoGenTkDupMerge"], plainArgs = ["ByOriginalAlgo", "originalAlgorithm"], names = "_selectorsByOriginalAlgo", sequence = "_tracksValidationSelectorsByOriginalAlgo") @@ -709,9 +709,13 @@ def _uniqueFirstLayers(layerList): ### Pixel tracking only mode (placeholder for now) -tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone( - pixelClusterSrc = "siPixelClustersPreSplitting" + +tpClusterProducerHeterogeneousPixelTrackingOnly = tpClusterProducerHeterogeneous.clone( + pixelClusterSrc = "siPixelClustersPreSplitting" ) +tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone() +# Need to use the modifier to customize because the exact EDProducer type depends on the modifier +gpu.toModify(tpClusterProducerPixelTrackingOnly, src = "tpClusterProducerHeterogeneousPixelTrackingOnly") 
quickTrackAssociatorByHitsPixelTrackingOnly = quickTrackAssociatorByHits.clone( cluster2TPSrc = "tpClusterProducerPixelTrackingOnly" @@ -740,12 +744,18 @@ def _uniqueFirstLayers(layerList): tracksValidationTruthPixelTrackingOnly.replace(quickTrackAssociatorByHits, quickTrackAssociatorByHitsPixelTrackingOnly) tracksValidationTruthPixelTrackingOnly.replace(trackingParticleRecoTrackAsssociation, trackingParticlePixelTrackAsssociation) tracksValidationTruthPixelTrackingOnly.replace(VertexAssociatorByPositionAndTracks, PixelVertexAssociatorByPositionAndTracks) + +_tracksValidationTruthPixelTrackingOnlyGPU = tracksValidationTruthPixelTrackingOnly.copy() +_tracksValidationTruthPixelTrackingOnlyGPU.insert(0, tpClusterProducerHeterogeneousPixelTrackingOnly) +gpu.toReplaceWith(tracksValidationTruthPixelTrackingOnly, _tracksValidationTruthPixelTrackingOnlyGPU) + tracksValidationPixelTrackingOnly = cms.Sequence( tracksValidationTruthPixelTrackingOnly + trackValidatorPixelTrackingOnly ) + ### Lite mode (only generalTracks and HP) trackValidatorLite = trackValidator.clone( label = ["generalTracks", "cutsRecoTracksHp"] From 1a435069623f6b2178bbadebe55f024309c33ea2 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 1 Aug 2018 12:08:58 +0200 Subject: [PATCH 010/102] Fix Cluster-to-TrackingParticle matching for pixel tracking CPU workflow (cms-patatrack#111) --- Validation/RecoTrack/python/TrackValidation_cff.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index dff571862bb46..5a806298bb7b3 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -713,9 +713,13 @@ def _uniqueFirstLayers(layerList): tpClusterProducerHeterogeneousPixelTrackingOnly = tpClusterProducerHeterogeneous.clone( pixelClusterSrc = "siPixelClustersPreSplitting" ) -tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone() -# Need to use the modifier to customize because the exact EDProducer type depends on the modifier -gpu.toModify(tpClusterProducerPixelTrackingOnly, src = "tpClusterProducerHeterogeneousPixelTrackingOnly") +tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone( + pixelClusterSrc = "siPixelClustersPreSplitting" +) +from Configuration.ProcessModifiers.gpu_cff import gpu +gpu.toReplaceWith(tpClusterProducerPixelTrackingOnly, tpClusterProducerConverter.clone( + src = "tpClusterProducerHeterogeneousPixelTrackingOnly" +)) quickTrackAssociatorByHitsPixelTrackingOnly = quickTrackAssociatorByHits.clone( cluster2TPSrc = "tpClusterProducerPixelTrackingOnly" From b6b2fff3058e61db39a8522ab754fce678b332f4 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 1 Aug 2018 17:21:39 +0200 Subject: [PATCH 011/102] Synchronise with CMSSW_10_2_0 --- Validation/RecoTrack/python/TrackValidation_cff.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index 5a806298bb7b3..c8fb7653f0e43 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -21,6 +21,7 @@ import RecoTracker.IterativeTracking.iterativeTkConfig as _cfg import RecoTracker.IterativeTracking.iterativeTkUtils as _utils from Configuration.Eras.Modifier_fastSim_cff import fastSim +import six ### First define the stuff for the standard validation sequence ## Track selectors @@ 
-174,7 +175,7 @@ def _sequenceForEachEra(function, args, names, sequence, modDict, plainArgs=[], _era.toReplaceWith(defaultSequence, modDict[sequence+_postfix]) def _setForEra(module, eraName, era, **kwargs): if eraName == "": - for key, value in kwargs.iteritems(): + for key, value in six.iteritems(kwargs): setattr(module, key, value) else: era.toModify(module, **kwargs) @@ -234,7 +235,7 @@ def _getMVASelectors(postfix): mvaSel = _utils.getMVASelectors(postfix) pset = cms.untracked.PSet() - for iteration, (trackProducer, classifiers) in mvaSel.iteritems(): + for iteration, (trackProducer, classifiers) in six.iteritems(mvaSel): setattr(pset, trackProducer, cms.untracked.vstring(classifiers)) return pset for _eraName, _postfix, _era in _relevantEras: From ecd14656a68c029c883c5faf492612da0a8db68b Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 8 Aug 2018 19:17:46 +0200 Subject: [PATCH 012/102] Pixel doublets on GPU (cms-patatrack#118) Pixel doublets (actually CACells) are created on GPU and fed to CA. The whole workflow up to quadruplets candidates is now fully on GPU. --- .../PixelTriplets/plugins/GPUCACell.h | 74 ++---- .../PixelTriplets/plugins/RecHitsMap.h | 77 ++++++ .../PixelTriplets/plugins/gpuPixelDoublets.h | 233 +++++++++--------- 3 files changed, 223 insertions(+), 161 deletions(-) create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 14d8ee833ce71..e8d389f00712b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -5,43 +5,43 @@ #define GPU_CACELL_H_ #include "GPUHitsAndDoublets.h" +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include + struct Quadruplet { - int2 layerPairsAndCellId[3]; + int hitId[4]; }; + class GPUCACell { public: __host__ __device__ GPUCACell() {} - __host__ __device__ void init(const GPULayerDoublets *doublets, - const GPULayerHits *hitsOnLayer, - int layerPairId, int doubletId, int innerHitId, - int outerHitId, float regionX, float regionY) { +__host__ __device__ void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, + int layerPairId, int doubletId, int innerHitId,int outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; theDoubletId = doubletId; theLayerPairId = layerPairId; - auto innerLayerId = doublets->innerLayerId; - auto outerLayerId = doublets->outerLayerId; - - theInnerX = hitsOnLayer[innerLayerId].x[innerHitId]; - theOuterX = hitsOnLayer[outerLayerId].x[outerHitId]; + theInnerX = hh.xg_d[innerHitId]; + theOuterX = hh.xg_d[outerHitId]; - theInnerY = hitsOnLayer[innerLayerId].y[innerHitId]; - theOuterY = hitsOnLayer[outerLayerId].y[outerHitId]; + theInnerY = hh.yg_d[innerHitId]; + theOuterY = hh.yg_d[outerHitId]; - theInnerZ = hitsOnLayer[innerLayerId].z[innerHitId]; - theOuterZ = hitsOnLayer[outerLayerId].z[outerHitId]; - theInnerR = hypot(theInnerX - regionX, theInnerY - regionY); - theOuterR = hypot(theOuterX - regionX, theOuterY - regionY); + theInnerZ = hh.zg_d[innerHitId]; + theOuterZ = hh.zg_d[outerHitId]; + theInnerR = hh.rg_d[innerHitId]; + theOuterR = hh.rg_d[outerHitId]; theOuterNeighbors.reset(); } + + constexpr float get_inner_x() const { return theInnerX; } constexpr float get_outer_x() const { 
return theOuterX; } constexpr float get_inner_y() const { return theInnerY; } @@ -201,57 +201,31 @@ class GPUCACell { // the ntuplets is then saved if the number of hits it contains is greater // than a threshold + tmpNtuplet.push_back_unsafe(theInnerHitId); + assert(tmpNtuplet.size()<=3); - if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { + if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet-1) { Quadruplet tmpQuadruplet; - for (unsigned int i = 0; i < minHitsPerNtuplet - 1; ++i) { - tmpQuadruplet.layerPairsAndCellId[i].x = cells[tmpNtuplet[i]].theLayerPairId; - tmpQuadruplet.layerPairsAndCellId[i].y = tmpNtuplet[i]; + for (unsigned int i = 0; i < minHitsPerNtuplet-1; ++i) { + tmpQuadruplet.hitId[i] = tmpNtuplet[i]; } + tmpQuadruplet.hitId[minHitsPerNtuplet-1] = theOuterHitId; foundNtuplets->push_back(tmpQuadruplet); } else { for (int j = 0; j < theOuterNeighbors.size(); ++j) { auto otherCell = theOuterNeighbors[j]; - tmpNtuplet.push_back_unsafe(otherCell); cells[otherCell].find_ntuplets(cells, foundNtuplets, tmpNtuplet, minHitsPerNtuplet); - tmpNtuplet.pop_back(); } } + tmpNtuplet.pop_back(); + assert(tmpNtuplet.size()<3); } #endif - template - __host__ inline void find_ntuplets_host( - const GPUCACell *cells, - GPU::VecArray *foundNtuplets, - GPU::VecArray &tmpNtuplet, - const unsigned int minHitsPerNtuplet) const { - - Quadruplet tmpQuadruplet; - if (tmpNtuplet.size() >= minHitsPerNtuplet - 1) { - for (int i = 0; i < minHitsPerNtuplet - 1; ++i) { - tmpQuadruplet.layerPairsAndCellId[i].x = - cells[tmpNtuplet[i]].theLayerPairId; - tmpQuadruplet.layerPairsAndCellId[i].y = tmpNtuplet[i]; - } - foundNtuplets->push_back(tmpQuadruplet); - } - - else { - for (int j = 0; j < theOuterNeighbors.size(); ++j) { - auto otherCell = theOuterNeighbors[j]; - tmpNtuplet.push_back_unsafe(otherCell); - cells[otherCell].find_ntuplets_host(cells, foundNtuplets, tmpNtuplet, - minHitsPerNtuplet); - - tmpNtuplet.pop_back(); - } - } - } GPU::VecArray< unsigned int, 40> theOuterNeighbors; int theDoubletId; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h new file mode 100644 index 0000000000000..566a15591472c --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h @@ -0,0 +1,77 @@ +#ifndef RecHitsMap_H +#define RecHitsMap_H + // store T for each cluster... + +#include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" +#include +#include + +#include "FWCore/MessageLogger/interface/MessageLogger.h" + + +//FIXME move it to a better place... +template +class RecHitsMap { +public: + + explicit RecHitsMap(T const & d=T()) : dummy(d){} + + void clear() {m_map.clear();} + + void error(const GeomDetUnit& gd) const {edm::LogError("RecHitMap") << "hit not found in det " << gd.index(); } + void error(uint32_t ind) const {edm::LogError("RecHitMap") << "hit not found in det " << ind; } + + // does not work for matched hits... 
(easy to extend) + void add(TrackingRecHit const & hit, T const & v) { + auto const & thit = static_cast(hit); + auto const & clus = thit.firstClusterRef(); + + if (clus.isPixel()) + add(clus.pixelCluster(), *thit.detUnit(),v); + else + add(clus.stripCluster(), *thit.detUnit(),v); + } + + template + void add(const Cluster& cluster, const GeomDetUnit& gd, T const & v) { m_map[encode(cluster,gd)] = v; } + + template + T const & get(const Cluster& cluster, const GeomDetUnit& gd) const { + auto p = m_map.find(encode(cluster,gd)); + if (p!=m_map.end()) { return (*p).second; } + error(gd); + return dummy; + } + + T const & get(uint32_t ind, uint16_t mr, uint16_t mc) const { + auto p = m_map.find(encode(ind,mr,mc)); + if (p!=m_map.end()) { return (*p).second; } + error(ind); + return dummy; + } + + static uint64_t encode(uint32_t ind, uint16_t mr, uint16_t mc) { + uint64_t u1 = ind; + uint64_t u2 = mr; + uint64_t u3 = mc; + return (u1<<32) | (u2<<16) | u3; + } + + static uint64_t encode(const SiPixelCluster& cluster, const GeomDetUnit& det) { + uint64_t u1 = det.index(); + uint64_t u2 = cluster.minPixelRow(); + uint64_t u3 = cluster.minPixelCol(); + return (u1<<32) | (u2<<16) | u3; + } + static uint64_t encode(const SiStripCluster& cluster, const GeomDetUnit& det) { + uint64_t u1 = det.index(); + uint64_t u2 = cluster.firstStrip(); + return (u1<<32) | u2; + } + + + std::unordered_map m_map; + T dummy; + }; + +#endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index f09ae6aba5efb..6290c47b9e1ef 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -11,151 +11,162 @@ #include "DataFormats/Math/interface/approx_atan2.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "GPUCACell.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" + namespace gpuPixelDoublets { + constexpr uint32_t MaxNumOfDoublets = 1024*1024*256; + + template __device__ - std::pair - findPhiLimits(int16_t phiMe, int16_t * iphi, uint16_t * index, uint16_t size, int16_t iphicut) { + void doubletsFromHisto(uint8_t const * layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, + int16_t const * iphi, Hist const * hist, uint32_t const * offsets, + siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, + GPU::VecArray< unsigned int, 2048> * isOuterHitOfCell, + int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) { + + auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; + + // to be optimized later + uint32_t innerLayerCumulativeSize[64]; + assert(nPairs<=64); + innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); + for (uint32_t i=1; i0); + auto ntot = innerLayerCumulativeSize[nPairs-1]; - // find extreemes in top - int16_t minPhi = phiMe-iphicut; - int16_t maxPhi = phiMe+iphicut; - // std::cout << "\n phi min/max " << phiMe << ' ' << minPhi << ' ' << maxPhi << std::endl; + auto idx = blockIdx.x*blockDim.x + threadIdx.x; + for(auto j=idx;j::max())); - // std::cout << "jm for " << mPhi << ' ' << jm << std::endl; - jm = std::min(size-1,std::max(0,jm)); - bool notDone=true; - while(jm>0 && mPhiiphi[index[++jm]]){} - jm = std::min(size-1,std::max(0,jm)); - return jm; - }; + uint32_t pairLayerId=0; + while(j>=innerLayerCumulativeSize[pairLayerId++]); --pairLayerId; // move to lower_bound ?? 
- auto jmin = findLimit(minPhi); - auto jmax = findLimit(maxPhi); + assert(pairLayerId=innerLayerCumulativeSize[pairLayerId-1]); + uint8_t inner = layerPairs[2*pairLayerId]; + uint8_t outer = layerPairs[2*pairLayerId+1]; + assert(outer>inner); - /* - std::cout << "j min/max " << jmin << ' ' << jmax << std::endl; - std::cout << "found min/max " << iphi[index[jmin]] << ' ' << iphi[index[jmax]] << std::endl; - std::cout << "found min/max +1 " << iphi[index[jmin+1]] << ' ' << iphi[index[jmax+1]] << std::endl; - std::cout << "found min/max -1 " << iphi[index[jmin-1]] << ' ' << iphi[index[jmax-1]] << std::endl; - */ + auto i = (0==pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; + i += offsets[inner]; - return std::make_pair(jmin,jmax); - } + // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); + assert(i>=offsets[inner]); + assert(i=offsets[9]) { - // get rid of last layer - return; - } - - assert(0==offsets[0]); - int top = (i>offsets[5]) ? 5: 0; - while (i>=offsets[++top]){}; - assert(top<10); - auto bottom = top-1; - if (bottom == 3 or bottom == 6) { - // do not have UP... (9 we got rid already) - return; - } - assert(i >= offsets[bottom]); - assert(i < offsets[top]); - - if (index[i]>= (offsets[top]-offsets[bottom])) { - printf("index problem: %d %d %d %d %d\n",i, offsets[top], offsets[bottom], offsets[top]-offsets[bottom], index[i]); - return; - } + // found hit corresponding to our cuda thread!!!!! + // do the job - assert(index[i]::max()); - - auto jLimits = findPhiLimits(phiMe, iphi+offsets[top],index+offsets[top],size,iphicut); - - auto slidingWindow = [&](uint16_t mysize, uint16_t mymin,uint16_t mymax) { - auto topPhi = iphi+offsets[top]; - uint16_t imax = std::numeric_limits::max(); - uint16_t offset = (mymin>mymax) ? imax-(mysize-1) : 0; - int n=0; - for (uint16_t i = mymin+offset; i!=mymax; i++) { - assert(i<=imax); - uint16_t k = (i>mymax) ? i-offset : i; - assert(k=mymin || k2*iphicut && int16_t(phiMe-topPhi[k])>2*iphicut) - printf("deltaPhi problem: %d %d %d %d, deltas %d:%d cut %d\n",i,k,phiMe,topPhi[k],int16_t(topPhi[k]-phiMe),int16_t(phiMe-topPhi[k]),iphicut); - n++; - } - int tot = (mymin>mymax) ? (mysize-mymin)+mymax : mymax-mymin; - assert(n==tot); + auto mep = iphi[i]; + auto mez = hh.zg_d[i]; + auto mer = hh.rg_d[i]; + auto cutoff = [&](int j) { return + abs(hh.zg_d[j]-mez) > maxz[pairLayerId] || + abs(hh.zg_d[j]-mez) < minz[pairLayerId] || + hh.rg_d[j]-mer > maxr[pairLayerId]; }; - slidingWindow(size,jLimits.first,jLimits.second); - } - - template - __device__ - void doubletsFromHisto(int16_t const * iphi, Hist const * hist, uint32_t const * offsets, float phiCut) { - auto iphicut = phi2short(phiCut); - auto i = blockIdx.x*blockDim.x + threadIdx.x; - if (i>=offsets[9]) { - // get rid of last layer - return; - } + constexpr float z0cut = 12.f; + auto z0cutoff = [&](int j) { + auto zo = hh.zg_d[j]; + auto ro = hh.rg_d[j]; + auto dr = ro-mer; + return dr > maxr[pairLayerId] || + dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; + }; - assert(0==offsets[0]); - int top = (i>offsets[5]) ? 5: 0; - while (i>=offsets[++top]){}; - assert(top<10); - auto bottom = top-1; - if (bottom==3 || bottom==6) { - // do not have UP... 
(9 we got rid already) - return; - } - assert(i>=offsets[bottom]); - assert(i iphicut) + if (kk!=kl && kk!=kh) nmin+=hist[outer].size(kk); + for (auto p=hist[outer].begin(kk); p=offsets[outer]); + assert(oi iphicut) continue; + if (z0cutoff(oi)) continue; + auto ind = atomicInc(nCells,MaxNumOfDoublets); + // int layerPairId, int doubletId, int innerHitId,int outerHitId) + cells[ind].init(hh,pairLayerId,ind,i,oi); + isOuterHitOfCell[oi].push_back(ind); + if (isOuterHitOfCell[oi].full()) ++tooMany; ++tot; } } - if (0==hist[top].nspills) assert(tot>=nmin); + if (tooMany>0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner,outer, kl,kh,nmin,tot); + + if (hist[outer].nspills>0) + printf("spill bin to be checked in %d %d\n",outer,hist[outer].nspills); + + // if (0==hist[outer].nspills) assert(tot>=nmin); // look in spill bin as well.... - } + + } // loop in block... + } __global__ - void getDoubletsFromHisto(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, float phiCut) { + void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, + GPU::VecArray< unsigned int, 2048> *isOuterHitOfCell) { + + uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3 + // ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 + ,0,7 ,1,7 ,2,7 ,7,8 ,8,9 + ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 + }; + + const int16_t phi0p05 = phi2short(0.05); + const int16_t phi0p06 = phi2short(0.06); + const int16_t phi0p07 = phi2short(0.07); + + int16_t const phicuts[13] { phi0p05, phi0p05, phi0p06 + ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 + ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 + }; + + float const minz[13] = { 0., 0., 0. + ,0., 0., 0., 0., 0. + ,0., 0., 0., 0., 0. + }; + + float const maxz[13] = { 20.,15.,12. + ,30.,20.,20., 50., 50. + ,30.,20.,20., 50., 50. + }; + + float const maxr[13] = { 20., 20., 20. + ,9., 7., 6., 5., 5. + ,9., 7., 6., 5., 5. + }; + + auto const & hh = *hhp; - doubletsFromHisto(hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,phiCut); + doubletsFromHisto(layerPairs, 13, cells, nCells, + hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d, + hh, isOuterHitOfCell, + phicuts, minz, maxz, maxr); } + + } // namespace end #endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h From e5390071e96eef705acafb746bdbbb1e3387a8a6 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 9 Aug 2018 12:26:40 +0200 Subject: [PATCH 013/102] Cleanup defines, includes, file names, and debug messages (cms-patatrack#122) Do not #ifdef on __NVCC__: to protect CUDA-aware code sections, check if the __CUDACC__ symbol is defined. The symbol __NVCC__ is defined when building with nvcc, but not when building CUDA code with clang. Move header files referenced from outside their directory to the interface/ directory, and update the include guards accordingly. Include instead of to handle the CUDA attributes in non-CUDA compilations. Rename PixelTrackReconstructionGPU_impl.cu to PixelTrackReconstructionGPU.cu. Other cleanup: #defines, debug messages, change __inline__ to inline, fix include guards, whitespaces, etc. 
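
To make the conventions concrete, a short illustrative header follows; the file, class and function names are placeholders rather than code from this patch, and the include of <cuda_runtime.h> is an assumption made here so that the attribute macros are also available in host-only compilations:

    #ifndef RecoPixelVertexing_PixelTriplets_plugins_Example_h
    #define RecoPixelVertexing_PixelTriplets_plugins_Example_h

    #include <cuda_runtime.h>   // assumed to provide __host__ / __device__ also in non-CUDA compilations

    class Example {
    public:
      // usable from both host and device code
      __host__ __device__ int twice(int x) const { return 2 * x; }

    #ifdef __CUDACC__
      // device-only code: __CUDACC__ is defined whenever CUDA code is being compiled,
      // by nvcc as well as by clang; __NVCC__ is defined only by nvcc itself
      __device__ int laneId() const { return threadIdx.x % 32; }
    #endif  // __CUDACC__
    };

    #endif  // RecoPixelVertexing_PixelTriplets_plugins_Example_h
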
--- .../PixelTrackFitting/interface/RiemannFit.h | 309 +++++++++--------- .../PixelTrackFitting/test/testEigenGPU.cu | 67 ++-- .../test/testEigenGPUNoFit.cu | 80 ++--- .../PixelTrackFitting/test/test_common.h | 21 +- .../PixelTriplets/plugins/GPUCACell.h | 45 ++- 5 files changed, 274 insertions(+), 248 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index ed1efc8a3240b..8a1e29f78041b 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -1,17 +1,13 @@ -#ifndef RECOPIXELVERTEXING_PIXELTRACKFITTING_RIEMANNFIT_H -#define RECOPIXELVERTEXING_PIXELTRACKFITTING_RIEMANNFIT_H +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h +#define RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h +#include #include #include -#include -#ifdef __CUDACC__ -#define CUDA_HOSTDEV __host__ __device__ -#else -#define CUDA_HOSTDEV -#endif - -#define DEBUG 0 +#ifndef RFIT_DEBUG +#define RFIT_DEBUG 0 +#endif // RFIT_DEBUG namespace Rfit { @@ -80,21 +76,24 @@ struct helix_fit { template -CUDA_HOSTDEV void printIt(C * m, const char * prefix = "", bool debug=false) { +__host__ __device__ +void printIt(C * m, const char * prefix = "") { +#if RFIT_DEBUG for (u_int r = 0; r < m->rows(); ++r) { for (u_int c = 0; c < m->cols(); ++c) { - if (debug) { - printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r,c)); - } + printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r,c)); } } +#endif } /*! \brief raise to square. */ -CUDA_HOSTDEV inline double sqr(const double a) { return a * a; } +template +__host__ __device__ +inline T sqr(const T a) { return a * a; } /*! \brief Compute cross product of two 2D vector (assuming z component 0), @@ -106,7 +105,8 @@ CUDA_HOSTDEV inline double sqr(const double a) { return a * a; } \return z component of the cross product. */ -CUDA_HOSTDEV inline double cross2D(const Vector2d& a, const Vector2d& b) { +__host__ __device__ +inline double cross2D(const Vector2d& a, const Vector2d& b) { return a.x() * b.y() - a.y() * b.x(); } @@ -130,7 +130,8 @@ CUDA_HOSTDEV inline double cross2D(const Vector2d& a, const Vector2d& b) { */ // X in input TO FIX -CUDA_HOSTDEV inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, +__host__ __device__ +inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, const Vector4d& fast_fit, VectorNd const & rad, double B) { @@ -151,7 +152,7 @@ CUDA_HOSTDEV inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, } } } - Rfit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: ", DEBUG); + printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); return scatter_cov_rad; } @@ -165,12 +166,13 @@ CUDA_HOSTDEV inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, \return cov_cart covariance matrix in Cartesian coordinates. */ -CUDA_HOSTDEV inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, +__host__ __device__ +inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, const MatrixNd& cov_rad, const VectorNd &rad) { - if (DEBUG) { +#if RFIT_DEBUG printf("Address of p2D: %p\n", &p2D); - } +#endif printIt(&p2D, "cov_radtocart - p2D:"); u_int n = p2D.cols(); Matrix2Nd cov_cart = MatrixXd::Zero(2 * n, 2 * n); @@ -205,7 +207,8 @@ CUDA_HOSTDEV inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, \warning correlation between different point are not computed. 
*/ -CUDA_HOSTDEV inline MatrixNd cov_carttorad(const Matrix2xNd& p2D, +__host__ __device__ +inline MatrixNd cov_carttorad(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, const VectorNd& rad) { u_int n = p2D.cols(); @@ -240,7 +243,8 @@ CUDA_HOSTDEV inline MatrixNd cov_carttorad(const Matrix2xNd& p2D, */ -CUDA_HOSTDEV inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, +__host__ __device__ +inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, const Vector4d& fast_fit, const VectorNd& rad) { u_int n = p2D.cols(); @@ -278,7 +282,8 @@ CUDA_HOSTDEV inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const M diagonal cov matrix. Further investigation needed. */ -CUDA_HOSTDEV inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { +__host__ __device__ +inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { return cov_rad_inv.colwise().sum().transpose(); } @@ -294,7 +299,8 @@ CUDA_HOSTDEV inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { \return weight points' weights' vector for the line fit (ODR). */ -CUDA_HOSTDEV inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y_err2, const double& tan_theta) { +__host__ __device__ +inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y_err2, const double& tan_theta) { return (1. + sqr(tan_theta)) * 1. / (x_err2 + y_err2 * sqr(tan_theta)); } @@ -309,7 +315,8 @@ CUDA_HOSTDEV inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y \return q int 1 or -1. */ -CUDA_HOSTDEV inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) { +__host__ __device__ +inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) { return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > 0) @@ -327,7 +334,8 @@ CUDA_HOSTDEV inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& par_uv \param error flag for errors computation. */ -CUDA_HOSTDEV inline void par_uvrtopak(circle_fit& circle, const double B, const bool& error) { +__host__ __device__ +inline void par_uvrtopak(circle_fit& circle, const double B, const bool& error) { Vector3d par_pak; const double temp0 = circle.par.head(2).squaredNorm(); const double temp1 = sqrt(temp0); @@ -359,7 +367,8 @@ CUDA_HOSTDEV inline void par_uvrtopak(circle_fit& circle, const double B, const \return x_err2 squared errors in the x axis. */ -CUDA_HOSTDEV inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, +__host__ __device__ +inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, const bool& error, u_int n) { VectorNd x_err2(n); for (u_int i = 0; i < n; ++i) { @@ -395,17 +404,18 @@ CUDA_HOSTDEV inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle */ -CUDA_HOSTDEV inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { - if (DEBUG) { - printf("min_eigen3D - enter\n"); - } +__host__ __device__ +inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { +#if RFIT_DEBUG + printf("min_eigen3D - enter\n"); +#endif SelfAdjointEigenSolver solver(3); solver.computeDirect(A); int min_index; chi2 = solver.eigenvalues().minCoeff(&min_index); - if (DEBUG) { - printf("min_eigen3D - exit\n"); - } +#if RFIT_DEBUG + printf("min_eigen3D - exit\n"); +#endif return solver.eigenvectors().col(min_index); } @@ -423,7 +433,8 @@ CUDA_HOSTDEV inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { speed up in single precision. 
*/ -CUDA_HOSTDEV inline Vector3d min_eigen3D_fast(const Matrix3d& A) { +__host__ __device__ +inline Vector3d min_eigen3D_fast(const Matrix3d& A) { SelfAdjointEigenSolver solver(3); solver.computeDirect(A.cast()); int min_index; @@ -444,7 +455,8 @@ CUDA_HOSTDEV inline Vector3d min_eigen3D_fast(const Matrix3d& A) { significantly in single precision. */ -CUDA_HOSTDEV inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { +__host__ __device__ +inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { SelfAdjointEigenSolver solver(2); solver.computeDirect(A); int min_index; @@ -469,17 +481,18 @@ CUDA_HOSTDEV inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { - computation of error due to multiple scattering. */ -CUDA_HOSTDEV inline Vector4d Fast_fit(const Matrix3xNd& hits) { +__host__ __device__ +inline Vector4d Fast_fit(const Matrix3xNd& hits) { Vector4d result; u_int n = hits.cols(); // get the number of hits - printIt(&hits, "Fast_fit - hits: ", DEBUG); + printIt(&hits, "Fast_fit - hits: "); // CIRCLE FIT // Make segments between middle-to-first(b) and last-to-first(c) hits const Vector2d b = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); const Vector2d c = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); - printIt(&b, "Fast_fit - b: ", DEBUG); - printIt(&c, "Fast_fit - c: ", DEBUG); + printIt(&b, "Fast_fit - b: "); + printIt(&c, "Fast_fit - c: "); // Compute their lengths const double b2 = b.squaredNorm(); const double c2 = c.squaredNorm(); @@ -508,13 +521,13 @@ CUDA_HOSTDEV inline Vector4d Fast_fit(const Matrix3xNd& hits) { result(0) = X0 + hits(0, 0); result(1) = Y0 + hits(1, 0); result(2) = sqrt(sqr(X0) + sqr(Y0)); - printIt(&result, "Fast_fit - result: ", DEBUG); + printIt(&result, "Fast_fit - result: "); // LINE FIT const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - printIt(&e, "Fast_fit - e: ", DEBUG); - printIt(&d, "Fast_fit - d: ", DEBUG); + printIt(&e, "Fast_fit - e: "); + printIt(&d, "Fast_fit - d: "); // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) const double dr = result(2) * atan2(cross2D(d, e), d.dot(e)); // Simple difference in Z between last and first hit @@ -522,10 +535,9 @@ CUDA_HOSTDEV inline Vector4d Fast_fit(const Matrix3xNd& hits) { result(3) = (dr / dz); - if (DEBUG) { - printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), - result(1), result(2), result(3)); - } +#if RFIT_DEBUG + printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); +#endif return result; } @@ -561,25 +573,27 @@ CUDA_HOSTDEV inline Vector4d Fast_fit(const Matrix3xNd& hits) { scattering. 
*/ -CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, +__host__ __device__ +inline circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd & hits_cov2D, const Vector4d & fast_fit, const VectorNd & rad, const double B, const bool error = true, - const bool scattering = false) { - if (true) { - printf("circle_fit - enter\n"); - } + const bool scattering = false) +{ +#if RFIT_DEBUG + printf("circle_fit - enter\n"); +#endif // INITIALIZATION Matrix2Nd V = hits_cov2D; u_int n = hits2D.cols(); - printIt(&hits2D, "circle_fit - hits2D:", DEBUG); - printIt(&hits_cov2D, "circle_fit - hits_cov2D:", DEBUG); + printIt(&hits2D, "circle_fit - hits2D:"); + printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); - if (DEBUG) { - printf("circle_fit - WEIGHT COMPUTATION\n"); - } +#if RFIT_DEBUG + printf("circle_fit - WEIGHT COMPUTATION\n"); +#endif // WEIGHT COMPUTATION VectorNd weight; MatrixNd G; @@ -587,27 +601,27 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, { MatrixNd cov_rad; cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad); - printIt(&cov_rad, "circle_fit - cov_rad:", DEBUG); + printIt(&cov_rad, "circle_fit - cov_rad:"); // cov_rad = cov_carttorad(hits2D, V); if (scattering) { MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); - printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:", DEBUG); - printIt(&hits2D, "circle_fit - hits2D bis:", DEBUG); - if (DEBUG) { - printf("Address of hits2D: a) %p\n", &hits2D); - } + printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); + printIt(&hits2D, "circle_fit - hits2D bis:"); +#if RFIT_DEBUG + printf("Address of hits2D: a) %p\n", &hits2D); +#endif V += cov_radtocart(hits2D, scatter_cov_rad, rad); - printIt(&V, "circle_fit - V:", DEBUG); + printIt(&V, "circle_fit - V:"); cov_rad += scatter_cov_rad; - printIt(&cov_rad, "circle_fit - cov_rad:", DEBUG); + printIt(&cov_rad, "circle_fit - cov_rad:"); Matrix4d cov_rad4 = cov_rad; Matrix4d G4; G4 = cov_rad4.inverse(); - printIt(&G4, "circle_fit - G4:", DEBUG); + printIt(&G4, "circle_fit - G4:"); renorm = G4.sum(); G4 *= 1. / renorm; - printIt(&G4, "circle_fit - G4:", DEBUG); + printIt(&G4, "circle_fit - G4:"); G = G4; weight = Weight_circle(G); } else { @@ -616,25 +630,25 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, weight *= 1. 
/ renorm; } } - printIt(&weight, "circle_fit - weight:", DEBUG); + printIt(&weight, "circle_fit - weight:"); - if (DEBUG) { - printf("circle_fit - SPACE TRANSFORMATION\n"); - } // SPACE TRANSFORMATION +#if RFIT_DEBUG + printf("circle_fit - SPACE TRANSFORMATION\n"); +#endif // center - if (DEBUG) { - printf("Address of hits2D: b) %p\n", &hits2D); - } +#if RFIT_DEBUG + printf("Address of hits2D: b) %p\n", &hits2D); +#endif const Vector2d h_ = hits2D.rowwise().mean(); // centroid - printIt(&h_, "circle_fit - h_:", DEBUG); + printIt(&h_, "circle_fit - h_:"); Matrix3xNd p3D(3, n); p3D.block(0, 0, 2, n) = hits2D.colwise() - h_; - printIt(&p3D, "circle_fit - p3D: a)", DEBUG); + printIt(&p3D, "circle_fit - p3D: a)"); Vector2Nd mc(2 * n); // centered hits, used in error computation mc << p3D.row(0).transpose(), p3D.row(1).transpose(); - printIt(&mc, "circle_fit - mc(centered hits):", DEBUG); + printIt(&mc, "circle_fit - mc(centered hits):"); // scale const double q = mc.squaredNorm(); @@ -643,11 +657,11 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, // project on paraboloid p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); - printIt(&p3D, "circle_fit - p3D: b)", DEBUG); + printIt(&p3D, "circle_fit - p3D: b)"); - if (DEBUG) { - printf("circle_fit - COST FUNCTION\n"); - } +#if RFIT_DEBUG + printf("circle_fit - COST FUNCTION\n"); +#endif // COST FUNCTION // compute @@ -659,39 +673,39 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, else { for (u_int i = 0; i < n; ++i) A += weight(i) * (X.col(i) * X.col(i).transpose()); } - printIt(&A, "circle_fit - A:", DEBUG); + printIt(&A, "circle_fit - A:"); - if (DEBUG) { - printf("circle_fit - MINIMIZE\n"); - } +#if RFIT_DEBUG + printf("circle_fit - MINIMIZE\n"); +#endif // minimize double chi2; Vector3d v = min_eigen3D(A, chi2); - if (DEBUG) { - printf("circle_fit - AFTER MIN_EIGEN\n"); - } - printIt(&v, "v BEFORE INVERSION", DEBUG); +#if RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN\n"); +#endif + printIt(&v, "v BEFORE INVERSION"); v *= (v(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 - printIt(&v, "v AFTER INVERSION", DEBUG); + printIt(&v, "v AFTER INVERSION"); // This hack to be able to run on GPU where the automatic assignment to a // double from the vector multiplication is not working. - if (DEBUG) { - printf("circle_fit - AFTER MIN_EIGEN 1\n"); - } +#if RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 1\n"); +#endif Matrix cm; - if (DEBUG) { - printf("circle_fit - AFTER MIN_EIGEN 2\n"); - } +#if RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 2\n"); +#endif cm = -v.transpose() * r0; - if (DEBUG) { - printf("circle_fit - AFTER MIN_EIGEN 3\n"); - } +#if RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 3\n"); +#endif const double c = cm(0,0); // const double c = -v.transpose() * r0; - if (DEBUG) { - printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); - } +#if RFIT_DEBUG + printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); +#endif // COMPUTE CIRCLE PARAMETER // auxiliary quantities @@ -705,24 +719,24 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, circle.par << par_uvr_(0) * s_inv + h_(0), par_uvr_(1) * s_inv + h_(1), par_uvr_(2) * s_inv; circle.q = Charge(hits2D, circle.par); circle.chi2 = abs(chi2) * renorm * 1. 
/ sqr(2 * v(2) * par_uvr_(2) * s); - printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:", DEBUG); - printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:", DEBUG); - if (DEBUG) { - printf("circle_fit - CIRCLE CHARGE: %ld\n", circle.q); - } + printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); + printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); +#if RFIT_DEBUG + printf("circle_fit - CIRCLE CHARGE: %ld\n", circle.q); +#endif - if (DEBUG) { - printf("circle_fit - ERROR PROPAGATION\n"); - } +#if RFIT_DEBUG + printf("circle_fit - ERROR PROPAGATION\n"); +#endif // ERROR PROPAGATION if (error) { - if (DEBUG) { - printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); - } +#if RFIT_DEBUG + printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); +#endif ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points - if (DEBUG) { - printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); - } +#if RFIT_DEBUG + printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); +#endif { Matrix cm; Matrix cm2; @@ -733,12 +747,12 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, const Matrix2Nd Vcs = sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * (2. * V.squaredNorm() + 4. * c) * // mc.transpose() * V * mc) * mc * mc.transpose(); - printIt(&Vcs, "circle_fit - Vcs:", DEBUG); + printIt(&Vcs, "circle_fit - Vcs:"); Vcs_[0][0] = Vcs.block(0, 0, n, n); Vcs_[0][1] = Vcs.block(0, n, n, n); Vcs_[1][1] = Vcs.block(n, n, n, n); Vcs_[1][0] = Vcs_[0][1].transpose(); - printIt(&Vcs, "circle_fit - Vcs:", DEBUG); + printIt(&Vcs, "circle_fit - Vcs:"); } MatrixNd C[3][3]; // cov matrix of 3D transformed points @@ -758,7 +772,7 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, Vcs_[1][1] * Vcs_[1][1]) + 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11); } - printIt(&C[0][0], "circle_fit - C[0][0]:", DEBUG); + printIt(&C[0][0], "circle_fit - C[0][0]:"); Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) for (u_int i = 0; i < 3; ++i) { @@ -770,14 +784,14 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, C0(j, i) = C0(i, j); } } - printIt(&C0, "circle_fit - C0:", DEBUG); + printIt(&C0, "circle_fit - C0:"); const MatrixNd W = weight * weight.transpose(); const MatrixNd H = MatrixXd::Identity(n, n).rowwise() - weight.transpose(); const MatrixNx3d s_v = H * p3D.transpose(); - printIt(&W, "circle_fit - W:", DEBUG); - printIt(&H, "circle_fit - H:", DEBUG); - printIt(&s_v, "circle_fit - s_v:", DEBUG); + printIt(&W, "circle_fit - W:"); + printIt(&H, "circle_fit - H:"); + printIt(&s_v, "circle_fit - s_v:"); MatrixNd D_[3][3]; // cov(s_v) { @@ -791,7 +805,7 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, D_[2][0] = D_[0][2].transpose(); D_[2][1] = D_[1][2].transpose(); } - printIt(&D_[0][0], "circle_fit - D_[0][0]:", DEBUG); + printIt(&D_[0][0], "circle_fit - D_[0][0]:"); constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; @@ -830,7 +844,7 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, if (b != a) E(b, a) = E(a, b); } } - printIt(&E, "circle_fit - E:", DEBUG); + printIt(&E, "circle_fit - E:"); Matrix J2; // Jacobian of min_eigen() (numerically computed) for (u_int a = 0; a < 6; ++a) { @@ -841,7 +855,7 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, const int sign = (J2.col(a)(2) > 0) ? 
1 : -1; J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); } - printIt(&J2, "circle_fit - J2:", DEBUG); + printIt(&J2, "circle_fit - J2:"); Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) { @@ -860,7 +874,7 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, Cvc(3, 3) = c; // (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); } - printIt(&Cvc, "circle_fit - Cvc:", DEBUG); + printIt(&Cvc, "circle_fit - Cvc:"); Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) { @@ -868,10 +882,10 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, 0, 0, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; } - printIt(&J3, "circle_fit - J3:", DEBUG); + printIt(&J3, "circle_fit - J3:"); const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) - printIt(&Jq, "circle_fit - Jq:", DEBUG); + printIt(&Jq, "circle_fit - Jq:"); Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); @@ -879,10 +893,10 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, circle.cov = cov_uvr; } - printIt(&circle.cov, "Circle cov:", DEBUG); - if (DEBUG) { - printf("circle_fit - exit\n"); - } + printIt(&circle.cov, "Circle cov:"); +#if RFIT_DEBUG + printf("circle_fit - exit\n"); +#endif return circle; } @@ -921,7 +935,8 @@ CUDA_HOSTDEV inline circle_fit Circle_fit(const Matrix2xNd& hits2D, errors. */ -CUDA_HOSTDEV inline line_fit Line_fit(const Matrix3xNd& hits, +__host__ __device__ +inline line_fit Line_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const circle_fit& circle, const Vector4d& fast_fit, @@ -931,8 +946,8 @@ CUDA_HOSTDEV inline line_fit Line_fit(const Matrix3xNd& hits, Matrix2xNd p2D(2, n); MatrixNx5d Jx(n, 5); - printIt(&hits, "Line_fit points: ", DEBUG); - printIt(&hits_cov, "Line_fit covs: ", DEBUG); + printIt(&hits, "Line_fit points: "); + printIt(&hits_cov, "Line_fit covs: "); // x & associated Jacobian // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf @@ -974,10 +989,10 @@ CUDA_HOSTDEV inline line_fit Line_fit(const Matrix3xNd& hits, const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); const VectorNd weight = err2_inv * 1. / err2_inv.sum(); - printIt(&x_err2, "Line_fit - x_err2: ", DEBUG); - printIt(&y_err2, "Line_fit - y_err2: ", DEBUG); - printIt(&err2_inv, "Line_fit - err2_inv: ", DEBUG); - printIt(&weight, "Line_fit - weight: ", DEBUG); + printIt(&x_err2, "Line_fit - x_err2: "); + printIt(&y_err2, "Line_fit - y_err2: "); + printIt(&err2_inv, "Line_fit - err2_inv: "); + printIt(&weight, "Line_fit - weight: "); // COST FUNCTION @@ -993,12 +1008,12 @@ CUDA_HOSTDEV inline line_fit Line_fit(const Matrix3xNd& hits, A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); } - printIt(&A, "Line_fit - A: ", DEBUG); + printIt(&A, "Line_fit - A: "); // minimize double chi2; Vector2d v = min_eigen2D(A, chi2); - printIt(&v, "Line_fit - v: ", DEBUG); + printIt(&v, "Line_fit - v: "); // n *= (chi2>0) ? 1 : -1; //TO FIX // This hack to be able to run on GPU where the automatic assignment to a @@ -1012,7 +1027,7 @@ CUDA_HOSTDEV inline line_fit Line_fit(const Matrix3xNd& hits, line.par << -v(0) / v(1), // cotan(theta)) -c * sqrt(sqr(v(0)) + sqr(v(1))) * 1. 
/ v(1); // Zip line.chi2 = abs(chi2); - printIt(&(line.par), "Line_fit - line.par: ", DEBUG); + printIt(&(line.par), "Line_fit - line.par: "); // ERROR PROPAGATION if (error) { @@ -1054,7 +1069,7 @@ CUDA_HOSTDEV inline line_fit Line_fit(const Matrix3xNd& hits, line.cov = J * C * JT; } - printIt(&line.cov, "Line cov:", DEBUG); + printIt(&line.cov, "Line cov:"); return line; } @@ -1128,6 +1143,4 @@ inline helix_fit Helix_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, co } // namespace Rfit - -#endif - +#endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index 6cf8a3664f903..e5cd889321f6e 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -1,14 +1,17 @@ -#include "test_common.h" #include -#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include #include +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "test_common.h" + using namespace Eigen; -__global__ void kernelFullFit(Rfit::Matrix3xNd * hits, +__global__ +void kernelFullFit(Rfit::Matrix3xNd * hits, Rfit::Matrix3Nd * hits_cov, double B, bool errors, @@ -22,8 +25,8 @@ __global__ void kernelFullFit(Rfit::Matrix3xNd * hits, Rfit::Matrix2xNd hits2D_local = (hits->block(0,0,2,n)).eval(); Rfit::Matrix2Nd hits_cov2D_local = (hits_cov->block(0, 0, 2 * n, 2 * n)).eval(); - Rfit::printIt(&hits2D_local, "kernelFullFit - hits2D_local: ", false); - Rfit::printIt(&hits_cov2D_local, "kernelFullFit - hits_cov2D_local: ", false); + Rfit::printIt(&hits2D_local, "kernelFullFit - hits2D_local: "); + Rfit::printIt(&hits_cov2D_local, "kernelFullFit - hits_cov2D_local: "); /* printf("kernelFullFit - hits address: %p\n", hits); printf("kernelFullFit - hits_cov address: %p\n", hits_cov); @@ -51,40 +54,44 @@ __global__ void kernelFullFit(Rfit::Matrix3xNd * hits, return; } -__global__ void kernelFastFit(Rfit::Matrix3xNd * hits, Vector4d * results) { +__global__ +void kernelFastFit(Rfit::Matrix3xNd * hits, Vector4d * results) { (*results) = Rfit::Fast_fit(*hits); } -__global__ void kernelCircleFit(Rfit::Matrix3xNd * hits, +__global__ +void kernelCircleFit(Rfit::Matrix3xNd * hits, Rfit::Matrix3Nd * hits_cov, Vector4d * fast_fit_input, double B, Rfit::circle_fit * circle_fit_resultsGPU) { u_int n = hits->cols(); Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); - if (!NODEBUG) { - printf("fast_fit_input(0): %f\n", (*fast_fit_input)(0)); - printf("fast_fit_input(1): %f\n", (*fast_fit_input)(1)); - printf("fast_fit_input(2): %f\n", (*fast_fit_input)(2)); - printf("fast_fit_input(3): %f\n", (*fast_fit_input)(3)); - printf("rad(0,0): %f\n", rad(0,0)); - printf("rad(1,1): %f\n", rad(1,1)); - printf("rad(2,2): %f\n", rad(2,2)); - printf("hits_cov(0,0): %f\n", (*hits_cov)(0,0)); - printf("hits_cov(1,1): %f\n", (*hits_cov)(1,1)); - printf("hits_cov(2,2): %f\n", (*hits_cov)(2,2)); - printf("hits_cov(11,11): %f\n", (*hits_cov)(11,11)); - printf("B: %f\n", B); - } +#if TEST_DEBUG + printf("fast_fit_input(0): %f\n", (*fast_fit_input)(0)); + printf("fast_fit_input(1): %f\n", (*fast_fit_input)(1)); + printf("fast_fit_input(2): %f\n", (*fast_fit_input)(2)); + printf("fast_fit_input(3): %f\n", (*fast_fit_input)(3)); + printf("rad(0,0): %f\n", 
rad(0,0)); + printf("rad(1,1): %f\n", rad(1,1)); + printf("rad(2,2): %f\n", rad(2,2)); + printf("hits_cov(0,0): %f\n", (*hits_cov)(0,0)); + printf("hits_cov(1,1): %f\n", (*hits_cov)(1,1)); + printf("hits_cov(2,2): %f\n", (*hits_cov)(2,2)); + printf("hits_cov(11,11): %f\n", (*hits_cov)(11,11)); + printf("B: %f\n", B); +#endif (*circle_fit_resultsGPU) = Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), *fast_fit_input, rad, B, false, false); } -__global__ void kernelLineFit(Rfit::Matrix3xNd * hits, - Rfit::Matrix3Nd * hits_cov, - Rfit::circle_fit * circle_fit, - Vector4d * fast_fit, - Rfit::line_fit * line_fit) { +__global__ +void kernelLineFit(Rfit::Matrix3xNd * hits, + Rfit::Matrix3Nd * hits_cov, + Rfit::circle_fit * circle_fit, + Vector4d * fast_fit, + Rfit::line_fit * line_fit) +{ (*line_fit) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit, *fast_fit, true); } @@ -125,9 +132,9 @@ void testFit() { // FAST_FIT_CPU Vector4d fast_fit_results = Rfit::Fast_fit(hits); - if (!NODEBUG) { - std::cout << "Generated hits:\n" << hits << std::endl; - } +#if TEST_DEBUG + std::cout << "Generated hits:\n" << hits << std::endl; +#endif std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; // FAST_FIT GPU diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index 475762f546807..2112f5f6027a5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -1,18 +1,18 @@ -#include "test_common.h" #include #include #include +#include "test_common.h" using namespace Eigen; __host__ __device__ void eigenValues(Matrix3d * m, Eigen::SelfAdjointEigenSolver::RealVectorType * ret) { - if (!NODEBUG) { - printf("Matrix(0,0): %f\n", (*m)(0,0)); - printf("Matrix(1,1): %f\n", (*m)(1,1)); - printf("Matrix(2,2): %f\n", (*m)(2,2)); - } +#if TEST_DEBUG + printf("Matrix(0,0): %f\n", (*m)(0,0)); + printf("Matrix(1,1): %f\n", (*m)(1,1)); + printf("Matrix(2,2): %f\n", (*m)(2,2)); +#endif SelfAdjointEigenSolver es; es.computeDirect(*m); (*ret) = es.eigenvalues(); @@ -37,15 +37,17 @@ __global__ void kernelMultiply(M1 * J, M2 * C, M3 * result) { // Map res(result->data()); - if (!NODEBUG) - printf("*** GPU IN ***\n"); +#if TEST_DEBUG + printf("*** GPU IN ***\n"); +#endif printIt(J); printIt(C); // res.noalias() = (*J) * (*C); // printIt(&res); (*result) = (*J) * (*C); - if (!NODEBUG) - printf("*** GPU OUT ***\n"); +#if TEST_DEBUG + printf("*** GPU OUT ***\n"); +#endif return; } @@ -59,12 +61,12 @@ void testMultiply() { Eigen::Matrix C; fillMatrix(C); Eigen::Matrix multiply_result = J * C; - if (!NODEBUG) { - std::cout << "Input J:" << std::endl; printIt(&J); - std::cout << "Input C:" << std::endl; printIt(&C); - std::cout << "Output:" << std::endl; - printIt(&multiply_result); - } +#if TEST_DEBUG + std::cout << "Input J:" << std::endl; printIt(&J); + std::cout << "Input C:" << std::endl; printIt(&C); + std::cout << "Output:" << std::endl; + printIt(&multiply_result); +#endif // GPU Eigen::Matrix *JGPU = nullptr; Eigen::Matrix *CGPU = nullptr; @@ -95,10 +97,10 @@ void testInverse3x3() { Matrix3d *mGPUret = nullptr; Matrix3d *mCPUret = new Matrix3d(); - if (!NODEBUG) { - std::cout << "Here is the matrix m:" << std::endl << m << std::endl; - std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; - } +#if TEST_DEBUG + std::cout << "Here is the matrix m:" << std::endl << m << 
std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; +#endif cudaMalloc((void **)&mGPU, sizeof(Matrix3d)); cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)); cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); @@ -107,8 +109,9 @@ void testInverse3x3() { cudaDeviceSynchronize(); cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost); - if (!NODEBUG) - std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#if TEST_DEBUG + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#endif assert(isEqualFuzzy(m_inv, *mCPUret)); } @@ -120,10 +123,10 @@ void testInverse4x4() { Matrix4d *mGPUret = nullptr; Matrix4d *mCPUret = new Matrix4d(); - if (!NODEBUG) { - std::cout << "Here is the matrix m:" << std::endl << m << std::endl; - std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; - } +#if TEST_DEBUG + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; +#endif cudaMalloc((void **)&mGPU, sizeof(Matrix4d)); cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)); cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice); @@ -132,8 +135,9 @@ void testInverse4x4() { cudaDeviceSynchronize(); cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost); - if (!NODEBUG) - std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#if TEST_DEBUG + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#endif assert(isEqualFuzzy(m_inv, *mCPUret)); } @@ -148,11 +152,11 @@ void testEigenvalues() { Eigen::SelfAdjointEigenSolver::RealVectorType *ret1 = new Eigen::SelfAdjointEigenSolver::RealVectorType; Eigen::SelfAdjointEigenSolver::RealVectorType *ret_gpu = nullptr; eigenValues(&m, ret); - if (!NODEBUG) { - std::cout << "Generated Matrix M 3x3:\n" << m << std::endl; - std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl; - std::cout << "*************************\n\n" << std::endl; - } +#if TEST_DEBUG + std::cout << "Generated Matrix M 3x3:\n" << m << std::endl; + std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl; + std::cout << "*************************\n\n" << std::endl; +#endif cudaMalloc((void **)&m_gpu, sizeof(Matrix3d)); cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType)); cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); @@ -162,11 +166,11 @@ void testEigenvalues() { cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost); cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost); - if (!NODEBUG) { - std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; - std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; - std::cout << "*************************\n\n" << std::endl; - } +#if TEST_DEBUG +std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; +std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; +std::cout << "*************************\n\n" << std::endl; +#endif assert(isEqualFuzzy(*ret, *ret1)); } diff --git a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h index 78ae4053b8429..e22fb5cfbf59b 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h +++ b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h @@ 
-2,21 +2,24 @@ #define RecoPixelVertexing__PixelTrackFitting__test_common_h #include -#include #include +#include -#define NODEBUG 1 +#ifndef TEST_DEBUG +#define TEST_DEBUG 0 +#endif template -__host__ __device__ void printIt(C * m) { - if (!NODEBUG) { - printf("\nMatrix %dx%d\n", (int)m->rows(), (int)m->cols()); - for (u_int r = 0; r < m->rows(); ++r) { - for (u_int c = 0; c < m->cols(); ++c) { - printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r,c)); - } +__host__ __device__ +void printIt(C * m) { +#if TEST_DEBUG + printf("\nMatrix %dx%d\n", (int)m->rows(), (int)m->cols()); + for (u_int r = 0; r < m->rows(); ++r) { + for (u_int c = 0; c < m->cols(); ++c) { + printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r,c)); } } +#endif } template diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index e8d389f00712b..5995d286fc38d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -1,27 +1,28 @@ // // Author: Felice Pantaleo, CERN // -#ifndef GPU_CACELL_H_ -#define GPU_CACELL_H_ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h +#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h + +#include #include "GPUHitsAndDoublets.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include struct Quadruplet { int hitId[4]; }; - class GPUCACell { public: - __host__ __device__ GPUCACell() {} - + GPUCACell() = default; -__host__ __device__ void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, - int layerPairId, int doubletId, int innerHitId,int outerHitId) { + __host__ __device__ + void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, + int layerPairId, int doubletId, int innerHitId,int outerHitId) + { theInnerHitId = innerHitId; theOuterHitId = outerHitId; theDoubletId = doubletId; @@ -40,8 +41,6 @@ __host__ __device__ void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU cons theOuterNeighbors.reset(); } - - constexpr float get_inner_x() const { return theInnerX; } constexpr float get_outer_x() const { return theOuterX; } constexpr float get_inner_y() const { return theInnerY; } @@ -58,20 +57,19 @@ __host__ __device__ void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU cons } constexpr void print_cell() const { - printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " "%d, innerradius %f, outerRadius %f \n", theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId, theInnerR, theOuterR); } - - - __host__ __device__ bool check_alignment_and_tag( + __host__ __device__ + bool check_alignment_and_tag( const GPUCACell *cells, unsigned int innerCellId, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, - const float phiCut, const float hardPtCut) { + const float phiCut, const float hardPtCut) + { auto ro = get_outer_r(); auto zo = get_outer_z(); const auto &otherCell = cells[innerCellId]; @@ -187,14 +185,16 @@ __host__ __device__ void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU cons // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. 
- #if defined(__NVCC__) || defined(__CUDACC__) - __device__ inline void find_ntuplets( +#ifdef __CUDACC__ + + __device__ + inline void find_ntuplets( const GPUCACell *cells, GPU::SimpleVector *foundNtuplets, GPU::VecArray &tmpNtuplet, - const unsigned int minHitsPerNtuplet) const { - + const unsigned int minHitsPerNtuplet) const + { // the building process for a track ends if: // it has no right neighbor // it has no compatible neighbor @@ -220,11 +220,10 @@ __host__ __device__ void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU cons } } tmpNtuplet.pop_back(); - assert(tmpNtuplet.size()<3); + assert(tmpNtuplet.size() < 3); } -#endif - +#endif // __CUDACC__ GPU::VecArray< unsigned int, 40> theOuterNeighbors; @@ -244,4 +243,4 @@ __host__ __device__ void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU cons float theOuterR; }; -#endif /*CACELL_H_ */ +#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h From 979dbdb079a9c369fddf9ae63e007025623b3843 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 9 Aug 2018 12:30:51 +0200 Subject: [PATCH 014/102] Move all CUDA code to the plugins/ directory (cms-patatrack#123) Keep RiemannFit.h in the interface, as it is include-only. --- .../PixelTrackFitting/plugins/BuildFile.xml | 3 ++- .../PixelTrackFitting/plugins/PixelTrackProducer.h | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml index be113d7a5a3dc..c549e05d69f55 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml @@ -1,4 +1,5 @@ + - + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h index 0803131715af6..6bc6d2815c8e7 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h @@ -1,10 +1,11 @@ -#ifndef PixelTrackProducer_H -#define PixelTrackProducer_H +#ifndef PixelTrackProducer_h +#define PixelTrackProducer_h #include "FWCore/Framework/interface/stream/EDProducer.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackReconstruction.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackReconstructionGPU.h" + +#include "PixelTrackReconstructionGPU.h" namespace edm { class Event; class EventSetup; class ParameterSet; class ConfigurationDescriptions; } class TrackerTopology; @@ -26,4 +27,5 @@ class PixelTrackProducer : public edm::stream::EDProducer<> { PixelTrackReconstruction theReconstruction; PixelTrackReconstructionGPU theGPUReconstruction; }; -#endif + +#endif // PixelTrackProducer_h From f5e68314d79424a7d72227eb2a1e7cf316cf9ad8 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 10 Aug 2018 10:48:05 +0200 Subject: [PATCH 015/102] Remove GPU-CellularAutomaton dependence on CPU doublets (cms-patatrack#126) --- RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index e1ccc16bf6430..16803a957c928 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -50,6 +50,8 @@ 
doublets = "pixelTracksHitDoublets", SeedComparitorPSet = dict(clusterShapeCacheSrc = 'siPixelClusterShapeCachePreSplitting') ) +from Configuration.ProcessModifiers.gpu_cff import gpu +gpu.toModify(pixelTracksHitQuadruplets, trackingRegions = "pixelTracksTrackingRegions") # for trackingLowPU pixelTracksHitTriplets = _pixelTripletHLTEDProducer.clone( From 4f361d16873211b690fc1e3ecb5f33e798311626 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 17 Aug 2018 13:18:10 +0200 Subject: [PATCH 016/102] Cleanup after merging with CMSSW 10.2.2 (cms-patatrack#134) Clean up unnecessary changes, whitespaces, defines and include directives. --- .../customizePixelTracksForProfiling.py | 2 +- .../PixelTriplets/plugins/RecHitsMap.h | 23 +++++++++---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index dc6524babec9d..4713b64e5e48a 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -7,7 +7,7 @@ def customizePixelTracksForProfiling(process): ), verbosity = cms.untracked.uint32(0), ) - + process.outPath = cms.EndPath(process.out) process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.outPath) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h index 566a15591472c..d27a639a5a9bf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h @@ -1,19 +1,19 @@ -#ifndef RecHitsMap_H -#define RecHitsMap_H - // store T for each cluster... +//FIXME move it to a better place... + +#ifndef RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h +#define RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h + -#include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" #include #include +#include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" - -//FIXME move it to a better place... +// store T for each cluster template class RecHitsMap { -public: - +public: explicit RecHitsMap(T const & d=T()) : dummy(d){} void clear() {m_map.clear();} @@ -26,7 +26,7 @@ class RecHitsMap { auto const & thit = static_cast(hit); auto const & clus = thit.firstClusterRef(); - if (clus.isPixel()) + if (clus.isPixel()) add(clus.pixelCluster(), *thit.detUnit(),v); else add(clus.stripCluster(), *thit.detUnit(),v); @@ -51,7 +51,7 @@ class RecHitsMap { } static uint64_t encode(uint32_t ind, uint16_t mr, uint16_t mc) { - uint64_t u1 = ind; + uint64_t u1 = ind; uint64_t u2 = mr; uint64_t u3 = mc; return (u1<<32) | (u2<<16) | u3; @@ -69,9 +69,8 @@ class RecHitsMap { return (u1<<32) | u2; } - std::unordered_map m_map; T dummy; }; -#endif +#endif // RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h From 65ac243a0ef851838429153c134ebfeb0689f1b0 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 17 Aug 2018 16:16:42 +0200 Subject: [PATCH 017/102] Add optional flags to disable SOA->legacy conversion and GPU->CPU transfer (cms-patatrack#132) Always produce the CPU cluster and rechit collections, since they are needed anyway. Add transfer and conversion flags to clusterizer, rechits and CA. Add a skeleton for the future pixel track producer. 
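A minimal sketch of how the two new flags are meant to interact (the parameter names gpuEnableTransfer and gpuEnableConversion follow this patch; the stub functions, their ordering and the standalone main() below are illustrative assumptions, not the actual producer code):

#include <cstdio>

namespace example {
  // sketch only: the GPU algorithm always runs and leaves its SoA output on the device
  void runGpuAlgorithm()        { std::puts("run the CUDA kernels, SoA product stays on the GPU"); }
  void copyDeviceToHost()       { std::puts("GPU -> CPU transfer of the SoA product"); }
  void convertToLegacyFormats() { std::puts("SoA -> legacy collections, put into the event"); }

  void produce(bool gpuEnableTransfer, bool gpuEnableConversion) {
    runGpuAlgorithm();                  // always done
    if (gpuEnableTransfer) {
      copyDeviceToHost();               // skipped in GPU-only profiling workflows
      if (gpuEnableConversion) {
        convertToLegacyFormats();       // skipped when only the SoA output is needed
      }
    }
  }
}

int main() {
  example::produce(true, true);    // default behaviour: full legacy output
  example::produce(true, false);   // transfer the SoA, skip the legacy conversion
  example::produce(false, false);  // keep everything on the GPU
}

The customize functions introduced below simply toggle these booleans on the clusterizer, rechit and CA modules.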
Add customize functions to disable conversions to legacy formats, and to disable unnecessary GPU->CPU transfers. --- .../customizePixelTracksForProfiling.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 4713b64e5e48a..99a3a9321062b 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -1,6 +1,8 @@ import FWCore.ParameterSet.Config as cms def customizePixelTracksForProfiling(process): + process.MessageLogger.cerr.FwkReport.reportEvery = 100 + process.out = cms.OutputModule("AsciiOutputModule", outputCommands = cms.untracked.vstring( "keep *_pixelTracks_*_*", @@ -13,3 +15,30 @@ def customizePixelTracksForProfiling(process): process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.outPath) return process + +def customizePixelTracksForProfilingDisableConversion(process): + process = customizePixelTracksForProfiling(process) + + # Turn off cluster shape filter so that CA doesn't depend on clusters + process.pixelTracksHitQuadruplets.SeedComparitorPSet = cms.PSet(ComponentName = cms.string("none")) + + # Replace pixel track producer with a dummy one for now + from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromCUDA_cfi import pixelTrackProducerFromCUDA as _pixelTrackProducerFromCUDA + process.pixelTracks = _pixelTrackProducerFromCUDA.clone() + + # Disable conversions to legacy + process.siPixelClustersPreSplitting.gpuEnableConversion = False + process.siPixelRecHitsPreSplitting.gpuEnableConversion = False + process.pixelTracksHitQuadruplets.gpuEnableConversion = False + + return process + +def customizePixelTracksForProfilingDisableTransfer(process): + process = customizePixelTracksForProfilingDisableConversion(process) + + # Disable "unnecessary" transfers to CPU + process.siPixelClustersPreSplitting.gpuEnableTransfer = False + process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False + process.pixelTracksHitQuadruplets.gpuEnableTransfer = False + + return process From 3ef1e8a740d04f2ad31f058a8fef5c05366460f7 Mon Sep 17 00:00:00 2001 From: Felice Date: Thu, 23 Aug 2018 17:17:19 +0200 Subject: [PATCH 018/102] Reformat the Riemann fit code (cms-patatrack#143) Apply clang-format reformatting to RiemannFit.h --- .../PixelTrackFitting/interface/RiemannFit.h | 1373 +++++++++-------- 1 file changed, 702 insertions(+), 671 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 8a1e29f78041b..0ba579033ef83 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -7,13 +7,13 @@ #ifndef RFIT_DEBUG #define RFIT_DEBUG 0 -#endif // RFIT_DEBUG - -namespace Rfit { +#endif // RFIT_DEBUG +namespace Rfit +{ using namespace Eigen; -constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) +constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) constexpr unsigned int max_nop = 4; //!< In order to avoid use of dynamic memory using MatrixNd = Eigen::Matrix; @@ -33,67 +33,72 @@ using RowVector2Nd = Eigen::Matrix; using Matrix5d = Eigen::Matrix; using Matrix6d = Eigen::Matrix; using Vector5d = Eigen::Matrix; -using u_int = unsigned int; 
+using u_int = unsigned int; -struct circle_fit { - Vector3d par; //!< parameter: (X0,Y0,R) - Matrix3d cov; - /*!< covariance matrix: \n +struct circle_fit +{ + Vector3d par; //!< parameter: (X0,Y0,R) + Matrix3d cov; + /*!< covariance matrix: \n |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n |cov(X0, R)|cov(Y0, R)|cov( R, R)| */ - int64_t q; //!< particle charge - double chi2; + int64_t q; //!< particle charge + double chi2; }; -struct line_fit { - Vector2d par; //!<(cotan(theta),Zip) - Matrix2d cov; - /*!< +struct line_fit +{ + Vector2d par; //!<(cotan(theta),Zip) + Matrix2d cov; + /*!< |cov(c_t,c_t)|cov(Zip,c_t)| \n |cov(c_t,Zip)|cov(Zip,Zip)| */ - double chi2; + double chi2; }; -struct helix_fit { - Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) - Matrix5d cov; - /*!< ()->cov() \n +struct helix_fit +{ + Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) + Matrix5d cov; + /*!< ()->cov() \n |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| */ - double chi2_circle; - double chi2_line; - Vector4d fast_fit; - int64_t q; //!< particle charge - // VectorXd time; // TO FIX just for profiling -} __attribute__ ((aligned(16)) ); - - -template -__host__ __device__ -void printIt(C * m, const char * prefix = "") { + double chi2_circle; + double chi2_line; + Vector4d fast_fit; + int64_t q; //!< particle charge + // VectorXd time; // TO FIX just for profiling +} __attribute__((aligned(16))); + +template +__host__ __device__ void printIt(C* m, const char* prefix = "") +{ #if RFIT_DEBUG - for (u_int r = 0; r < m->rows(); ++r) { - for (u_int c = 0; c < m->cols(); ++c) { - printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r,c)); + for (u_int r = 0; r < m->rows(); ++r) + { + for (u_int c = 0; c < m->cols(); ++c) + { + printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r, c)); + } } - } #endif } - /*! \brief raise to square. */ template -__host__ __device__ -inline T sqr(const T a) { return a * a; } +__host__ __device__ inline T sqr(const T a) +{ + return a * a; +} /*! \brief Compute cross product of two 2D vector (assuming z component 0), @@ -105,9 +110,9 @@ inline T sqr(const T a) { return a * a; } \return z component of the cross product. */ -__host__ __device__ -inline double cross2D(const Vector2d& a, const Vector2d& b) { - return a.x() * b.y() - a.y() * b.x(); +__host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) +{ + return a.x() * b.y() - a.y() * b.x(); } /*! @@ -130,30 +135,33 @@ inline double cross2D(const Vector2d& a, const Vector2d& b) { */ // X in input TO FIX -__host__ __device__ -inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, - const Vector4d& fast_fit, - VectorNd const & rad, - double B) { - u_int n = p2D.cols(); - double X = 0.04; - double theta = atan(fast_fit(3)); - double radlen_eff = X * sqrt(fast_fit(3) * fast_fit(3) + 1); - double p_t = fast_fit(2) * B; - double p_2 = p_t * p_t * (1. 
+ 1./(fast_fit(3)*fast_fit(3))); - - MatrixNd scatter_cov_rad = MatrixXd::Zero(n, n); - const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff ; - for (u_int k = 0; k < n; ++k) { - for (u_int l = k; l < n; ++l) { - for (u_int i = 0; i < std::min(k, l); ++i) { - scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(sin(theta)); - scatter_cov_rad(l, k) = scatter_cov_rad(k, l); - } +__host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, + const Vector4d& fast_fit, + VectorNd const& rad, + double B) +{ + u_int n = p2D.cols(); + double X = 0.04; + double theta = atan(fast_fit(3)); + double radlen_eff = X * sqrt(fast_fit(3) * fast_fit(3) + 1); + double p_t = fast_fit(2) * B; + double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); + + MatrixNd scatter_cov_rad = MatrixXd::Zero(n, n); + const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff; + for (u_int k = 0; k < n; ++k) + { + for (u_int l = k; l < n; ++l) + { + for (u_int i = 0; i < std::min(k, l); ++i) + { + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(sin(theta)); + scatter_cov_rad(l, k) = scatter_cov_rad(k, l); + } + } } - } - printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); - return scatter_cov_rad; + printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); + return scatter_cov_rad; } /*! @@ -166,32 +174,34 @@ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, \return cov_cart covariance matrix in Cartesian coordinates. */ -__host__ __device__ -inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, - const MatrixNd& cov_rad, - const VectorNd &rad) { +__host__ __device__ inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, + const MatrixNd& cov_rad, + const VectorNd& rad) +{ #if RFIT_DEBUG printf("Address of p2D: %p\n", &p2D); #endif - printIt(&p2D, "cov_radtocart - p2D:"); - u_int n = p2D.cols(); - Matrix2Nd cov_cart = MatrixXd::Zero(2 * n, 2 * n); - VectorNd rad_inv = rad.cwiseInverse(); - printIt(&rad_inv, "cov_radtocart - rad_inv:"); - for (u_int i = 0; i < n; ++i) { - for (u_int j = i; j < n; ++j) { - cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); - cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); - cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); - cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); - - cov_cart(j, i) = cov_cart(i, j); - cov_cart(j + n, i + n) = cov_cart(i + n, j + n); - cov_cart(j + n, i) = cov_cart(i, j + n); - cov_cart(j, i + n) = cov_cart(i + n, j); + printIt(&p2D, "cov_radtocart - p2D:"); + u_int n = p2D.cols(); + Matrix2Nd cov_cart = MatrixXd::Zero(2 * n, 2 * n); + VectorNd rad_inv = rad.cwiseInverse(); + printIt(&rad_inv, "cov_radtocart - rad_inv:"); + for (u_int i = 0; i < n; ++i) + { + for (u_int j = i; j < n; ++j) + { + cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + + cov_cart(j, i) = cov_cart(i, j); + cov_cart(j + n, i + n) = cov_cart(i + n, j + n); + cov_cart(j + n, i) = cov_cart(i, j + n); + cov_cart(j, i + n) = cov_cart(i + n, j); + } } - } - return cov_cart; + return cov_cart; } /*! 
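One detail worth spelling out about Scatter_cov_rad above (an interpretation of the code as written; the formula below is assumed rather than taken from the patch): the constant .000225 is (0.015 GeV)^2, so the per-point variance it accumulates is a Highland-style multiple-scattering estimate,

\sigma_{\mathrm{MS}}^{2} \simeq \left(\frac{0.015\,\mathrm{GeV}}{p}\right)^{2} \frac{x}{X_{0}} \left[1 + 0.038 \ln\frac{x}{X_{0}}\right]^{2},

where p is the full track momentum (p_2 in the code is p squared, built from p_t and fast_fit(3)) and x/X_0 is the effective material thickness radlen_eff. The double loop then divides this by sin^2(theta) and scales it by the lever arms (rad(k) - rad(i)) * (rad(l) - rad(i)) to fill the covariance between the radial coordinates of different hits.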
@@ -207,24 +217,24 @@ inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, \warning correlation between different point are not computed. */ -__host__ __device__ -inline MatrixNd cov_carttorad(const Matrix2xNd& p2D, - const Matrix2Nd& cov_cart, - const VectorNd& rad) { - u_int n = p2D.cols(); - MatrixNd cov_rad = MatrixXd::Zero(n, n); - const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); - for (u_int i = 0; i < n; ++i) { - //!< in case you have (0,0) to avoid dividing by 0 radius - if (rad(i) < 1.e-4) - cov_rad(i, i) = cov_cart(i, i); - else { - cov_rad(i, i) = - rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - - 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); +__host__ __device__ inline MatrixNd cov_carttorad(const Matrix2xNd& p2D, + const Matrix2Nd& cov_cart, + const VectorNd& rad) +{ + u_int n = p2D.cols(); + MatrixNd cov_rad = MatrixXd::Zero(n, n); + const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); + for (u_int i = 0; i < n; ++i) + { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i, i) = cov_cart(i, i); + else + { + cov_rad(i, i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); + } } - } - return cov_rad; + return cov_rad; } /*! @@ -243,29 +253,29 @@ inline MatrixNd cov_carttorad(const Matrix2xNd& p2D, */ -__host__ __device__ -inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, - const Vector4d& fast_fit, - const VectorNd& rad) { - u_int n = p2D.cols(); - MatrixNd cov_rad = MatrixXd::Zero(n, n); - for (u_int i = 0; i < n; ++i) { - //!< in case you have (0,0) to avoid dividing by 0 radius - if (rad(i) < 1.e-4) - cov_rad(i, i) = cov_cart(i, i); // TO FIX - else { - Vector2d a = p2D.col(i); - Vector2d b = p2D.col(i) - fast_fit.head(2); - const double x2 = a.dot(b); - const double y2 = cross2D(a, b); - const double tan_c = - y2/x2; - const double tan_c2 = sqr(tan_c); - cov_rad(i, i) = - 1. / (1. + tan_c2) * - (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); +__host__ __device__ inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, + const Vector4d& fast_fit, + const VectorNd& rad) +{ + u_int n = p2D.cols(); + MatrixNd cov_rad = MatrixXd::Zero(n, n); + for (u_int i = 0; i < n; ++i) + { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i, i) = cov_cart(i, i); // TO FIX + else + { + Vector2d a = p2D.col(i); + Vector2d b = p2D.col(i) - fast_fit.head(2); + const double x2 = a.dot(b); + const double y2 = cross2D(a, b); + const double tan_c = -y2 / x2; + const double tan_c2 = sqr(tan_c); + cov_rad(i, i) = 1. / (1. + tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); + } } - } - return cov_rad; + return cov_rad; } /*! @@ -282,9 +292,9 @@ inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const Matrix2Nd& cov diagonal cov matrix. Further investigation needed. */ -__host__ __device__ -inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { - return cov_rad_inv.colwise().sum().transpose(); +__host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) +{ + return cov_rad_inv.colwise().sum().transpose(); } /*! @@ -299,9 +309,9 @@ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { \return weight points' weights' vector for the line fit (ODR). 
*/ -__host__ __device__ -inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y_err2, const double& tan_theta) { - return (1. + sqr(tan_theta)) * 1. / (x_err2 + y_err2 * sqr(tan_theta)); +__host__ __device__ inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y_err2, const double& tan_theta) +{ + return (1. + sqr(tan_theta)) * 1. / (x_err2 + y_err2 * sqr(tan_theta)); } /*! @@ -315,13 +325,9 @@ inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y_err2, const \return q int 1 or -1. */ -__host__ __device__ -inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) { - return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > - 0) - ? -1 - : 1; +__host__ __device__ inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) +{ + return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > 0)? -1 : 1; } /*! @@ -334,22 +340,22 @@ inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) { \param error flag for errors computation. */ -__host__ __device__ -inline void par_uvrtopak(circle_fit& circle, const double B, const bool& error) { - Vector3d par_pak; - const double temp0 = circle.par.head(2).squaredNorm(); - const double temp1 = sqrt(temp0); - par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), - circle.q * (temp1 - circle.par(2)), circle.par(2) * B; - if (error) { - const double temp2 = sqr(circle.par(0)) * 1. / temp0; - const double temp3 = 1. / temp1 * circle.q; - Matrix3d J4; - J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., - circle.par(0) * temp3, circle.par(1) * temp3, -circle.q, 0., 0., B; - circle.cov = J4 * circle.cov * J4.transpose(); - } - circle.par = par_pak; +__host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, const bool& error) +{ + Vector3d par_pak; + const double temp0 = circle.par.head(2).squaredNorm(); + const double temp1 = sqrt(temp0); + par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), + circle.q * (temp1 - circle.par(2)), circle.par(2) * B; + if (error) + { + const double temp2 = sqr(circle.par(0)) * 1. / temp0; + const double temp3 = 1. / temp1 * circle.q; + Matrix3d J4; + J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, circle.par(1) * temp3, -circle.q, 0., 0., B; + circle.cov = J4 * circle.cov * J4.transpose(); + } + circle.par = par_pak; } /*! @@ -367,21 +373,23 @@ inline void par_uvrtopak(circle_fit& circle, const double B, const bool& error) \return x_err2 squared errors in the x axis. 
*/ -__host__ __device__ -inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, - const bool& error, u_int n) { - VectorNd x_err2(n); - for (u_int i = 0; i < n; ++i) { - Matrix5d Cov = MatrixXd::Zero(5, 5); - if (error) Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = V(i, i); - Cov(4, 4) = V(i + n, i + n); - Cov(3, 4) = Cov(4, 3) = V(i, i + n); - Eigen::Matrix tmp; - tmp = J.row(i) * Cov * J.row(i).transpose().eval(); - x_err2(i) = tmp(0,0); - } - return x_err2; +__host__ __device__ inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, + const bool& error, u_int n) +{ + VectorNd x_err2(n); + for (u_int i = 0; i < n; ++i) + { + Matrix5d Cov = MatrixXd::Zero(5, 5); + if (error) + Cov.block(0, 0, 3, 3) = circle.cov; + Cov(3, 3) = V(i, i); + Cov(4, 4) = V(i + n, i + n); + Cov(3, 4) = Cov(4, 3) = V(i, i + n); + Eigen::Matrix tmp; + tmp = J.row(i) * Cov * J.row(i).transpose().eval(); + x_err2(i) = tmp(0, 0); + } + return x_err2; } /*! @@ -404,19 +412,19 @@ inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const Matri */ -__host__ __device__ -inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { +__host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) +{ #if RFIT_DEBUG - printf("min_eigen3D - enter\n"); + printf("min_eigen3D - enter\n"); #endif - SelfAdjointEigenSolver solver(3); - solver.computeDirect(A); - int min_index; - chi2 = solver.eigenvalues().minCoeff(&min_index); + SelfAdjointEigenSolver solver(3); + solver.computeDirect(A); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); #if RFIT_DEBUG - printf("min_eigen3D - exit\n"); + printf("min_eigen3D - exit\n"); #endif - return solver.eigenvectors().col(min_index); + return solver.eigenvectors().col(min_index); } /*! @@ -433,13 +441,13 @@ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { speed up in single precision. */ -__host__ __device__ -inline Vector3d min_eigen3D_fast(const Matrix3d& A) { - SelfAdjointEigenSolver solver(3); - solver.computeDirect(A.cast()); - int min_index; - solver.eigenvalues().minCoeff(&min_index); - return solver.eigenvectors().col(min_index).cast(); +__host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) +{ + SelfAdjointEigenSolver solver(3); + solver.computeDirect(A.cast()); + int min_index; + solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index).cast(); } /*! @@ -455,13 +463,13 @@ inline Vector3d min_eigen3D_fast(const Matrix3d& A) { significantly in single precision. */ -__host__ __device__ -inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { - SelfAdjointEigenSolver solver(2); - solver.computeDirect(A); - int min_index; - chi2 = solver.eigenvalues().minCoeff(&min_index); - return solver.eigenvectors().col(min_index); +__host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) +{ + SelfAdjointEigenSolver solver(2); + solver.computeDirect(A); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index); } /*! @@ -481,64 +489,67 @@ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { - computation of error due to multiple scattering. 
*/ -__host__ __device__ -inline Vector4d Fast_fit(const Matrix3xNd& hits) { - Vector4d result; - u_int n = hits.cols(); // get the number of hits - printIt(&hits, "Fast_fit - hits: "); - - // CIRCLE FIT - // Make segments between middle-to-first(b) and last-to-first(c) hits - const Vector2d b = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); - const Vector2d c = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); - printIt(&b, "Fast_fit - b: "); - printIt(&c, "Fast_fit - c: "); - // Compute their lengths - const double b2 = b.squaredNorm(); - const double c2 = c.squaredNorm(); - double X0; - double Y0; - // The algebra has been verified (MR). The usual approach has been followed: - // * use an orthogonal reference frame passing from the first point. - // * build the segments (chords) - // * build orthogonal lines through mid points - // * make a system and solve for X0 and Y0. - // * add the initial point - if (abs(b.x()) > abs(b.y())) { //!< in case b.x is 0 (2 hits with same x) - const double k = c.x() / b.x(); - const double div = 2. * (k * b.y() - c.y()); - // if aligned TO FIX - Y0 = (k * b2 - c2) / div; - X0 = b2 / (2 * b.x()) - b.y() / b.x() * Y0; - } else { - const double k = c.y() / b.y(); - const double div = 2. * (k * b.x() - c.x()); - // if aligned TO FIX - X0 = (k * b2 - c2) / div; - Y0 = b2 / (2 * b.y()) - b.x() / b.y() * X0; - } - - result(0) = X0 + hits(0, 0); - result(1) = Y0 + hits(1, 0); - result(2) = sqrt(sqr(X0) + sqr(Y0)); - printIt(&result, "Fast_fit - result: "); - - // LINE FIT - const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); - const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - printIt(&e, "Fast_fit - e: "); - printIt(&d, "Fast_fit - d: "); - // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) - const double dr = result(2) * atan2(cross2D(d, e), d.dot(e)); - // Simple difference in Z between last and first hit - const double dz = hits(2, n - 1) - hits(2, 0); - - result(3) = (dr / dz); +__host__ __device__ inline Vector4d Fast_fit(const Matrix3xNd& hits) +{ + Vector4d result; + u_int n = hits.cols(); // get the number of hits + printIt(&hits, "Fast_fit - hits: "); + + // CIRCLE FIT + // Make segments between middle-to-first(b) and last-to-first(c) hits + const Vector2d b = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Vector2d c = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); + printIt(&b, "Fast_fit - b: "); + printIt(&c, "Fast_fit - c: "); + // Compute their lengths + const double b2 = b.squaredNorm(); + const double c2 = c.squaredNorm(); + double X0; + double Y0; + // The algebra has been verified (MR). The usual approach has been followed: + // * use an orthogonal reference frame passing from the first point. + // * build the segments (chords) + // * build orthogonal lines through mid points + // * make a system and solve for X0 and Y0. + // * add the initial point + if (abs(b.x()) > abs(b.y())) + { //!< in case b.x is 0 (2 hits with same x) + const double k = c.x() / b.x(); + const double div = 2. * (k * b.y() - c.y()); + // if aligned TO FIX + Y0 = (k * b2 - c2) / div; + X0 = b2 / (2 * b.x()) - b.y() / b.x() * Y0; + } + else + { + const double k = c.y() / b.y(); + const double div = 2. 
* (k * b.x() - c.x()); + // if aligned TO FIX + X0 = (k * b2 - c2) / div; + Y0 = b2 / (2 * b.y()) - b.x() / b.y() * X0; + } + + result(0) = X0 + hits(0, 0); + result(1) = Y0 + hits(1, 0); + result(2) = sqrt(sqr(X0) + sqr(Y0)); + printIt(&result, "Fast_fit - result: "); + + // LINE FIT + const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + printIt(&e, "Fast_fit - e: "); + printIt(&d, "Fast_fit - d: "); + // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) + const double dr = result(2) * atan2(cross2D(d, e), d.dot(e)); + // Simple difference in Z between last and first hit + const double dz = hits(2, n - 1) - hits(2, 0); + + result(3) = (dr / dz); #if RFIT_DEBUG - printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); + printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); #endif - return result; + return result; } /*! @@ -573,331 +584,348 @@ inline Vector4d Fast_fit(const Matrix3xNd& hits) { scattering. */ -__host__ __device__ -inline circle_fit Circle_fit(const Matrix2xNd& hits2D, - const Matrix2Nd & hits_cov2D, - const Vector4d & fast_fit, - const VectorNd & rad, - const double B, - const bool error = true, - const bool scattering = false) +__host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, + const Matrix2Nd& hits_cov2D, + const Vector4d& fast_fit, + const VectorNd& rad, + const double B, + const bool error = true, + const bool scattering = false) { #if RFIT_DEBUG - printf("circle_fit - enter\n"); + printf("circle_fit - enter\n"); #endif - // INITIALIZATION - Matrix2Nd V = hits_cov2D; - u_int n = hits2D.cols(); - printIt(&hits2D, "circle_fit - hits2D:"); - printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); + // INITIALIZATION + Matrix2Nd V = hits_cov2D; + u_int n = hits2D.cols(); + printIt(&hits2D, "circle_fit - hits2D:"); + printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); #if RFIT_DEBUG - printf("circle_fit - WEIGHT COMPUTATION\n"); + printf("circle_fit - WEIGHT COMPUTATION\n"); #endif - // WEIGHT COMPUTATION - VectorNd weight; - MatrixNd G; - double renorm; - { - MatrixNd cov_rad; - cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad); - printIt(&cov_rad, "circle_fit - cov_rad:"); - // cov_rad = cov_carttorad(hits2D, V); - - if (scattering) { - MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); - printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); - printIt(&hits2D, "circle_fit - hits2D bis:"); + // WEIGHT COMPUTATION + VectorNd weight; + MatrixNd G; + double renorm; + { + MatrixNd cov_rad; + cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad); + printIt(&cov_rad, "circle_fit - cov_rad:"); + // cov_rad = cov_carttorad(hits2D, V); + + if (scattering) + { + MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); + printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); + printIt(&hits2D, "circle_fit - hits2D bis:"); #if RFIT_DEBUG - printf("Address of hits2D: a) %p\n", &hits2D); + printf("Address of hits2D: a) %p\n", &hits2D); #endif - V += cov_radtocart(hits2D, scatter_cov_rad, rad); - printIt(&V, "circle_fit - V:"); - cov_rad += scatter_cov_rad; - printIt(&cov_rad, "circle_fit - cov_rad:"); - Matrix4d cov_rad4 = cov_rad; - Matrix4d G4; - G4 = cov_rad4.inverse(); - printIt(&G4, "circle_fit - G4:"); - renorm = G4.sum(); - G4 *= 1. 
/ renorm; - printIt(&G4, "circle_fit - G4:"); - G = G4; - weight = Weight_circle(G); - } else { - weight = cov_rad.diagonal().cwiseInverse(); - renorm = weight.sum(); - weight *= 1. / renorm; + V += cov_radtocart(hits2D, scatter_cov_rad, rad); + printIt(&V, "circle_fit - V:"); + cov_rad += scatter_cov_rad; + printIt(&cov_rad, "circle_fit - cov_rad:"); + Matrix4d cov_rad4 = cov_rad; + Matrix4d G4; + G4 = cov_rad4.inverse(); + printIt(&G4, "circle_fit - G4:"); + renorm = G4.sum(); + G4 *= 1. / renorm; + printIt(&G4, "circle_fit - G4:"); + G = G4; + weight = Weight_circle(G); + } + else + { + weight = cov_rad.diagonal().cwiseInverse(); + renorm = weight.sum(); + weight *= 1. / renorm; + } } - } - printIt(&weight, "circle_fit - weight:"); + printIt(&weight, "circle_fit - weight:"); - // SPACE TRANSFORMATION + // SPACE TRANSFORMATION #if RFIT_DEBUG - printf("circle_fit - SPACE TRANSFORMATION\n"); + printf("circle_fit - SPACE TRANSFORMATION\n"); #endif - // center + // center #if RFIT_DEBUG - printf("Address of hits2D: b) %p\n", &hits2D); + printf("Address of hits2D: b) %p\n", &hits2D); #endif - const Vector2d h_ = hits2D.rowwise().mean(); // centroid - printIt(&h_, "circle_fit - h_:"); - Matrix3xNd p3D(3, n); - p3D.block(0, 0, 2, n) = hits2D.colwise() - h_; - printIt(&p3D, "circle_fit - p3D: a)"); - Vector2Nd mc(2 * n); // centered hits, used in error computation - mc << p3D.row(0).transpose(), p3D.row(1).transpose(); - printIt(&mc, "circle_fit - mc(centered hits):"); - - // scale - const double q = mc.squaredNorm(); - const double s = sqrt(n * 1. / q); // scaling factor - p3D *= s; - - // project on paraboloid - p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); - printIt(&p3D, "circle_fit - p3D: b)"); + const Vector2d h_ = hits2D.rowwise().mean(); // centroid + printIt(&h_, "circle_fit - h_:"); + Matrix3xNd p3D(3, n); + p3D.block(0, 0, 2, n) = hits2D.colwise() - h_; + printIt(&p3D, "circle_fit - p3D: a)"); + Vector2Nd mc(2 * n); // centered hits, used in error computation + mc << p3D.row(0).transpose(), p3D.row(1).transpose(); + printIt(&mc, "circle_fit - mc(centered hits):"); + + // scale + const double q = mc.squaredNorm(); + const double s = sqrt(n * 1. 
/ q); // scaling factor + p3D *= s; + + // project on paraboloid + p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); + printIt(&p3D, "circle_fit - p3D: b)"); #if RFIT_DEBUG - printf("circle_fit - COST FUNCTION\n"); + printf("circle_fit - COST FUNCTION\n"); #endif - // COST FUNCTION - - // compute - Matrix3d A = Matrix3d::Zero(); - const Vector3d r0 = p3D * weight; // center of gravity - const Matrix3xNd X = p3D.colwise() - r0; - if (scattering) - A = X * G * X.transpose(); - else { - for (u_int i = 0; i < n; ++i) A += weight(i) * (X.col(i) * X.col(i).transpose()); - } - printIt(&A, "circle_fit - A:"); + // COST FUNCTION + + // compute + Matrix3d A = Matrix3d::Zero(); + const Vector3d r0 = p3D * weight; // center of gravity + const Matrix3xNd X = p3D.colwise() - r0; + if (scattering) + A = X * G * X.transpose(); + else + { + for (u_int i = 0; i < n; ++i) + A += weight(i) * (X.col(i) * X.col(i).transpose()); + } + printIt(&A, "circle_fit - A:"); #if RFIT_DEBUG - printf("circle_fit - MINIMIZE\n"); + printf("circle_fit - MINIMIZE\n"); #endif - // minimize - double chi2; - Vector3d v = min_eigen3D(A, chi2); + // minimize + double chi2; + Vector3d v = min_eigen3D(A, chi2); #if RFIT_DEBUG - printf("circle_fit - AFTER MIN_EIGEN\n"); + printf("circle_fit - AFTER MIN_EIGEN\n"); #endif - printIt(&v, "v BEFORE INVERSION"); - v *= (v(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 - printIt(&v, "v AFTER INVERSION"); - // This hack to be able to run on GPU where the automatic assignment to a - // double from the vector multiplication is not working. + printIt(&v, "v BEFORE INVERSION"); + v *= (v(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 + printIt(&v, "v AFTER INVERSION"); + // This hack to be able to run on GPU where the automatic assignment to a + // double from the vector multiplication is not working. #if RFIT_DEBUG - printf("circle_fit - AFTER MIN_EIGEN 1\n"); + printf("circle_fit - AFTER MIN_EIGEN 1\n"); #endif - Matrix cm; + Matrix cm; #if RFIT_DEBUG - printf("circle_fit - AFTER MIN_EIGEN 2\n"); + printf("circle_fit - AFTER MIN_EIGEN 2\n"); #endif - cm = -v.transpose() * r0; + cm = -v.transpose() * r0; #if RFIT_DEBUG - printf("circle_fit - AFTER MIN_EIGEN 3\n"); + printf("circle_fit - AFTER MIN_EIGEN 3\n"); #endif - const double c = cm(0,0); -// const double c = -v.transpose() * r0; + const double c = cm(0, 0); + // const double c = -v.transpose() * r0; #if RFIT_DEBUG - printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); + printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); #endif - // COMPUTE CIRCLE PARAMETER - - // auxiliary quantities - const double h = sqrt(1. - sqr(v(2)) - 4. * c * v(2)); - const double v2x2_inv = 1. / (2. * v(2)); - const double s_inv = 1. / s; - Vector3d par_uvr_; // used in error propagation - par_uvr_ << -v(0) * v2x2_inv, -v(1) * v2x2_inv, h * v2x2_inv; - - circle_fit circle; - circle.par << par_uvr_(0) * s_inv + h_(0), par_uvr_(1) * s_inv + h_(1), par_uvr_(2) * s_inv; - circle.q = Charge(hits2D, circle.par); - circle.chi2 = abs(chi2) * renorm * 1. / sqr(2 * v(2) * par_uvr_(2) * s); - printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); - printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); + // COMPUTE CIRCLE PARAMETER + + // auxiliary quantities + const double h = sqrt(1. - sqr(v(2)) - 4. * c * v(2)); + const double v2x2_inv = 1. / (2. * v(2)); + const double s_inv = 1. 
/ s; + Vector3d par_uvr_; // used in error propagation + par_uvr_ << -v(0) * v2x2_inv, -v(1) * v2x2_inv, h * v2x2_inv; + + circle_fit circle; + circle.par << par_uvr_(0) * s_inv + h_(0), par_uvr_(1) * s_inv + h_(1), par_uvr_(2) * s_inv; + circle.q = Charge(hits2D, circle.par); + circle.chi2 = abs(chi2) * renorm * 1. / sqr(2 * v(2) * par_uvr_(2) * s); + printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); + printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); #if RFIT_DEBUG - printf("circle_fit - CIRCLE CHARGE: %ld\n", circle.q); + printf("circle_fit - CIRCLE CHARGE: %ld\n", circle.q); #endif #if RFIT_DEBUG - printf("circle_fit - ERROR PROPAGATION\n"); + printf("circle_fit - ERROR PROPAGATION\n"); #endif - // ERROR PROPAGATION - if (error) { + // ERROR PROPAGATION + if (error) + { #if RFIT_DEBUG - printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); + printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); #endif - ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points + ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points #if RFIT_DEBUG - printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); + printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); #endif - { - Matrix cm; - Matrix cm2; - cm = mc.transpose() * V * mc; -// cm2 = mc * mc.transpose(); - const double c = cm(0,0); -// const double c2 = cm2(0,0); - const Matrix2Nd Vcs = sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * - (2. * V.squaredNorm() + 4. * c) * // mc.transpose() * V * mc) * - mc * mc.transpose(); - printIt(&Vcs, "circle_fit - Vcs:"); - Vcs_[0][0] = Vcs.block(0, 0, n, n); - Vcs_[0][1] = Vcs.block(0, n, n, n); - Vcs_[1][1] = Vcs.block(n, n, n, n); - Vcs_[1][0] = Vcs_[0][1].transpose(); - printIt(&Vcs, "circle_fit - Vcs:"); - } - - MatrixNd C[3][3]; // cov matrix of 3D transformed points - { - const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); - const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); - const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); - const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); - const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); - const ArrayNd t10 = t01.transpose(); - C[0][0] = Vcs_[0][0]; - C[0][1] = Vcs_[0][1]; - C[0][2] = 2. * (Vcs_[0][0] * t0 + Vcs_[0][1] * t1); - C[1][1] = Vcs_[1][1]; - C[1][2] = 2. * (Vcs_[1][0] * t0 + Vcs_[1][1] * t1); - C[2][2] = 2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + - Vcs_[1][1] * Vcs_[1][1]) + - 4. 
* (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11); - } - printIt(&C[0][0], "circle_fit - C[0][0]:"); - - Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) - for (u_int i = 0; i < 3; ++i) { - for (u_int j = i; j < 3; ++j) { - Matrix tmp; - tmp = weight.transpose() * C[i][j] * weight; - const double c = tmp(0,0); - C0(i, j) = c; //weight.transpose() * C[i][j] * weight; - C0(j, i) = C0(i, j); - } - } - printIt(&C0, "circle_fit - C0:"); - - const MatrixNd W = weight * weight.transpose(); - const MatrixNd H = MatrixXd::Identity(n, n).rowwise() - weight.transpose(); - const MatrixNx3d s_v = H * p3D.transpose(); - printIt(&W, "circle_fit - W:"); - printIt(&H, "circle_fit - H:"); - printIt(&s_v, "circle_fit - s_v:"); - - MatrixNd D_[3][3]; // cov(s_v) - { - D_[0][0] = (H * C[0][0] * H.transpose()).cwiseProduct(W); - D_[0][1] = (H * C[0][1] * H.transpose()).cwiseProduct(W); - D_[0][2] = (H * C[0][2] * H.transpose()).cwiseProduct(W); - D_[1][1] = (H * C[1][1] * H.transpose()).cwiseProduct(W); - D_[1][2] = (H * C[1][2] * H.transpose()).cwiseProduct(W); - D_[2][2] = (H * C[2][2] * H.transpose()).cwiseProduct(W); - D_[1][0] = D_[0][1].transpose(); - D_[2][0] = D_[0][2].transpose(); - D_[2][1] = D_[1][2].transpose(); - } - printIt(&D_[0][0], "circle_fit - D_[0][0]:"); - - constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; - - Matrix6d E; // cov matrix of the 6 independent elements of A - for (u_int a = 0; a < 6; ++a) { - const u_int i = nu[a][0], j = nu[a][1]; - for (u_int b = a; b < 6; ++b) { - const u_int k = nu[b][0], l = nu[b][1]; - VectorNd t0(n); - VectorNd t1(n); - if (l == k) { - t0 = 2. * D_[j][l] * s_v.col(l); - if (i == j) - t1 = t0; - else - t1 = 2. * D_[i][l] * s_v.col(l); - } else { - t0 = D_[j][l] * s_v.col(k) + D_[j][k] * s_v.col(l); - if (i == j) - t1 = t0; - else - t1 = D_[i][l] * s_v.col(k) + D_[i][k] * s_v.col(l); + { + Matrix cm; + Matrix cm2; + cm = mc.transpose() * V * mc; + // cm2 = mc * mc.transpose(); + const double c = cm(0, 0); + // const double c2 = cm2(0,0); + const Matrix2Nd Vcs = sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * + (2. * V.squaredNorm() + 4. * c) * // mc.transpose() * V * mc) * + mc * mc.transpose(); + printIt(&Vcs, "circle_fit - Vcs:"); + Vcs_[0][0] = Vcs.block(0, 0, n, n); + Vcs_[0][1] = Vcs.block(0, n, n, n); + Vcs_[1][1] = Vcs.block(n, n, n, n); + Vcs_[1][0] = Vcs_[0][1].transpose(); + printIt(&Vcs, "circle_fit - Vcs:"); } - if (i == j) { - Matrix cm; - cm = s_v.col(i).transpose() * (t0 + t1); - const double c = cm(0,0); - E(a, b) = 0. + c; - } else { - Matrix cm; - cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); - const double c = cm(0,0); - E(a, b) = 0. + c;//(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + MatrixNd C[3][3]; // cov matrix of 3D transformed points + { + const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); + const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); + const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); + const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); + const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); + const ArrayNd t10 = t01.transpose(); + C[0][0] = Vcs_[0][0]; + C[0][1] = Vcs_[0][1]; + C[0][2] = 2. * (Vcs_[0][0] * t0 + Vcs_[0][1] * t1); + C[1][1] = Vcs_[1][1]; + C[1][2] = 2. * (Vcs_[1][0] * t0 + Vcs_[1][1] * t1); + C[2][2] = 2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + + Vcs_[1][1] * Vcs_[1][1]) + + 4. 
* (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11); } - if (b != a) E(b, a) = E(a, b); - } - } - printIt(&E, "circle_fit - E:"); - - Matrix J2; // Jacobian of min_eigen() (numerically computed) - for (u_int a = 0; a < 6; ++a) { - const u_int i = nu[a][0], j = nu[a][1]; - Matrix3d Delta = Matrix3d::Zero(); - Delta(i, j) = Delta(j, i) = abs(A(i, j) * d); - J2.col(a) = min_eigen3D_fast(A + Delta); - const int sign = (J2.col(a)(2) > 0) ? 1 : -1; - J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); - } - printIt(&J2, "circle_fit - J2:"); - - Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) - { - Matrix3d t0 = J2 * E * J2.transpose(); - Vector3d t1 = -t0 * r0; - Cvc.block(0, 0, 3, 3) = t0; - Cvc.block(0, 3, 3, 1) = t1; - Cvc.block(3, 0, 1, 3) = t1.transpose(); - Matrix cm1; -// Matrix cm2; - Matrix cm3; - cm1 = (v.transpose() * C0 * v); -// cm2 = (C0.cwiseProduct(t0)).sum(); - cm3 = (r0.transpose() * t0 * r0); - const double c = cm1(0,0) + (C0.cwiseProduct(t0)).sum() + cm3(0,0); - Cvc(3, 3) = c; - // (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); - } - printIt(&Cvc, "circle_fit - Cvc:"); + printIt(&C[0][0], "circle_fit - C[0][0]:"); + + Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) + for (u_int i = 0; i < 3; ++i) + { + for (u_int j = i; j < 3; ++j) + { + Matrix tmp; + tmp = weight.transpose() * C[i][j] * weight; + const double c = tmp(0, 0); + C0(i, j) = c; //weight.transpose() * C[i][j] * weight; + C0(j, i) = C0(i, j); + } + } + printIt(&C0, "circle_fit - C0:"); + + const MatrixNd W = weight * weight.transpose(); + const MatrixNd H = MatrixXd::Identity(n, n).rowwise() - weight.transpose(); + const MatrixNx3d s_v = H * p3D.transpose(); + printIt(&W, "circle_fit - W:"); + printIt(&H, "circle_fit - H:"); + printIt(&s_v, "circle_fit - s_v:"); + + MatrixNd D_[3][3]; // cov(s_v) + { + D_[0][0] = (H * C[0][0] * H.transpose()).cwiseProduct(W); + D_[0][1] = (H * C[0][1] * H.transpose()).cwiseProduct(W); + D_[0][2] = (H * C[0][2] * H.transpose()).cwiseProduct(W); + D_[1][1] = (H * C[1][1] * H.transpose()).cwiseProduct(W); + D_[1][2] = (H * C[1][2] * H.transpose()).cwiseProduct(W); + D_[2][2] = (H * C[2][2] * H.transpose()).cwiseProduct(W); + D_[1][0] = D_[0][1].transpose(); + D_[2][0] = D_[0][2].transpose(); + D_[2][1] = D_[1][2].transpose(); + } + printIt(&D_[0][0], "circle_fit - D_[0][0]:"); + + constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; + + Matrix6d E; // cov matrix of the 6 independent elements of A + for (u_int a = 0; a < 6; ++a) + { + const u_int i = nu[a][0], j = nu[a][1]; + for (u_int b = a; b < 6; ++b) + { + const u_int k = nu[b][0], l = nu[b][1]; + VectorNd t0(n); + VectorNd t1(n); + if (l == k) + { + t0 = 2. * D_[j][l] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = 2. * D_[i][l] * s_v.col(l); + } + else + { + t0 = D_[j][l] * s_v.col(k) + D_[j][k] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = D_[i][l] * s_v.col(k) + D_[i][k] * s_v.col(l); + } + + if (i == j) + { + Matrix cm; + cm = s_v.col(i).transpose() * (t0 + t1); + const double c = cm(0, 0); + E(a, b) = 0. + c; + } + else + { + Matrix cm; + cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + const double c = cm(0, 0); + E(a, b) = 0. 
+ c; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + } + if (b != a) + E(b, a) = E(a, b); + } + } + printIt(&E, "circle_fit - E:"); + + Matrix J2; // Jacobian of min_eigen() (numerically computed) + for (u_int a = 0; a < 6; ++a) + { + const u_int i = nu[a][0], j = nu[a][1]; + Matrix3d Delta = Matrix3d::Zero(); + Delta(i, j) = Delta(j, i) = abs(A(i, j) * d); + J2.col(a) = min_eigen3D_fast(A + Delta); + const int sign = (J2.col(a)(2) > 0) ? 1 : -1; + J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); + } + printIt(&J2, "circle_fit - J2:"); + + Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) + { + Matrix3d t0 = J2 * E * J2.transpose(); + Vector3d t1 = -t0 * r0; + Cvc.block(0, 0, 3, 3) = t0; + Cvc.block(0, 3, 3, 1) = t1; + Cvc.block(3, 0, 1, 3) = t1.transpose(); + Matrix cm1; + // Matrix cm2; + Matrix cm3; + cm1 = (v.transpose() * C0 * v); + // cm2 = (C0.cwiseProduct(t0)).sum(); + cm3 = (r0.transpose() * t0 * r0); + const double c = cm1(0, 0) + (C0.cwiseProduct(t0)).sum() + cm3(0, 0); + Cvc(3, 3) = c; + // (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + } + printIt(&Cvc, "circle_fit - Cvc:"); - Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) - { - const double t = 1. / h; - J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, - 0, 0, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; - } - printIt(&J3, "circle_fit - J3:"); + Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + { + const double t = 1. / h; + J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, + 0, 0, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; + } + printIt(&J3, "circle_fit - J3:"); - const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) - printIt(&Jq, "circle_fit - Jq:"); + const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) + printIt(&Jq, "circle_fit - Jq:"); - Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) - + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); + Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) + + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); - circle.cov = cov_uvr; - } + circle.cov = cov_uvr; + } - printIt(&circle.cov, "Circle cov:"); + printIt(&circle.cov, "Circle cov:"); #if RFIT_DEBUG - printf("circle_fit - exit\n"); + printf("circle_fit - exit\n"); #endif - return circle; + return circle; } /*! @@ -935,142 +963,143 @@ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, errors. */ -__host__ __device__ -inline line_fit Line_fit(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const circle_fit& circle, - const Vector4d& fast_fit, - const bool error = true) { - u_int n = hits.cols(); - // PROJECTION ON THE CILINDER - Matrix2xNd p2D(2, n); - MatrixNx5d Jx(n, 5); - - printIt(&hits, "Line_fit points: "); - printIt(&hits_cov, "Line_fit covs: "); - - // x & associated Jacobian - // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf - // Slide 11 - // a ==> -o i.e. the origin of the circle in XY plane, negative - // b ==> p i.e. distances of the points wrt the origin of the circle. 
- const Vector2d o(circle.par(0), circle.par(1)); - for (u_int i = 0; i < n; ++i) { // x - Vector2d p = hits.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); - // atan2(cross, dot) give back the angle in the transverse plane so tha the final equation reads: - // x_i = -q*R*theta (theta = angle returned by atan2) - const double atan2_ = -circle.q * atan2(cross, dot); - p2D(0, i) = atan2_ * circle.par(2); - - // associated Jacobian, used in weights and errors computation - const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); - double d_X0 = 0, d_Y0 = 0, d_R = 0.; // good approximation for big pt and eta - if (error) { - d_X0 = - temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); - d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); - d_R = atan2_; +__host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const circle_fit& circle, + const Vector4d& fast_fit, + const bool error = true) +{ + u_int n = hits.cols(); + // PROJECTION ON THE CILINDER + Matrix2xNd p2D(2, n); + MatrixNx5d Jx(n, 5); + + printIt(&hits, "Line_fit points: "); + printIt(&hits_cov, "Line_fit covs: "); + + // x & associated Jacobian + // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf + // Slide 11 + // a ==> -o i.e. the origin of the circle in XY plane, negative + // b ==> p i.e. distances of the points wrt the origin of the circle. + const Vector2d o(circle.par(0), circle.par(1)); + for (u_int i = 0; i < n; ++i) + { // x + Vector2d p = hits.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + // atan2(cross, dot) give back the angle in the transverse plane so tha the final equation reads: + // x_i = -q*R*theta (theta = angle returned by atan2) + const double atan2_ = -circle.q * atan2(cross, dot); + p2D(0, i) = atan2_ * circle.par(2); + + // associated Jacobian, used in weights and errors computation + const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); + double d_X0 = 0, d_Y0 = 0, d_R = 0.; // good approximation for big pt and eta + if (error) + { + d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); + d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); + d_R = atan2_; + } + const double d_x = temp0 * (o(1) * dot + o(0) * cross); + const double d_y = temp0 * (-o(0) * dot + o(1) * cross); + Jx.row(i) << d_X0, d_Y0, d_R, d_x, d_y; } - const double d_x = temp0 * (o(1) * dot + o(0) * cross); - const double d_y = temp0 * (-o(0) * dot + o(1) * cross); - Jx.row(i) << d_X0, d_Y0, d_R, d_x, d_y; - } - // Math of d_{X0,Y0,R,x,y} all verified by hand - - // y - p2D.row(1) = hits.row(2); - - - // WEIGHT COMPUTATION - VectorNd x_err2 = X_err2(hits_cov, circle, Jx, error, n); - VectorNd y_err2 = hits_cov.block(2 * n, 2 * n, n, n).diagonal(); - - const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); - const VectorNd weight = err2_inv * 1. / err2_inv.sum(); - - printIt(&x_err2, "Line_fit - x_err2: "); - printIt(&y_err2, "Line_fit - y_err2: "); - printIt(&err2_inv, "Line_fit - err2_inv: "); - printIt(&weight, "Line_fit - weight: "); - - - // COST FUNCTION - - // compute - // r0 represents the weighted mean of "x" and "y". 
- const Vector2d r0 = p2D * weight; - // This is the X vector that will be used to build the - // scatter matrix S = X^T * X - const Matrix2xNd X = p2D.colwise() - r0; - Matrix2d A = Matrix2d::Zero(); - for (u_int i = 0; i < n; ++i) { - A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); - } - - printIt(&A, "Line_fit - A: "); - - // minimize - double chi2; - Vector2d v = min_eigen2D(A, chi2); - printIt(&v, "Line_fit - v: "); - - // n *= (chi2>0) ? 1 : -1; //TO FIX - // This hack to be able to run on GPU where the automatic assignment to a - // double from the vector multiplication is not working. - Matrix cm; - cm = -v.transpose() * r0; - const double c = cm(0,0); - - // COMPUTE LINE PARAMETER - line_fit line; - line.par << -v(0) / v(1), // cotan(theta)) - -c * sqrt(sqr(v(0)) + sqr(v(1))) * 1. / v(1); // Zip - line.chi2 = abs(chi2); - printIt(&(line.par), "Line_fit - line.par: "); - - // ERROR PROPAGATION - if (error) { - const double v0_2 = sqr(v(0)); - const double v1_2 = sqr(v(1)); - - Matrix3d C; // cov(v,c) + // Math of d_{X0,Y0,R,x,y} all verified by hand + + // y + p2D.row(1) = hits.row(2); + + // WEIGHT COMPUTATION + VectorNd x_err2 = X_err2(hits_cov, circle, Jx, error, n); + VectorNd y_err2 = hits_cov.block(2 * n, 2 * n, n, n).diagonal(); + + const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); + const VectorNd weight = err2_inv * 1. / err2_inv.sum(); + + printIt(&x_err2, "Line_fit - x_err2: "); + printIt(&y_err2, "Line_fit - y_err2: "); + printIt(&err2_inv, "Line_fit - err2_inv: "); + printIt(&weight, "Line_fit - weight: "); + + // COST FUNCTION + + // compute + // r0 represents the weighted mean of "x" and "y". + const Vector2d r0 = p2D * weight; + // This is the X vector that will be used to build the + // scatter matrix S = X^T * X + const Matrix2xNd X = p2D.colwise() - r0; + Matrix2d A = Matrix2d::Zero(); + for (u_int i = 0; i < n; ++i) { - double norm_chernov = 0.; - for (u_int i = 0; i < n; ++i) - norm_chernov += err2_inv(i) * (v(0) * p2D(0, i) + v(1) * p2D(1, i) + c) - * (v(0) * p2D(0, i) + v(1) * p2D(1, i) + c); - norm_chernov /= float(n); - // Indeed it should read: - // * compute the average error in the orthogonal direction: err2_inv.cwiseInverse().sum()/sqr(n) - // * normalize the A(0,0)+A(1,1) dividing by err2_inv.sum(), since those have been weighted - const double norm = (err2_inv.cwiseInverse().sum())*err2_inv.sum()*1./sqr(n); - const double sig2 = 1./(A(0,0) + A(1,1))*norm; -// const double sig2 = 1. / (A(0, 0) + A(1, 1)); - C(0, 0) = sig2 * v1_2; - C(1, 1) = sig2 * v0_2; - C(0, 1) = C(1, 0) = -sig2 * v(0) * v(1); - const VectorNd weight_2 = (weight).array().square(); - const Vector2d C0(weight_2.dot(x_err2), weight_2.dot(y_err2)); - C.block(0, 2, 2, 1) = C.block(2, 0, 1, 2).transpose() = -C.block(0, 0, 2, 2) * r0; - Matrix tmp = (r0.transpose() * C.block(0, 0, 2, 2) * r0); - C(2, 2) = v0_2 * C0(0) + v1_2 * C0(1) + C0(0) * C(0, 0) + C0(1) * C(1, 1) + tmp(0,0); + A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); } - Matrix J; // Jacobian of (v,c) -> (cotan(theta)),Zip) + printIt(&A, "Line_fit - A: "); + + // minimize + double chi2; + Vector2d v = min_eigen2D(A, chi2); + printIt(&v, "Line_fit - v: "); + + // n *= (chi2>0) ? 1 : -1; //TO FIX + // This hack to be able to run on GPU where the automatic assignment to a + // double from the vector multiplication is not working. 
+ Matrix cm; + cm = -v.transpose() * r0; + const double c = cm(0, 0); + + // COMPUTE LINE PARAMETER + line_fit line; + line.par << -v(0) / v(1), // cotan(theta)) + -c * sqrt(sqr(v(0)) + sqr(v(1))) * 1. / v(1); // Zip + line.chi2 = abs(chi2); + printIt(&(line.par), "Line_fit - line.par: "); + + // ERROR PROPAGATION + if (error) { - const double t0 = 1. / v(1); - const double t1 = sqr(t0); - const double sqrt_ = sqrt(v1_2 + v0_2); - const double t2 = 1. / sqrt_; - J << -t0, v(0) * t1, 0, -c * v(0) * t0 * t2, v0_2 * c * t1 * t2, -sqrt_ * t0; + const double v0_2 = sqr(v(0)); + const double v1_2 = sqr(v(1)); + + Matrix3d C; // cov(v,c) + { + double norm_chernov = 0.; + for (u_int i = 0; i < n; ++i) + norm_chernov += err2_inv(i) * (v(0) * p2D(0, i) + v(1) * p2D(1, i) + c) * (v(0) * p2D(0, i) + v(1) * p2D(1, i) + c); + norm_chernov /= float(n); + // Indeed it should read: + // * compute the average error in the orthogonal direction: err2_inv.cwiseInverse().sum()/sqr(n) + // * normalize the A(0,0)+A(1,1) dividing by err2_inv.sum(), since those have been weighted + const double norm = (err2_inv.cwiseInverse().sum()) * err2_inv.sum() * 1. / sqr(n); + const double sig2 = 1. / (A(0, 0) + A(1, 1)) * norm; + // const double sig2 = 1. / (A(0, 0) + A(1, 1)); + C(0, 0) = sig2 * v1_2; + C(1, 1) = sig2 * v0_2; + C(0, 1) = C(1, 0) = -sig2 * v(0) * v(1); + const VectorNd weight_2 = (weight).array().square(); + const Vector2d C0(weight_2.dot(x_err2), weight_2.dot(y_err2)); + C.block(0, 2, 2, 1) = C.block(2, 0, 1, 2).transpose() = -C.block(0, 0, 2, 2) * r0; + Matrix tmp = (r0.transpose() * C.block(0, 0, 2, 2) * r0); + C(2, 2) = v0_2 * C0(0) + v1_2 * C0(1) + C0(0) * C(0, 0) + C0(1) * C(1, 1) + tmp(0, 0); + } + + Matrix J; // Jacobian of (v,c) -> (cotan(theta)),Zip) + { + const double t0 = 1. / v(1); + const double t1 = sqr(t0); + const double sqrt_ = sqrt(v1_2 + v0_2); + const double t2 = 1. / sqrt_; + J << -t0, v(0) * t1, 0, -c * v(0) * t0 * t2, v0_2 * c * t1 * t2, -sqrt_ * t0; + } + Matrix JT = J.transpose().eval(); + line.cov = J * C * JT; } - Matrix JT = J.transpose().eval(); - line.cov = J * C * JT; - } - printIt(&line.cov, "Line cov:"); - return line; + printIt(&line.cov, "Line cov:"); + return line; } /*! @@ -1113,34 +1142,36 @@ inline line_fit Line_fit(const Matrix3xNd& hits, */ inline helix_fit Helix_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const double B, - const bool& error = true, const bool& scattering = false) { - u_int n = hits.cols(); - VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + const bool& error = true, const bool& scattering = false) +{ + u_int n = hits.cols(); + VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); - // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. - const Vector4d fast_fit = Fast_fit(hits); + // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. 
+ const Vector4d fast_fit = Fast_fit(hits); - circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), hits_cov.block(0, 0, 2 * n, 2 * n), - fast_fit, rad, B, error, scattering); + circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), hits_cov.block(0, 0, 2 * n, 2 * n), + fast_fit, rad, B, error, scattering); - const line_fit line = Line_fit(hits, hits_cov, circle, fast_fit, error); + const line_fit line = Line_fit(hits, hits_cov, circle, fast_fit, error); - par_uvrtopak(circle, B, error); + par_uvrtopak(circle, B, error); - helix_fit helix; - helix.par << circle.par, line.par; - if (error) { - helix.cov = MatrixXd::Zero(5, 5); - helix.cov.block(0, 0, 3, 3) = circle.cov; - helix.cov.block(3, 3, 2, 2) = line.cov; - } - helix.q = circle.q; - helix.chi2_circle = circle.chi2; - helix.chi2_line = line.chi2; + helix_fit helix; + helix.par << circle.par, line.par; + if (error) + { + helix.cov = MatrixXd::Zero(5, 5); + helix.cov.block(0, 0, 3, 3) = circle.cov; + helix.cov.block(3, 3, 2, 2) = line.cov; + } + helix.q = circle.q; + helix.chi2_circle = circle.chi2; + helix.chi2_line = line.chi2; - return helix; + return helix; } } // namespace Rfit -#endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h +#endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h From 14a9169735e2e0f38f2e0362c11d225284962f30 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 30 Aug 2018 12:23:01 +0200 Subject: [PATCH 019/102] Clean up and bugfixes for the Riemann fit (cms-patatrack#148) Fix for uninitialised variables. Always assume multiple scattering treatment and remove unused parameters. Remove test that has diverged from the actual implementation. --- .../PixelTrackFitting/interface/RiemannFit.h | 88 +++--- .../PixelTrackFitting/test/BuildFile.xml | 5 - .../test/PixelTrackRiemannFit.cc | 2 +- .../PixelTrackFitting/test/testEigenGPU.cu | 269 ------------------ 4 files changed, 39 insertions(+), 325 deletions(-) delete mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 0ba579033ef83..0eed4d7a12faf 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -45,7 +45,7 @@ struct circle_fit |cov(X0, R)|cov(Y0, R)|cov( R, R)| */ int64_t q; //!< particle charge - double chi2; + double chi2 = 0.0; }; struct line_fit @@ -56,7 +56,7 @@ struct line_fit |cov(c_t,c_t)|cov(Zip,c_t)| \n |cov(c_t,Zip)|cov(Zip,Zip)| */ - double chi2; + double chi2 = 0.0; }; struct helix_fit @@ -70,8 +70,8 @@ struct helix_fit |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| */ - double chi2_circle; - double chi2_line; + double chi2_circle = 0.0; + double chi2_line = 0.0; Vector4d fast_fit; int64_t q; //!< particle charge // VectorXd time; // TO FIX just for profiling @@ -160,7 +160,8 @@ __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, } } } - printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); + Rfit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); + return scatter_cov_rad; } @@ -340,7 +341,7 @@ __host__ __device__ inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& \param error flag for errors computation. 
*/ -__host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, const bool& error) +__host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, const bool error) { Vector3d par_pak; const double temp0 = circle.par.head(2).squaredNorm(); @@ -374,7 +375,7 @@ __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, */ __host__ __device__ inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, - const bool& error, u_int n) + const bool error, u_int n) { VectorNd x_err2(n); for (u_int i = 0; i < n; ++i) @@ -539,7 +540,7 @@ __host__ __device__ inline Vector4d Fast_fit(const Matrix3xNd& hits) const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); printIt(&e, "Fast_fit - e: "); printIt(&d, "Fast_fit - d: "); - // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) + // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) const double dr = result(2) * atan2(cross2D(d, e), d.dot(e)); // Simple difference in Z between last and first hit const double dz = hits(2, n - 1) - hits(2, 0); @@ -589,8 +590,7 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, const Vector4d& fast_fit, const VectorNd& rad, const double B, - const bool error = true, - const bool scattering = false) + const bool error = true) { #if RFIT_DEBUG printf("circle_fit - enter\n"); @@ -614,34 +614,25 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, printIt(&cov_rad, "circle_fit - cov_rad:"); // cov_rad = cov_carttorad(hits2D, V); - if (scattering) - { - MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); - printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); - printIt(&hits2D, "circle_fit - hits2D bis:"); + MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); + printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); + printIt(&hits2D, "circle_fit - hits2D bis:"); #if RFIT_DEBUG - printf("Address of hits2D: a) %p\n", &hits2D); + printf("Address of hits2D: a) %p\n", &hits2D); #endif - V += cov_radtocart(hits2D, scatter_cov_rad, rad); - printIt(&V, "circle_fit - V:"); - cov_rad += scatter_cov_rad; - printIt(&cov_rad, "circle_fit - cov_rad:"); - Matrix4d cov_rad4 = cov_rad; - Matrix4d G4; - G4 = cov_rad4.inverse(); - printIt(&G4, "circle_fit - G4:"); - renorm = G4.sum(); - G4 *= 1. / renorm; - printIt(&G4, "circle_fit - G4:"); - G = G4; - weight = Weight_circle(G); - } - else - { - weight = cov_rad.diagonal().cwiseInverse(); - renorm = weight.sum(); - weight *= 1. / renorm; - } + V += cov_radtocart(hits2D, scatter_cov_rad, rad); + printIt(&V, "circle_fit - V:"); + cov_rad += scatter_cov_rad; + printIt(&cov_rad, "circle_fit - cov_rad:"); + Matrix4d cov_rad4 = cov_rad; + Matrix4d G4; + G4 = cov_rad4.inverse(); + printIt(&G4, "circle_fit - G4:"); + renorm = G4.sum(); + G4 *= 1. 
/ renorm; + printIt(&G4, "circle_fit - G4:"); + G = G4; + weight = Weight_circle(G); } printIt(&weight, "circle_fit - weight:"); @@ -681,13 +672,7 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, Matrix3d A = Matrix3d::Zero(); const Vector3d r0 = p3D * weight; // center of gravity const Matrix3xNd X = p3D.colwise() - r0; - if (scattering) - A = X * G * X.transpose(); - else - { - for (u_int i = 0; i < n; ++i) - A += weight(i) * (X.col(i) * X.col(i).transpose()); - } + A = X * G * X.transpose(); printIt(&A, "circle_fit - A:"); #if RFIT_DEBUG @@ -967,6 +952,7 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const circle_fit& circle, const Vector4d& fast_fit, + const double B, const bool error = true) { u_int n = hits.cols(); @@ -983,13 +969,15 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, // a ==> -o i.e. the origin of the circle in XY plane, negative // b ==> p i.e. distances of the points wrt the origin of the circle. const Vector2d o(circle.par(0), circle.par(1)); + + // associated Jacobian, used in weights and errors computation for (u_int i = 0; i < n; ++i) { // x Vector2d p = hits.block(0, i, 2, 1) - o; const double cross = cross2D(-o, p); const double dot = (-o).dot(p); - // atan2(cross, dot) give back the angle in the transverse plane so tha the final equation reads: - // x_i = -q*R*theta (theta = angle returned by atan2) + // atan2(cross, dot) give back the angle in the transverse plane so tha the + // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) const double atan2_ = -circle.q * atan2(cross, dot); p2D(0, i) = atan2_ * circle.par(2); @@ -1142,7 +1130,7 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, */ inline helix_fit Helix_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const double B, - const bool& error = true, const bool& scattering = false) + const bool error = true) { u_int n = hits.cols(); VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); @@ -1150,10 +1138,10 @@ inline helix_fit Helix_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, co // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. 
const Vector4d fast_fit = Fast_fit(hits); - circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), hits_cov.block(0, 0, 2 * n, 2 * n), - fast_fit, rad, B, error, scattering); - - const line_fit line = Line_fit(hits, hits_cov, circle, fast_fit, error); + circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), + hits_cov.block(0, 0, 2 * n, 2 * n), + fast_fit, rad, B, error); + line_fit line = Line_fit(hits, hits_cov, circle, fast_fit, B, error); par_uvrtopak(circle, B, error); diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index d6beb57b862b8..21e227ea3e7e7 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -11,11 +11,6 @@ - - - - - diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc index 9f60b2f431e96..b27ed52473388 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc @@ -214,7 +214,7 @@ void test_helix_fit() { // gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; // gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; // gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; - helix[i] = Rfit::Helix_fit(gen.hits, gen.hits_cov, B_field, return_err, false); + helix[i] = Rfit::Helix_fit(gen.hits, gen.hits_cov, B_field, return_err); if (debug) cout << std::setprecision(10) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu deleted file mode 100644 index e5cd889321f6e..0000000000000 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ /dev/null @@ -1,269 +0,0 @@ -#include - -#include -#include - -#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -#include "test_common.h" - -using namespace Eigen; - -__global__ -void kernelFullFit(Rfit::Matrix3xNd * hits, - Rfit::Matrix3Nd * hits_cov, - double B, - bool errors, - bool scattering, - Rfit::circle_fit * circle_fit_resultsGPU, - Rfit::line_fit * line_fit_resultsGPU) { - Vector4d fast_fit = Rfit::Fast_fit(*hits); - - u_int n = hits->cols(); - Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); - - Rfit::Matrix2xNd hits2D_local = (hits->block(0,0,2,n)).eval(); - Rfit::Matrix2Nd hits_cov2D_local = (hits_cov->block(0, 0, 2 * n, 2 * n)).eval(); - Rfit::printIt(&hits2D_local, "kernelFullFit - hits2D_local: "); - Rfit::printIt(&hits_cov2D_local, "kernelFullFit - hits_cov2D_local: "); - /* - printf("kernelFullFit - hits address: %p\n", hits); - printf("kernelFullFit - hits_cov address: %p\n", hits_cov); - printf("kernelFullFit - hits_cov2D address: %p\n", &hits2D_local); - printf("kernelFullFit - hits_cov2D_local address: %p\n", &hits_cov2D_local); - */ - /* At some point I gave up and locally construct block on the stack, so that - the next invocation to Rfit::Circle_fit works properly. Failing to do so - implied basically an empty collection of hits and covariances. That could - have been partially fixed if values of the passed in matrices would have - been printed on screen since that, maybe, triggered internally the real - creations of the blocks. To be understood and compared against the myriad - of compilation warnings we have. 
- */ - (*circle_fit_resultsGPU) = - Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), - fast_fit, rad, B, errors, scattering); - /* - (*circle_fit_resultsGPU) = - Rfit::Circle_fit(hits2D_local, hits_cov2D_local, - fast_fit, rad, B, errors, scattering); - */ - (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, errors); - - return; -} - -__global__ -void kernelFastFit(Rfit::Matrix3xNd * hits, Vector4d * results) { - (*results) = Rfit::Fast_fit(*hits); -} - -__global__ -void kernelCircleFit(Rfit::Matrix3xNd * hits, - Rfit::Matrix3Nd * hits_cov, Vector4d * fast_fit_input, double B, - Rfit::circle_fit * circle_fit_resultsGPU) { - u_int n = hits->cols(); - Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); - -#if TEST_DEBUG - printf("fast_fit_input(0): %f\n", (*fast_fit_input)(0)); - printf("fast_fit_input(1): %f\n", (*fast_fit_input)(1)); - printf("fast_fit_input(2): %f\n", (*fast_fit_input)(2)); - printf("fast_fit_input(3): %f\n", (*fast_fit_input)(3)); - printf("rad(0,0): %f\n", rad(0,0)); - printf("rad(1,1): %f\n", rad(1,1)); - printf("rad(2,2): %f\n", rad(2,2)); - printf("hits_cov(0,0): %f\n", (*hits_cov)(0,0)); - printf("hits_cov(1,1): %f\n", (*hits_cov)(1,1)); - printf("hits_cov(2,2): %f\n", (*hits_cov)(2,2)); - printf("hits_cov(11,11): %f\n", (*hits_cov)(11,11)); - printf("B: %f\n", B); -#endif - (*circle_fit_resultsGPU) = - Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), - *fast_fit_input, rad, B, false, false); -} - -__global__ -void kernelLineFit(Rfit::Matrix3xNd * hits, - Rfit::Matrix3Nd * hits_cov, - Rfit::circle_fit * circle_fit, - Vector4d * fast_fit, - Rfit::line_fit * line_fit) -{ - (*line_fit) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit, *fast_fit, true); -} - -void fillHitsAndHitsCov(Rfit::Matrix3xNd & hits, Rfit::Matrix3Nd & hits_cov) { - hits << 1.98645, 4.72598, 7.65632, 11.3151, - 2.18002, 4.88864, 7.75845, 11.3134, - 2.46338, 6.99838, 11.808, 17.793; - hits_cov(0,0) = 7.14652e-06; - hits_cov(1,1) = 2.15789e-06; - hits_cov(2,2) = 1.63328e-06; - hits_cov(3,3) = 6.27919e-06; - hits_cov(4,4) = 6.10348e-06; - hits_cov(5,5) = 2.08211e-06; - hits_cov(6,6) = 1.61672e-06; - hits_cov(7,7) = 6.28081e-06; - hits_cov(8,8) = 5.184e-05; - hits_cov(9,9) = 1.444e-05; - hits_cov(10,10) = 6.25e-06; - hits_cov(11,11) = 3.136e-05; - hits_cov(0,4) = hits_cov(4,0) = -5.60077e-06; - hits_cov(1,5) = hits_cov(5,1) = -1.11936e-06; - hits_cov(2,6) = hits_cov(6,2) = -6.24945e-07; - hits_cov(3,7) = hits_cov(7,3) = -5.28e-06; -} - -void testFit() { - constexpr double B = 0.0113921; - Rfit::Matrix3xNd hits(3,4); - Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); - Rfit::Matrix3xNd * hitsGPU = new Rfit::Matrix3xNd(3,4); - Rfit::Matrix3Nd * hits_covGPU = nullptr; - Vector4d * fast_fit_resultsGPU = new Vector4d(); - Vector4d * fast_fit_resultsGPUret = new Vector4d(); - Rfit::circle_fit * circle_fit_resultsGPU = new Rfit::circle_fit(); - Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); - - fillHitsAndHitsCov(hits, hits_cov); - - // FAST_FIT_CPU - Vector4d fast_fit_results = Rfit::Fast_fit(hits); -#if TEST_DEBUG - std::cout << "Generated hits:\n" << hits << std::endl; -#endif - std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; - - // FAST_FIT GPU - cudaMalloc((void**)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4))); - cudaMalloc((void**)&fast_fit_resultsGPU, sizeof(Vector4d)); - cudaMemcpy(hitsGPU, &hits, 
sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice); - - kernelFastFit<<<1, 1>>>(hitsGPU, fast_fit_resultsGPU); - cudaDeviceSynchronize(); - - cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, sizeof(Vector4d), cudaMemcpyDeviceToHost); - std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << *fast_fit_resultsGPUret << std::endl; - assert(isEqualFuzzy(fast_fit_results, (*fast_fit_resultsGPUret))); - - // CIRCLE_FIT CPU - u_int n = hits.cols(); - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); - - Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), - hits_cov.block(0, 0, 2 * n, 2 * n), - fast_fit_results, rad, B, false, false); - std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; - - // CIRCLE_FIT GPU - cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12))); - cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit)); - cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice); - - kernelCircleFit<<<1,1>>>(hitsGPU, hits_covGPU, - fast_fit_resultsGPU, B, circle_fit_resultsGPU); - cudaDeviceSynchronize(); - - cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, - sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); - std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; - assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); - - // LINE_FIT CPU - Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, fast_fit_results, true); - std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; - - // LINE_FIT GPU - Rfit::line_fit * line_fit_resultsGPU = nullptr; - Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); - - cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit)); - - kernelLineFit<<<1,1>>>(hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); - cudaDeviceSynchronize(); - - cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); - std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; - assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); -} - -void testFitOneGo(bool errors, bool scattering, double epsilon=1e-6) { - constexpr double B = 0.0113921; - Rfit::Matrix3xNd hits(3,4); - Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); - - fillHitsAndHitsCov(hits, hits_cov); - - // FAST_FIT_CPU - Vector4d fast_fit_results = Rfit::Fast_fit(hits); - // CIRCLE_FIT CPU - u_int n = hits.cols(); - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); - - Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), - hits_cov.block(0, 0, 2 * n, 2 * n), - fast_fit_results, rad, B, errors, scattering); - // LINE_FIT CPU - Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, - fast_fit_results, errors); - - // FIT GPU - std::cout << "GPU FIT" << std::endl; - Rfit::Matrix3xNd * hitsGPU = nullptr; // new Rfit::Matrix3xNd(3,4); - Rfit::Matrix3Nd * hits_covGPU = nullptr; - Rfit::line_fit * line_fit_resultsGPU = nullptr; - Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); - Rfit::circle_fit * circle_fit_resultsGPU = nullptr; // new Rfit::circle_fit(); - Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); - - cudaCheck(cudaMalloc((void **)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4)))); - 
cudaCheck(cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12)))); - cudaCheck(cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit))); - cudaCheck(cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit))); - cudaCheck(cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice)); - - kernelFullFit<<<1, 1>>>(hitsGPU, hits_covGPU, B, errors, scattering, - circle_fit_resultsGPU, line_fit_resultsGPU); - cudaCheck(cudaDeviceSynchronize()); - - cudaCheck(cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost)); - - std::cout << "Fitted values (CircleFit) CPU:\n" << circle_fit_results.par << std::endl; - std::cout << "Fitted values (LineFit): CPU\n" << line_fit_results.par << std::endl; - std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; - std::cout << "Fitted values (LineFit): GPU\n" << line_fit_resultsGPUret->par << std::endl; - assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par, epsilon)); - assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, epsilon)); - - cudaCheck(cudaFree(hitsGPU)); - cudaCheck(cudaFree(hits_covGPU)); - cudaCheck(cudaFree(line_fit_resultsGPU)); - cudaCheck(cudaFree(circle_fit_resultsGPU)); - delete line_fit_resultsGPUret; - delete circle_fit_resultsGPUret; - - cudaDeviceReset(); -} - -int main (int argc, char * argv[]) { -// testFit(); - std::cout << "TEST FIT, NO ERRORS, NO SCATTERING" << std::endl; - testFitOneGo(false, false); - - // The default 1e-6 is failing.... 
- std::cout << "TEST FIT, ERRORS, NO SCATTER" << std::endl; - testFitOneGo(true, false, 1e-5); - - std::cout << "TEST FIT, NO ERRORS, SCATTER" << std::endl; - testFitOneGo(false, true); - - std::cout << "TEST FIT, ERRORS AND SCATTER" << std::endl; - testFitOneGo(true, true, 1e-5); - - return 0; -} From c82fc4d8d73e9169e5bafc51189ae9efa350e832 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 5 Sep 2018 10:16:34 +0200 Subject: [PATCH 020/102] Reduce CA memory need (cms-patatrack#159) --- RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 6290c47b9e1ef..b17ed42cfb390 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -23,7 +23,7 @@ namespace gpuPixelDoublets { void doubletsFromHisto(uint8_t const * layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, int16_t const * iphi, Hist const * hist, uint32_t const * offsets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, - GPU::VecArray< unsigned int, 2048> * isOuterHitOfCell, + GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) { auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; @@ -125,7 +125,7 @@ namespace gpuPixelDoublets { } __global__ void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, - GPU::VecArray< unsigned int, 2048> *isOuterHitOfCell) { + GPU::VecArray< unsigned int, 256> *isOuterHitOfCell) { uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3 // ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 From 23966339d8694c6bff31150cb1442cf23789f19c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 12 Sep 2018 09:30:00 +0200 Subject: [PATCH 021/102] Add MTV instance for pixel tracks from PV (cms-patatrack#156) Add separate plots for tracks associated to the primary vertex. 
--- .../python/PostProcessorTracker_cfi.py | 117 +++------------- .../RecoTrack/python/TrackValidation_cff.py | 34 ++++- Validation/RecoTrack/python/plotting/html.py | 23 +--- .../python/plotting/trackingPlots.py | 127 +++++++----------- 4 files changed, 104 insertions(+), 197 deletions(-) diff --git a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py index cec8d3cd3cff9..a926b19d4321a 100644 --- a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py +++ b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py @@ -1,30 +1,11 @@ import FWCore.ParameterSet.Config as cms from DQMServices.Core.DQMEDHarvester import DQMEDHarvester -from Configuration.Eras.Modifier_fastSim_cff import fastSim - -def _addNoFlow(module): - _noflowSeen = set() - for eff in module.efficiency.value(): - tmp = eff.split(" ") - if "cut" in tmp[0]: - continue - ind = -1 - if tmp[ind] == "fake" or tmp[ind] == "simpleratio": - ind = -2 - if not tmp[ind] in _noflowSeen: - module.noFlowDists.append(tmp[ind]) - if not tmp[ind-1] in _noflowSeen: - module.noFlowDists.append(tmp[ind-1]) - -_defaultSubdirs = ["Tracking/Track/*", "Tracking/TrackTPPtLess09/*", "Tracking/TrackFromPV/*", "Tracking/TrackFromPVAllTP/*", "Tracking/TrackAllTPEffic/*", "Tracking/TrackBuilding/*","Tracking/TrackConversion/*", "Tracking/TrackGsf/*"] -_defaultSubdirsSummary = [e.replace("/*","") for e in _defaultSubdirs] postProcessorTrack = DQMEDHarvester("DQMGenericClient", - subDirs = cms.untracked.vstring(_defaultSubdirs), + subDirs = cms.untracked.vstring("Tracking/Track/*", "Tracking/TrackTPPtLess09/*", "Tracking/TrackFromPV/*", "Tracking/TrackFromPVAllTP/*", "Tracking/TrackAllTPEffic/*", "Tracking/TrackBuilding/*", "Tracking/TrackConversion/*", "Tracking/TrackGsf/*", "Tracking/TrackBHadron/*"), efficiency = cms.vstring( "effic 'Efficiency vs #eta' num_assoc(simToReco)_eta num_simul_eta", "efficPt 'Efficiency vs p_{T}' num_assoc(simToReco)_pT num_simul_pT", -# "efficPtvseta 'Efficiency in p_{T}-#eta plane' num_assoc(simToReco)_pTvseta num_simul_pTvseta", "effic_vs_hit 'Efficiency vs hit' num_assoc(simToReco)_hit num_simul_hit", "effic_vs_layer 'Efficiency vs layer' num_assoc(simToReco)_layer num_simul_layer", "effic_vs_pixellayer 'Efficiency vs pixel layer' num_assoc(simToReco)_pixellayer num_simul_pixellayer", @@ -39,7 +20,6 @@ def _addNoFlow(module): "effic_vs_dzpv_zoomed 'Efficiency vs Dz(PV)' num_assoc(simToReco)_dzpv_zoomed num_simul_dzpv_zoomed", "duplicatesRate 'Duplicates Rate vs #eta' num_duplicate_eta num_reco_eta", "duplicatesRate_Pt 'Duplicates Rate vs p_{T}' num_duplicate_pT num_reco_pT", -# "duplicatesRate_Ptvseta 'Duplicates Rate in (p_{T}-#eta) plane' num_duplicate_pTvseta num_reco_pTvseta", "duplicatesRate_hit 'Duplicates Rate vs hit' num_duplicate_hit num_reco_hit", "duplicatesRate_layer 'Duplicates Rate vs layer' num_duplicate_layer num_reco_layer", "duplicatesRate_pixellayer 'Duplicates Rate vs pixel layer' num_duplicate_pixellayer num_reco_pixellayer", @@ -55,11 +35,10 @@ def _addNoFlow(module): "duplicatesRate_vertpos 'Duplicates Rate vs vertpos' num_duplicate_vertpos num_reco_vertpos", "duplicatesRate_zpos 'Duplicates Rate vs zpos' num_duplicate_zpos num_reco_zpos", "duplicatesRate_dr 'Duplicates Rate vs dr' num_duplicate_dr num_reco_dr", - "duplicatesRate_drj 'Duplicates Rate vs dr (track,jet)' num_duplicate_drj num_reco_drj", "duplicatesRate_chi2 'Duplicates Rate vs normalized #chi^{2}' num_duplicate_chi2 num_reco_chi2", "duplicatesRate_seedingLayerSet 
'Duplicates rate vs. seedingLayerSet' num_duplicate_seedingLayerSet num_reco_seedingLayerSet", "chargeMisIdRate 'Charge MisID Rate vs #eta' num_chargemisid_eta num_reco_eta", -# "chargeMisIdRate_Ptvseta 'Charge MisID Rate in (p_{T}-#eta) plane' num_chargemisid_pTvseta num_reco_pTvseta", + "chargeMisIdRate_Pt 'Charge MisID Rate vs p_{T}' num_chargemisid_pT num_reco_pT", "chargeMisIdRate_hit 'Charge MisID Rate vs hit' num_chargemisid_hit num_reco_hit", "chargeMisIdRate_layer 'Charge MisID Rate vs layer' num_chargemisid_hit num_reco_layer", "chargeMisIdRate_pixellayer 'Charge MisID Rate vs pixel layer' num_chargemisid_hit num_reco_pixellayer", @@ -76,7 +55,6 @@ def _addNoFlow(module): "effic_vs_vertpos 'Efficiency vs vertpos' num_assoc(simToReco)_vertpos num_simul_vertpos", "effic_vs_zpos 'Efficiency vs zpos' num_assoc(simToReco)_zpos num_simul_zpos", "effic_vs_dr 'Efficiency vs dr' num_assoc(simToReco)_dr num_simul_dr", - "effic_vs_drj 'Efficiency vs dr (track,jet)' num_assoc(simToReco)_drj num_simul_drj", "effic_vertcount_barrel 'efficiency in barrel vs N of pileup vertices' num_assoc(simToReco)_vertcount_barrel num_simul_vertcount_barrel", "effic_vertcount_fwdpos 'efficiency in endcap(+) vs N of pileup vertices' num_assoc(simToReco)_vertcount_fwdpos num_simul_vertcount_fwdpos", "effic_vertcount_fwdneg 'efficiency in endcap(-) vs N of pileup vertices' num_assoc(simToReco)_vertcount_fwdneg num_simul_vertcount_fwdneg", @@ -85,7 +63,6 @@ def _addNoFlow(module): "effic_vertz_fwdneg 'efficiency in endcap(-) vs z of primary interaction vertex' num_assoc(simToReco)_vertz_fwdneg num_simul_vertz_fwdneg", "pileuprate 'Pileup Rate vs #eta' num_pileup_eta num_reco_eta", "pileuprate_Pt 'Pileup rate vs p_{T}' num_pileup_pT num_reco_pT", -# "pileuprate_Ptvseta 'Pileup rate in (p_{T}-#eta) plane' num_pileup_pTvseta num_reco_pTvseta", "pileuprate_hit 'Pileup rate vs hit' num_pileup_hit num_reco_hit", "pileuprate_layer 'Pileup rate vs layer' num_pileup_layer num_reco_layer", "pileuprate_pixellayer 'Pileup rate vs layer' num_pileup_pixellayer num_reco_pixellayer", @@ -101,12 +78,10 @@ def _addNoFlow(module): "pileuprate_vertpos 'Pileup rate vs vertpos' num_pileup_vertpos num_reco_vertpos", "pileuprate_zpos 'Pileup rate vs zpos' num_pileup_zpos num_reco_zpos", "pileuprate_dr 'Pileup rate vs dr' num_pileup_dr num_reco_dr", - "pileuprate_drj 'Pileup rate vs dr (track,jet)' num_pileup_drj num_reco_drj", "pileuprate_chi2 'Pileup rate vs normalized #chi^{2}' num_pileup_chi2 num_reco_chi2", "pileuprate_seedingLayerSet 'Pileup rate vs. 
seedingLayerSet' num_pileup_seedingLayerSet num_reco_seedingLayerSet", "fakerate 'Fake rate vs #eta' num_assoc(recoToSim)_eta num_reco_eta fake", "fakeratePt 'Fake rate vs p_{T}' num_assoc(recoToSim)_pT num_reco_pT fake", -# "fakeratePtvseta 'Fake rate in (p_{T}-#eta) plane' num_assoc(recoToSim)_pTvseta num_reco_pTvseta fake", "fakerate_vs_hit 'Fake rate vs hit' num_assoc(recoToSim)_hit num_reco_hit fake", "fakerate_vs_layer 'Fake rate vs layer' num_assoc(recoToSim)_layer num_reco_layer fake", "fakerate_vs_pixellayer 'Fake rate vs layer' num_assoc(recoToSim)_pixellayer num_reco_pixellayer fake", @@ -122,7 +97,6 @@ def _addNoFlow(module): "fakerate_vs_vertpos 'Fake rate vs vertpos' num_assoc(recoToSim)_vertpos num_reco_vertpos fake", "fakerate_vs_zpos 'Fake rate vs vertpos' num_assoc(recoToSim)_zpos num_reco_zpos fake", "fakerate_vs_dr 'Fake rate vs dr' num_assoc(recoToSim)_dr num_reco_dr fake", - "fakerate_vs_drj 'Fake rate vs dr (track,jet)' num_assoc(recoToSim)_drj num_reco_drj fake", "fakerate_vs_chi2 'Fake rate vs normalized #chi^{2}' num_assoc(recoToSim)_chi2 num_reco_chi2 fake", "fakerate_vs_seedingLayerSet 'Fake rate vs. seedingLayerSet' num_assoc(recoToSim)_seedingLayerSet num_reco_seedingLayerSet fake", "fakerate_vertcount_barrel 'fake rate in barrel vs N of pileup vertices' num_assoc(recoToSim)_vertcount_barrel num_reco_vertcount_barrel fake", @@ -177,30 +151,23 @@ def _addNoFlow(module): "cotThetares_vs_eta '#sigma(cot(#theta)) vs #eta' cotThetares_vs_eta", "cotThetares_vs_pt '#sigma(cot(#theta)) vs p_{T}' cotThetares_vs_pt", "h_dxypulleta 'd_{xy} Pull vs #eta' dxypull_vs_eta", - "h_dxypullpt 'd_{xy} Pull vs p_{T}' dxypull_vs_pt", "dxyres_vs_eta '#sigma(d_{xy}) vs #eta' dxyres_vs_eta", - "dxyres_vs_phi '#sigma(d_{xy}) vs #phi' dxyres_vs_phi", "dxyres_vs_pt '#sigma(d_{xy}) vs p_{T}' dxyres_vs_pt", "h_dzpulleta 'd_{z} Pull vs #eta' dzpull_vs_eta", - "h_dzpullpt 'd_{z} Pull vs p_{T}' dzpull_vs_pt", "dzres_vs_eta '#sigma(d_{z}) vs #eta' dzres_vs_eta", - "dzres_vs_phi '#sigma(d_{z}) vs #phi' dzres_vs_phi", "dzres_vs_pt '#sigma(d_{z}) vs p_{T}' dzres_vs_pt", "etares_vs_eta '#sigma(#eta) vs #eta' etares_vs_eta", "h_phipulleta '#phi Pull vs #eta' phipull_vs_eta", - "h_phipullpt '#phi Pull vs p_{T}' phipull_vs_pt", "h_phipullphi '#phi Pull vs #phi' phipull_vs_phi", "phires_vs_eta '#sigma(#phi) vs #eta' phires_vs_eta", "phires_vs_phi '#sigma(#phi) vs #phi' phires_vs_phi", "phires_vs_pt '#sigma(#phi) vs p_{T}' phires_vs_pt", "h_ptpulleta 'p_{T} Pull vs #eta' ptpull_vs_eta", - "h_ptpullpt 'p_{T} Pull vs p_{T}' ptpull_vs_pt", "h_ptpullphi 'p_{T} Pull vs #phi' ptpull_vs_phi", "ptres_vs_eta '#sigma(p_{T}) vs #eta' ptres_vs_eta", "ptres_vs_phi '#sigma(p_{T}) vs #phi' ptres_vs_phi", "ptres_vs_pt '#sigma(p_{T}) vs p_{T}' ptres_vs_pt", "h_thetapulleta '#theta Pull vs #eta' thetapull_vs_eta", - "h_thetapullpt '#theta Pull vs p_{T}' thetapull_vs_pt", "h_thetapullphi '#theta Pull vs #phi' thetapull_vs_phi" ), cumulativeDists = cms.untracked.vstring( @@ -252,24 +219,21 @@ def _addNoFlow(module): noFlowDists = cms.untracked.vstring(), outputFileName = cms.untracked.string("") ) +def _addNoFlow(module): + _noflowSeen = set() + for eff in module.efficiency.value(): + tmp = eff.split(" ") + if "cut" in tmp[0]: + continue + ind = -1 + if tmp[ind] == "fake" or tmp[ind] == "simpleratio": + ind = -2 + if not tmp[ind] in _noflowSeen: + module.noFlowDists.append(tmp[ind]) + if not tmp[ind-1] in _noflowSeen: + module.noFlowDists.append(tmp[ind-1]) _addNoFlow(postProcessorTrack) -postProcessorTrack2D = 
DQMEDHarvester("DQMGenericClient", - makeGlobalEffienciesPlot = cms.untracked.bool(False), - subDirs = cms.untracked.vstring(_defaultSubdirs), - efficiency = cms.vstring( - "efficPtvseta 'Efficiency in p_{T}-#eta plane' num_assoc(simToReco)_pTvseta num_simul_pTvseta", - "duplicatesRate_Ptvseta 'Duplicates Rate in (p_{T}-#eta) plane' num_duplicate_pTvseta num_reco_pTvseta", - "chargeMisIdRate_Ptvseta 'Charge MisID Rate in (p_{T}-#eta) plane' num_chargemisid_pTvseta num_reco_pTvseta", - "pileuprate_Ptvseta 'Pileup rate in (p_{T}-#eta) plane' num_pileup_pTvseta num_reco_pTvseta", - "fakeratePtvseta 'Fake rate in (p_{T}-#eta) plane' num_assoc(recoToSim)_pTvseta num_reco_pTvseta fake", - ), - resolution = cms.vstring(), - noFlowDists = cms.untracked.vstring(), - outputFileName = cms.untracked.string("") -) -_addNoFlow(postProcessorTrack2D) - # nrec/nsim makes sense only for # - all tracks vs. all in-time TrackingParticles # - PV tracks vs. signal TrackingParticles @@ -278,27 +242,15 @@ def _addNoFlow(module): efficiency = cms.vstring( "nrecPerNsim 'Tracks/TrackingParticles vs #eta' num_reco2_eta num_simul_eta simpleratio", "nrecPerNsimPt 'Tracks/TrackingParticles vs p_{T}' num_reco2_pT num_simul_pT simpleratio", -# "nrecPerNsimPtvseta 'Tracks/TrackingParticles in (p_{T}-#eta) plane' num_reco2_pTvseta num_simul_pTvseta simpleratio", "nrecPerNsim_vs_pu 'Tracks/TrackingParticles vs pu' num_reco2_pu num_simul_pu simpleratio", ), resolution = cms.vstring(), noFlowDists = cms.untracked.vstring(), ) _addNoFlow(postProcessorTrackNrecVsNsim) -postProcessorTrackNrecVsNsim2D = DQMEDHarvester("DQMGenericClient", - makeGlobalEffienciesPlot = cms.untracked.bool(False), - subDirs = cms.untracked.vstring("Tracking/TrackFromPV/*", "Tracking/TrackAllTPEffic/*"), - efficiency = cms.vstring( - "nrecPerNsimPtvseta 'Tracks/TrackingParticles in (p_{T}-#eta) plane' num_reco2_pTvseta num_simul_pTvseta simpleratio", - ), - resolution = cms.vstring(), - noFlowDists = cms.untracked.vstring(), -) -_addNoFlow(postProcessorTrackNrecVsNsim2D) - postProcessorTrackSummary = DQMEDHarvester("DQMGenericClient", - subDirs = cms.untracked.vstring(_defaultSubdirsSummary), + subDirs = cms.untracked.vstring("Tracking/Track", "Tracking/TrackTPPtLess09", "Tracking/TrackFromPV", "Tracking/TrackFromPVAllTP", "Tracking/TrackAllTPEffic", "Tracking/TrackBuilding", "Tracking/TrackConversion", "Tracking/TrackGsf", "Tracking/TrackBHadron"), efficiency = cms.vstring( "effic_vs_coll 'Efficiency vs track collection' num_assoc(simToReco)_coll num_simul_coll", "effic_vs_coll_allPt 'Efficiency vs track collection' num_assoc(simToReco)_coll_allPt num_simul_coll_allPt", @@ -317,46 +269,13 @@ def _addNoFlow(module): postProcessorTrackSummary ) -fastSim.toModify(postProcessorTrack, subDirs = [e for e in _defaultSubdirs if e not in ["Tracking/TrackGsf/*","Tracking/TrackConversion/*"]]) -fastSim.toModify(postProcessorTrackSummary, subDirs = [e for e in _defaultSubdirsSummary if e not in ["Tracking/TrackGsf","Tracking/TrackConversion"]]) - -####### -# Define a standalone seuquence to support the Standalone harvesting mode -# see https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideMultiTrackValidator#cmsDriver_MTV_alone_i_e_standalone for more information -######## - -postProcessorTrackStandalone = postProcessorTrack.clone( - subDirs = _defaultSubdirs+["Tracking/TrackBHadron/*"] -) -postProcessorTrackSummaryStandalone = postProcessorTrackSummary.clone( - subDirs = _defaultSubdirs+["Tracking/TrackBHadron"] -) - -postProcessorTrackSequenceStandalone = 
cms.Sequence( - postProcessorTrackStandalone+ - postProcessorTrackNrecVsNsim+ - postProcessorTrackSummaryStandalone -) - -postProcessorTrackPhase2 = postProcessorTrack.clone() -postProcessorTrackPhase2.subDirs.extend(["Tracking/TrackTPEtaGreater2p7/*"]) -postProcessorTrackSummaryPhase2 = postProcessorTrackSummary.clone() -postProcessorTrackSummaryPhase2.subDirs.extend(["Tracking/TrackTPEtaGreater2p7/*"]) - -from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker -phase2_tracker.toReplaceWith(postProcessorTrack,postProcessorTrackPhase2) -phase2_tracker.toReplaceWith(postProcessorTrackSummary,postProcessorTrackSummaryPhase2) - postProcessorTrackTrackingOnly = postProcessorTrack.clone() -postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackBHadron/*","Tracking/TrackSeeding/*", "Tracking/PixelTrack/*"]) +postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackSeeding/*", "Tracking/PixelTrack/*", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*"]) postProcessorTrackSummaryTrackingOnly = postProcessorTrackSummary.clone() -postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackBHadron","Tracking/TrackSeeding", "Tracking/PixelTrack"]) +postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackSeeding", "Tracking/PixelTrack", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*"]) postProcessorTrackSequenceTrackingOnly = cms.Sequence( postProcessorTrackTrackingOnly+ postProcessorTrackNrecVsNsim+ postProcessorTrackSummaryTrackingOnly ) - -fastSim.toModify(postProcessorTrackTrackingOnly,subDirs = [e for e in _defaultSubdirs if e not in ["Tracking/TrackGsf/*","Tracking/TrackConversion/*","Tracking/TrackBHadron/*"]]) -fastSim.toModify(postProcessorTrackSummaryTrackingOnly,subDirs = [e for e in _defaultSubdirsSummary if e not in ["Tracking/TrackGsf","Tracking/TrackConversion","Tracking/TrackBHadron"]]) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index c8fb7653f0e43..a90d22e8518cf 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -733,6 +733,12 @@ def _uniqueFirstLayers(layerList): trackAssociation = "trackingParticlePixelTrackAsssociation" ) +pixelTracksFromPV = generalTracksFromPV.clone( + src = "pixelTracks", + vertexTag = "pixelVertices", + quality = "undefQuality", +) + trackValidatorPixelTrackingOnly = trackValidator.clone( dirName = "Tracking/PixelTrack/", label = ["pixelTracks"], @@ -743,6 +749,28 @@ def _uniqueFirstLayers(layerList): vertexAssociator = "PixelVertexAssociatorByPositionAndTracks", dodEdxPlots = False, ) +trackValidatorFromPVPixelTrackingOnly = trackValidatorPixelTrackingOnly.clone( + dirName = "Tracking/PixelTrackFromPV/", + label = ["pixelTracksFromPV"], + label_tp_effic = "trackingParticlesSignal", + label_tp_fake = "trackingParticlesSignal", + label_tp_effic_refvector = True, + label_tp_fake_refvector = True, + trackCollectionForDrCalculation = "pixelTracksFromPV", + doPlotsOnlyForTruePV = True, + doPVAssociationPlots = False, + doResolutionPlotsForLabels = ["disabled"], +) +trackValidatorFromPVAllTPPixelTrackingOnly = trackValidatorFromPVPixelTrackingOnly.clone( + dirName = "Tracking/PixelTrackFromPVAllTP/", + label_tp_effic = trackValidatorPixelTrackingOnly.label_tp_effic.value(), + label_tp_fake = trackValidatorPixelTrackingOnly.label_tp_fake.value(), + label_tp_effic_refvector = False, + label_tp_fake_refvector = False, + doSimPlots = False, + 
doSimTrackPlots = False, +) + tracksValidationTruthPixelTrackingOnly = tracksValidationTruth.copy() tracksValidationTruthPixelTrackingOnly.replace(tpClusterProducer, tpClusterProducerPixelTrackingOnly) @@ -756,7 +784,11 @@ def _uniqueFirstLayers(layerList): tracksValidationPixelTrackingOnly = cms.Sequence( tracksValidationTruthPixelTrackingOnly + - trackValidatorPixelTrackingOnly + cms.ignore(trackingParticlesSignal) + + pixelTracksFromPV + + trackValidatorPixelTrackingOnly + + trackValidatorFromPVPixelTrackingOnly + + trackValidatorFromPVAllTPPixelTrackingOnly ) diff --git a/Validation/RecoTrack/python/plotting/html.py b/Validation/RecoTrack/python/plotting/html.py index a76601813fc3f..d3f593f6a7586 100644 --- a/Validation/RecoTrack/python/plotting/html.py +++ b/Validation/RecoTrack/python/plotting/html.py @@ -48,7 +48,6 @@ def _lowerFirst(s): _fromPVName = "Tracks from PV" _fromPVAllTPName = "Tracks from PV (all TPs)" _tpPtLess09Name = "All tracks (TP pT < 0.9 GeV)" -_tpEtaGreater2p7Name = "All tracks (TP |eta| > 2.7)" _conversionName = "Tracks for conversions" _gsfName = "Electron GSF tracks" _bhadronName = "All tracks (B-hadron TPs)" @@ -64,6 +63,8 @@ def _allToBTV(s): return s.replace("All", "BTV-like") def _ptCut(s): return s.replace("Tracks", "Tracks pT > 0.9 GeV").replace("tracks", "tracks pT > 0.9 GeV") +def _toPixel(s): + return s.replace("Tracks", "Pixel tracks") _trackQualityNameOrder = collections.OrderedDict([ ("seeding_seeds", "Seeds"), ("seeding_seedsa", "Seeds A"), @@ -86,8 +87,6 @@ def _ptCut(s): ("tpPtLess09_highPurityByOriginalAlgo", _toOriAlgo(_allToHP(_tpPtLess09Name))), ("tpPtLess09_ByAlgoMask", _toAlgoMask(_tpPtLess09Name)), ("tpPtLess09_highPurityByAlgoMask", _toAlgoMask(_allToHP(_tpPtLess09Name))), - ("tpEtaGreater2p7_", _tpEtaGreater2p7Name), - ("tpEtaGreater2p7_highPurity", _allToHP(_tpEtaGreater2p7Name)), ("btvLike", _allToBTV(_allName)), ("ak4PFJets", "AK4 PF jets"), ("allTPEffic_", _allTPEfficName), @@ -171,7 +170,6 @@ def _ptCut(s): "timing": "Timing", "hlt": "HLT", "pixel": "Pixel tracks", - "pf": "PF", } _sectionNameMapOrder = collections.OrderedDict([ @@ -184,8 +182,6 @@ def _ptCut(s): ("highPurityPt09", "High purity tracks (pT>0.9 GeV)"), ("tpPtLess09", _tpPtLess09Name), ("tpPtLess09_highPurity", _allToHP(_tpPtLess09Name)), - ("tpEtaGreater2p7", _tpEtaGreater2p7Name), - ("tpEtaGreater2p7_highPurity", _allToHP(_tpEtaGreater2p7Name)), ("btvLike", "BTV-like"), ("ak4PFJets", "AK4 PF jets"), ("allTPEffic", _allTPEfficName), @@ -200,6 +196,8 @@ def _ptCut(s): ("bhadron_highPurity", _allToHP(_bhadronName)), # Pixel tracks ("pixel", "Pixel tracks"), + ("pixelFromPV", _toPixel(_fromPVName)), + ("pixelFromPVAllTP", _toPixel(_fromPVAllTPName)), # These are for vertices ("genvertex", "Gen vertices"), ("pixelVertices", "Pixel vertices"), @@ -248,6 +246,8 @@ def _sectionNameLegend(): "bhadron_": _bhadronLegend, "bhadron_highPurity": _allToHP(_bhadronLegend), "bhadron_btvLike": _bhadronLegend.replace("All tracks", _btvLegend), + "pixelFromPV": _fromPVLegend, + "pixelFromPVAllTP": _fromPVAllTPLegend, } class Table: @@ -305,7 +305,6 @@ class MiniAOD: pass class Timing: pass class HLT: pass class Pixel: pass - class PF: pass class Page(object): def __init__(self, title, sampleName): @@ -425,12 +424,6 @@ def _formatPlotSets(self): ' ', ]) - if len(fileTable): - first_row = fileTable[0] - self._content.extend([ - ' Browse Folder' % (first_row[1][0:first_row[1].rfind('/')]) - ]) - def _appendColumnHeader(self, header): leg = "" if header in self._columnHeadersIndex: @@ 
-689,7 +682,6 @@ def __init__(self, sample, title, fastVsFull, pileupComparison):
         self._vertexPage = PageSet(*params)
         self._miniaodPage = PageSet(*params)
         self._timingPage = PageSet(*params)
-        self._pfPages = PageSet(*params)
         self._hltPages = PageSet(*params, dqmSubFolderTranslatedToSectionName=lambda algoQuality: algoQuality[0])
         self._pixelPages = PageSet(*params, dqmSubFolderTranslatedToSectionName=lambda algoQuality: algoQuality[0])
         self._otherPages = PageSet(*params)
@@ -700,7 +692,6 @@ def __init__(self, sample, title, fastVsFull, pileupComparison):
             PlotPurpose.Vertexing: self._vertexPage,
             PlotPurpose.MiniAOD: self._miniaodPage,
             PlotPurpose.Timing: self._timingPage,
-            PlotPurpose.PF: self._pfPages,
             PlotPurpose.HLT: self._hltPages,
             PlotPurpose.Pixel: self._pixelPages,
         }
@@ -724,7 +715,7 @@ def write(self, baseDir): "
    ", ] - for pages in [self._summaryPage, self._iterationPages, self._pixelPages, self._vertexPage, self._miniaodPage, self._timingPage, self._hltPages, self._pfPages, self._otherPages]: + for pages in [self._summaryPage, self._iterationPages, self._pixelPages, self._vertexPage, self._miniaodPage, self._timingPage, self._hltPages, self._otherPages]: labelFiles = pages.write(baseDir) for label, fname in labelFiles: ret.append('
  • %s
  • ' % (fname, label)) diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index 374c83cf50ac8..ac81473c843cb 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from builtins import range import os import copy import collections @@ -9,11 +7,11 @@ ROOT.gROOT.SetBatch(True) ROOT.PyConfig.IgnoreCommandLineOptions = True -from .plotting import Subtract, FakeDuplicate, CutEfficiency, Transform, AggregateBins, ROC, Plot, PlotEmpty, PlotGroup, PlotOnSideGroup, PlotFolder, Plotter -from .html import PlotPurpose -from . import plotting -from . import validation -from . import html +from plotting import Subtract, FakeDuplicate, CutEfficiency, Transform, AggregateBins, ROC, Plot, PlotEmpty, PlotGroup, PlotOnSideGroup, PlotFolder, Plotter +from html import PlotPurpose +import plotting +import validation +import html ######################################## # @@ -41,7 +39,6 @@ _maxPU = [20, 50, 65, 80, 100, 150, 200, 250] _minMaxTracks = [0, 200, 500, 1000, 1500, 2000] _minMaxMVA = [-1.025, -0.5, 0, 0.5, 1.025] -_maxDRJ = 0.1 def _minMaxResidual(ma): return ([-x for x in ma], ma) @@ -217,9 +214,8 @@ def _makeMVAPlots(num, hp=False): ) _effandfakeDeltaRPU = PlotGroup("effandfakeDeltaRPU", _makeEffFakeDupPlots("dr" , "#DeltaR", effopts=dict(xtitle="TP min #DeltaR"), fakeopts=dict(xtitle="track min #DeltaR"), common=dict(xlog=True)) + - _makeEffFakeDupPlots("drj" , "#DeltaR(track, jet)", effopts=dict(xtitle="#DeltaR(TP, jet)", ytitle="efficiency vs #DeltaR(TP, jet"), fakeopts=dict(xtitle="#DeltaR(track, jet)"), common=dict(xlog=True, xmax=_maxDRJ))+ _makeEffFakeDupPlots("pu" , "PU" , common=dict(xtitle="Pileup", xmin=_minPU, xmax=_maxPU)), - legendDy=_legendDy_4rows + legendDy=_legendDy_2rows ) @@ -263,9 +259,8 @@ def _makeMVAPlots(num, hp=False): ) _dupandfakeDeltaRPU = PlotGroup("dupandfakeDeltaRPU", _makeFakeDupPileupPlots("dr" , "#DeltaR", xquantity="min #DeltaR", common=dict(xlog=True)) + - _makeFakeDupPileupPlots("drj" , "#DeltaR(track, jet)", xtitle="#DeltaR(track, jet)", common=dict(xlog=True, xmax=_maxDRJ)) + _makeFakeDupPileupPlots("pu" , "PU" , xtitle="Pileup", common=dict(xmin=_minPU, xmax=_maxPU)), - ncols=3 + ncols=3, legendDy=_legendDy_2rows_3cols ) _seedingLayerSet_common = dict(removeEmptyBins=True, xbinlabelsize=8, xbinlabeloption="d", adjustMarginRight=0.1) _dupandfakeSeedingPlots = _makeFakeDupPileupPlots("seedingLayerSet", "seeding layers", xtitle="", common=_seedingLayerSet_common) @@ -372,10 +367,7 @@ def _makeMVAPlots(num, hp=False): Plot("chi2_prob", stat=True, normalizeToUnitArea=True, drawStyle="hist", xtitle="Prob(#chi^{2})"), Plot("chi2mean", title="", xtitle="#eta", ytitle="< #chi^{2} / ndf >", ymin=[0, 0.5], ymax=[2, 2.5, 3, 5], fallback={"name": "chi2_vs_eta", "profileX": True}), - Plot("ptres_vs_eta_Mean", scale=100, title="", xtitle="TP #eta (PCA to beamline)", ytitle="< #delta p_{T} / p_{T} > (%)", ymin=_minResidualPt, ymax=_maxResidualPt), - Plot("chi2mean_vs_pt", title="", xtitle="p_{T}", ytitle="< #chi^{2} / ndf >", ymin=[0, 0.5], ymax=[2, 2.5, 3, 5], xlog=True, fallback={"name": "chi2_vs_pt", "profileX": True}), - Plot("chi2mean_vs_drj", title="", xtitle="#DeltaR(track, jet)", ytitle="< #chi^{2} / ndf >", ymin=[0, 0.5], ymax=[2, 2.5, 3, 5], xlog=True, xmax=_maxDRJ, fallback={"name": "chi2_vs_drj", "profileX": True}), - Plot("ptres_vs_pt_Mean", title="", 
xtitle="p_{T}", ytitle="< #delta p_{T}/p_{T} > (%)", scale=100, ymin=_minResidualPt, ymax=_maxResidualPt,xlog=True) + Plot("ptres_vs_eta_Mean", scale=100, title="", xtitle="TP #eta (PCA to beamline)", ytitle="< #delta p_{T} / p_{T} > (%)", ymin=_minResidualPt, ymax=_maxResidualPt) ]) _common = {"stat": True, "fit": True, "normalizeToUnitArea": True, "drawStyle": "hist", "drawCommand": "", "xmin": -10, "xmax": 10, "ylog": True, "ymin": 5e-5, "ymax": [0.01, 0.05, 0.1, 0.2, 0.5, 0.8, 1.025], "ratioUncertainty": False} _pulls = PlotGroup("pulls", [ @@ -404,13 +396,6 @@ def _makeMVAPlots(num, hp=False): Plot("dzres_vs_pt_Sigma", ytitle="#sigma(#delta d_{z}) (cm)", **_common), Plot("ptres_vs_pt_Sigma", ytitle="#sigma(#delta p_{T}/p_{T})", **_common), ]) -_common = {"title": "", "ylog": True, "xtitle": "TP #Phi (PCA to beamline)", "ymin": _minMaxResol, "ymax": _minMaxResol} -_resolutionsPhi = PlotGroup("resolutionsPhi", [ - Plot("dxyres_vs_phi_Sigma", ytitle="#sigma(#delta d_{xy}) (cm)", **_common), - Plot("dzres_vs_phi_Sigma", ytitle="#sigma(#delta d_{z}) (cm)", **_common), - Plot("phires_vs_phi_Sigma", ytitle="#sigma(#delta #phi) (rad)", **_common), - Plot("ptres_vs_phi_Sigma", ytitle="#sigma(#delta p_{T}/p_{T})", **_common), -]) ## Extended set of plots _extDistPtEtaPhi = PlotGroup("distPtEtaPhi", @@ -435,16 +420,12 @@ def _makeMVAPlots(num, hp=False): _makeDistPlots("3Dlayer" , "3D layers" , common=dict(xmin=_min3DLayers, xmax=_max3DLayers)), ncols=4, legendDy=_legendDy_4rows, ) -_extDistPos = PlotGroup("distPos", +_extDistPosDeltaR = PlotGroup("distPosDeltaR", _makeDistPlots("vertpos", "ref. point r (cm)", common=dict(xlog=True)) + _makeDistPlots("zpos" , "ref. point z (cm)") + - _makeDistPlots("simpvz" , "Sim. PV z (cm)", common=dict(xmin=_minZ, xmax=_maxZ)), - ncols=3, -) -_extDistDeltaR = PlotGroup("distDeltaR", - _makeDistPlots("dr" , "min #DeltaR", common=dict(xlog=True)) + - _makeDistPlots("drj" , "#DeltaR(track, jet)", common=dict(xlog=True, xmax=_maxDRJ)), - ncols=2, legendDy=_legendDy_2rows, + _makeDistPlots("simpvz" , "Sim. PV z (cm)", common=dict(xmin=_minZ, xmax=_maxZ)) + + _makeDistPlots("dr" , "min #DeltaR", common=dict(xlog=True)), + ncols=4, legendDy=_legendDy_4rows, ) _extDistSeedingPlots = _makeDistPlots("seedingLayerSet", "seeding layers", common=dict(xtitle="", **_seedingLayerSet_common)) _extDistChi2Seeding = PlotGroup("distChi2Seeding", @@ -506,16 +487,12 @@ def _makeMVAPlots(num, hp=False): _makeDistSimPlots("3Dlayer" , "3D layers" , common=dict(xmin=_min3DLayers, xmax=_max3DLayers)), ncols=2, legendDy=_legendDy_4rows, ) -_extDistSimPos = PlotGroup("distsimPos", +_extDistSimPosDeltaR = PlotGroup("distsimPosDeltaR", _makeDistSimPlots("vertpos", "vert r (cm)", common=dict(xlog=True)) + _makeDistSimPlots("zpos" , "vert z (cm)") + - _makeDistSimPlots("simpvz" , "Sim. PV z (cm)", common=dict(xmin=_minZ, xmax=_maxZ)), - ncols=3, -) -_extDistSimDeltaR = PlotGroup("distsimDeltaR", - _makeDistSimPlots("dr" , "min #DeltaR", common=dict(xlog=True)) + - _makeDistSimPlots("drj" , "#DeltaR(TP, jet)", common=dict(xlog=True, xmax=_maxDRJ)), - ncols=2, legendDy=_legendDy_2rows, + _makeDistSimPlots("simpvz" , "Sim. 
PV z (cm)", common=dict(xmin=_minZ, xmax=_maxZ)) + + _makeDistSimPlots("dr" , "min #DeltaR", common=dict(xlog=True)), + ncols=2, legendDy=_legendDy_4rows, ) ######################################## @@ -595,8 +572,6 @@ def _trackingSubFoldersFallbackFromPV(subfolder): return subfolder.replace("trackingParticleRecoAsssociation", "trackingParticleRecoAsssociationSignal") def _trackingSubFoldersFallbackConversion(subfolder): return subfolder.replace("quickAssociatorByHits", "quickAssociatorByHitsConversion") -def _trackingSubFoldersFallbackPreSplitting(subfolder): - return subfolder.replace("quickAssociatorByHits", "quickAssociatorByHitsPreSplitting") # Additional "quality" flags than highPurity. In a separate list to # allow customization. @@ -634,8 +609,8 @@ def _mapCollectionToAlgoQuality(collName): collNameLow = collNameLow[:i_seeds] algo = None - prefixes = ["cutsreco", "cutsrecofrompv", "cutsrecofrompv2", "cutsrecofrompvalltp", "cutsrecoetagreater2p7"] - if collNameLow in ["general", "generalfrompv", "generaletagreater2p7"]+prefixes: + prefixes = ["cutsreco", "cutsrecofrompv", "cutsrecofrompv2", "cutsrecofrompvalltp"] + if collNameLow in ["general", "generalfrompv"]+prefixes: algo = "ootb" else: def testColl(coll): @@ -1088,7 +1063,7 @@ def draw(self, legendLabels, prefix=None, directory="", *args, **kwargs): legendLabels = legendLabels[:] if max(map(len, legendLabels)) > 20: haveShortLabels = True - labels_short = [str(chr(ord('A')+i)) for i in range(len(legendLabels))] + labels_short = [str(chr(ord('A')+i)) for i in xrange(len(legendLabels))] for i, ls in enumerate(labels_short): legendLabels[i] = "%s: %s" % (ls, legendLabels[i]) else: @@ -1142,7 +1117,7 @@ def draw(self, legendLabels, prefix=None, directory="", *args, **kwargs): if len(histos_linear) == 0: return [] - data = [ [h.GetBinContent(i) for i in range(1, h.GetNbinsX()+1)] for h in histos_linear] + data = [ [h.GetBinContent(i) for i in xrange(1, h.GetNbinsX()+1)] for h in histos_linear] table = html.Table(["dummy"]*len(histos_linear), xbinlabels, data, None, None, None) data = table.tableAsRowColumn() @@ -1229,7 +1204,6 @@ def _trackingFolders(lastDirName="Track"): _hitsAndPt, _pulls, _resolutionsEta, - _resolutionsPhi, _resolutionsPt, _tuning, ] @@ -1249,20 +1223,12 @@ def _trackingFolders(lastDirName="Track"): + _makeMVAPlots(3) \ + _makeMVAPlots(3, hp=True) # add more if needed -_buildingExtendedPlots = [ - _pulls, - _resolutionsEta, - _resolutionsPhi, - _resolutionsPt, - _tuning, -] _extendedPlots = [ _extDistPtEtaPhi, _extDistDxyDzBS, _extDistDxyDzPV, _extDistHitsLayers, - _extDistPos, - _extDistDeltaR, + _extDistPosDeltaR, _extDistChi2Seeding, _extDistSeedingTable, _extResidualEta, @@ -1273,8 +1239,7 @@ def _trackingFolders(lastDirName="Track"): _extDistSimDxyDzBS, _extDistSimDxyDzPV, _extDistSimHitsLayers, - _extDistSimPos, - _extDistSimDeltaR, + _extDistSimPosDeltaR, ] _summaryPlots = [ _summary, @@ -1314,7 +1279,7 @@ def _trackingFolders(lastDirName="Track"): ] plotter = Plotter() plotterExt = Plotter() -def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, onlyForElectron=False, onlyForConversion=False, onlyForBHadron=False, seeding=False, building=False, rawSummary=False, highPuritySummary=True): +def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, onlyForElectron=False, onlyForConversion=False, onlyForBHadron=False, seeding=False, rawSummary=False, highPuritySummary=True): folders = _trackingFolders(lastDirName) # to keep backward compatibility, this set of plots has 
empty name limiters = dict(onlyForPileup=onlyForPileup, onlyForElectron=onlyForElectron, onlyForConversion=onlyForConversion, onlyForBHadron=onlyForBHadron) @@ -1323,14 +1288,9 @@ def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, only ], **limiters) common = dict(fallbackDqmSubFolders=[ _trackingSubFoldersFallbackSLHC_Phase1PU140, - _trackingSubFoldersFallbackFromPV, _trackingSubFoldersFallbackConversion, - _trackingSubFoldersFallbackPreSplitting]) + _trackingSubFoldersFallbackFromPV, _trackingSubFoldersFallbackConversion]) plotter.append(name, folders, TrackingPlotFolder(*algoPlots, **commonForTPF), **common) - extendedPlots = [] - if building: - extendedPlots.extend(_buildingExtendedPlots) - extendedPlots.extend(_extendedPlots) - plotterExt.append(name, folders, TrackingPlotFolder(*extendedPlots, **commonForTPF), **common) + plotterExt.append(name, folders, TrackingPlotFolder(*_extendedPlots, **commonForTPF), **common) summaryName = "" if name != "": @@ -1365,22 +1325,28 @@ def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, only plotter.appendTable(summaryName, folders, TrackingSummaryTable(section="ak4PFJets", collection=TrackingSummaryTable.AK4PFJets)) _appendTrackingPlots("Track", "", _simBasedPlots+_recoBasedPlots) _appendTrackingPlots("TrackTPPtLess09", "tpPtLess09", _simBasedPlots) -_appendTrackingPlots("TrackTPEtaGreater2p7", "tpEtaGreater2p7", _simBasedPlots+_recoBasedPlots) _appendTrackingPlots("TrackAllTPEffic", "allTPEffic", _simBasedPlots, onlyForPileup=True) _appendTrackingPlots("TrackFromPV", "fromPV", _simBasedPlots+_recoBasedPlots, onlyForPileup=True) _appendTrackingPlots("TrackFromPVAllTP", "fromPVAllTP", _simBasedPlots+_recoBasedPlots, onlyForPileup=True) _appendTrackingPlots("TrackFromPVAllTP2", "fromPVAllTP2", _simBasedPlots+_recoBasedPlots, onlyForPileup=True) _appendTrackingPlots("TrackSeeding", "seeding", _seedingBuildingPlots, seeding=True) -_appendTrackingPlots("TrackBuilding", "building", _seedingBuildingPlots, building=True) +_appendTrackingPlots("TrackBuilding", "building", _seedingBuildingPlots) _appendTrackingPlots("TrackConversion", "conversion", _simBasedPlots+_recoBasedPlots, onlyForConversion=True, rawSummary=True, highPuritySummary=False) _appendTrackingPlots("TrackGsf", "gsf", _simBasedPlots+_recoBasedPlots, onlyForElectron=True, rawSummary=True, highPuritySummary=False) _appendTrackingPlots("TrackBHadron", "bhadron", _simBasedPlots+_recoBasedPlots, onlyForBHadron=True) # Pixel tracks -_common = dict(purpose=PlotPurpose.Pixel, page="pixel") -plotter.append("pixelTrack", _trackingFolders("PixelTrack"), TrackingPlotFolder(*(_simBasedPlots+_recoBasedPlots), **_common)) -plotterExt.append("pixelTrack", _trackingFolders("PixelTrack"), TrackingPlotFolder(*_extendedPlots, **_common)) -plotter.append("pixelTrack_summary", _trackingFolders("PixelTrack"), PlotFolder(_summaryRaw, _summaryRawN, loopSubFolders=False, purpose=PlotPurpose.TrackingSummary, page="summary", section="pixel")) -plotter.appendTable("pixelTrack_summary", _trackingFolders("PixelTrack"), TrackingSummaryTable(section="pixel", collection=TrackingSummaryTable.Pixel)) +def _appendPixelTrackingPlots(lastDirName, name): + _common = dict(section=name, purpose=PlotPurpose.Pixel, page="pixel") + _folders = _trackingFolders(lastDirName) + + plotter.append(name, _folders, TrackingPlotFolder(*(_simBasedPlots+_recoBasedPlots), **_common)) + plotterExt.append(name, _folders, TrackingPlotFolder(*_extendedPlots, **_common)) + + 
plotter.append(name+"_summary", _folders, PlotFolder(_summaryRaw, _summaryRawN, loopSubFolders=False, purpose=PlotPurpose.TrackingSummary, page="summary", section=name)) + plotter.appendTable(name+"_summary", _folders, TrackingSummaryTable(section=name, collection=TrackingSummaryTable.Pixel)) +_appendPixelTrackingPlots("PixelTrack", "pixel") +_appendPixelTrackingPlots("PixelTrackFromPV", "pixelFromPV") +_appendPixelTrackingPlots("PixelTrackFromPVAllTP", "pixelFromPVAllTP") # MiniAOD @@ -1479,9 +1445,6 @@ def modules(self): "initialStepClassifier3", "initialStep", "initialStepSelector"], - building=["initialStepTrackCandidatesMkFitInput", - "initialStepTrackCandidatesMkFit", - "initialStepTrackCandidates"], other=["firstStepPrimaryVerticesUnsorted", "initialStepTrackRefsForJets", "caloTowerForTrk", @@ -1655,7 +1618,7 @@ def _create(self, tdirectory): ret = timeTh1.Clone(self._name) xaxis = ret.GetXaxis() - for i in range(1, ret.GetNbinsX()+1): + for i in xrange(1, ret.GetNbinsX()+1): ret.SetBinContent(i, ret.GetBinContent(i)/nevents) ret.SetBinError(i, ret.GetBinError(i)/nevents) xaxis.SetBinLabel(i, xaxis.GetBinLabel(i).replace(" (unscheduled)", "")) @@ -1696,7 +1659,7 @@ def create(self, tdirectory): return None iterMap = copy.copy(_collLabelMapHp) - del iterMap["generalTracks"] + del iterMap["generalTracks"] del iterMap["jetCoreRegionalStep"] # this is expensive per track on purpose if self._selectedTracks: renameBin = lambda bl: _summaryBinRename(bl, highPurity=True, byOriginalAlgo=False, byAlgoMask=True, ptCut=False, seeds=False) @@ -1707,12 +1670,12 @@ def create(self, tdirectory): if h_reco_per_iter is None: return None values = {} - for i in range(1, h_reco_per_iter.GetNbinsX()+1): + for i in xrange(1, h_reco_per_iter.GetNbinsX()+1): values[h_reco_per_iter.GetXaxis().GetBinLabel(i)] = h_reco_per_iter.GetBinContent(i) result = [] - for i in range(1, timeTh1.GetNbinsX()+1): + for i in xrange(1, timeTh1.GetNbinsX()+1): iterName = timeTh1.GetXaxis().GetBinLabel(i) if iterName in values: ntrk = values[iterName] @@ -1744,10 +1707,10 @@ def _edit(s): # remove "Tracks" from the track producer name to get the iteration name # muonSeeded iterations do not have "Step" in the producer name, so add it here return s.replace("Tracks", "").replace("muonSeeded", "muonSeededStep") - return [_edit(xaxis.GetBinLabel(i)) for i in range(1, h.GetNbinsX()+1)] + return [_edit(xaxis.GetBinLabel(i)) for i in xrange(1, h.GetNbinsX()+1)] def __call__(self, tdirectory, labels): - ret = list(range(0, len(labels))) + ret = range(0, len(labels)) f = tdirectory.GetFile() if not f: return ret @@ -1933,3 +1896,5 @@ def headers(self): ], PlotFolder( _tplifetime, )) + + From f76d1a982f017bebde023cf752609994547ed487 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 12 Sep 2018 09:47:46 +0200 Subject: [PATCH 022/102] Tune and speed up doublet algo (cms-patatrack#158) Tune and speed up the pixel doublet alforithm, and take advantage of GPU read-only memory for a further speedup. Includes a python notebook to tune the cuts for doublets and triplets. 
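For reference, the geometry behind the new hard-pT cut on the doublets, as a standalone sketch (illustrative only, not part of the patch; the helper name is made up, but the constants and the inequality are the ones introduced here and derived in test/fastDPHI_t.cpp):

    // A doublet with hits at radii ri, ro (cm) and azimuthal separation dphi (rad),
    // together with the beamline, determines a circle whose squared radius is, to
    // leading order in dphi,
    //   r^2 ~ ( (ro - ri)^2 / dphi^2 + ri*ro ) / 4 .
    // Requiring r >= minRadius, with minRadius ~ 87 cm/GeV * hardPtCut in the 3.8 T
    // field, is equivalent to the rejection used in doubletsFromHisto():
    //   reject when  dphi^2 * (4*minRadius^2 - ri*ro) > (ro - ri)^2 .
    bool passHardPtCut(float ri, float ro, float dphi, float hardPtCut = 0.5f /* GeV */) {
      const float minRadius    = hardPtCut * 87.f;               // cm
      const float minRadius2T4 = 4.f * minRadius * minRadius;
      return dphi * dphi * (minRadius2T4 - ri * ro) <= (ro - ri) * (ro - ri);
    }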
--- .../PixelTriplets/plugins/gpuPixelDoublets.h | 40 ++-- .../PixelTriplets/test/BuildFile.xml | 31 +-- .../PixelTriplets/test/fastDPHI_t.cpp | 197 ++++++++++++++++++ 3 files changed, 240 insertions(+), 28 deletions(-) create mode 100644 RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index b17ed42cfb390..e46627bf2c322 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -20,11 +20,11 @@ namespace gpuPixelDoublets { template __device__ - void doubletsFromHisto(uint8_t const * layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, - int16_t const * iphi, Hist const * hist, uint32_t const * offsets, - siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, + void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, + int16_t const * __restrict__ iphi, Hist const * __restrict__ hist, uint32_t const * __restrict__ offsets, + siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, - int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) { + int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) { auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; @@ -41,7 +41,6 @@ namespace gpuPixelDoublets { auto idx = blockIdx.x*blockDim.x + threadIdx.x; for(auto j=idx;j=innerLayerCumulativeSize[pairLayerId++]); --pairLayerId; // move to lower_bound ?? @@ -68,18 +67,28 @@ namespace gpuPixelDoublets { auto mep = iphi[i]; auto mez = hh.zg_d[i]; auto mer = hh.rg_d[i]; - auto cutoff = [&](int j) { return + auto cutoff = [&](int j) { return abs(hh.zg_d[j]-mez) > maxz[pairLayerId] || abs(hh.zg_d[j]-mez) < minz[pairLayerId] || hh.rg_d[j]-mer > maxr[pairLayerId]; }; constexpr float z0cut = 12.f; + constexpr float hardPtCut = 0.5f; + constexpr float minRadius = hardPtCut * 87.f; + constexpr float minRadius2T4 = 4.f*minRadius*minRadius; + auto ptcut = [&](int j) { + auto r2t4 = minRadius2T4; + auto ri = mer; + auto ro = hh.rg_d[j]; + auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])),abs(int16_t(iphi[j]-mep)) ) ); + return dphi*dphi*(r2t4 -ri*ro) > (ro-ri)*(ro-ri); + }; auto z0cutoff = [&](int j) { auto zo = hh.zg_d[j]; - auto ro = hh.rg_d[j]; + auto ro = hh.rg_d[j]; auto dr = ro-mer; - return dr > maxr[pairLayerId] || + return dr > maxr[pairLayerId] || dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; }; @@ -92,7 +101,7 @@ namespace gpuPixelDoublets { int nmin = 0; auto khh = kh; incr(khh); - + int tooMany=0; for (auto kk=kl; kk!=khh; incr(kk)) { if (kk!=kl && kk!=kh) nmin+=hist[outer].size(kk); @@ -103,7 +112,7 @@ namespace gpuPixelDoublets { if (std::min(std::abs(int16_t(iphi[oi]-mep)), std::abs(int16_t(mep-iphi[oi]))) > iphicut) continue; - if (z0cutoff(oi)) continue; + if (z0cutoff(oi) || ptcut(oi)) continue; auto ind = atomicInc(nCells,MaxNumOfDoublets); // int layerPairId, int doubletId, int innerHitId,int outerHitId) cells[ind].init(hh,pairLayerId,ind,i,oi); @@ -123,12 +132,13 @@ namespace gpuPixelDoublets { } // loop in block... 
} + __global__ - void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, + void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, GPU::VecArray< unsigned int, 256> *isOuterHitOfCell) { - uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3 - // ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 + uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3 + // ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 ,0,7 ,1,7 ,2,7 ,7,8 ,8,9 ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 }; @@ -158,8 +168,8 @@ namespace gpuPixelDoublets { }; - auto const & hh = *hhp; - doubletsFromHisto(layerPairs, 13, cells, nCells, + auto const & __restrict__ hh = *hhp; + doubletsFromHisto(layerPairs, 13, cells, nCells, hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d, hh, isOuterHitOfCell, phicuts, minz, maxz, maxr); diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 6d6f1553b32f3..1de3629887ec9 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -1,18 +1,23 @@ - - - - - - - - - + + + + + + + + + + + + + + - - + - - + + + diff --git a/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp b/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp new file mode 100644 index 0000000000000..58c7f832627fb --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp @@ -0,0 +1,197 @@ +// this test documents the derivation of the fast deltaphi used in gpu doublet code.. +// +// +// +#include +#include +#include +#include + +/** +| 1) circle is parameterized as: | +| C*[(X-Xp)**2+(Y-Yp)**2] - 2*alpha*(X-Xp) - 2*beta*(Y-Yp) = 0 | +| Xp,Yp is a point on the track (Yp is at the center of the chamber); | +| C = 1/r0 is the curvature ( sign of C is charge of particle ); | +| alpha & beta are the direction cosines of the radial vector at Xp,Yp | +| i.e. alpha = C*(X0-Xp), | +| beta = C*(Y0-Yp), | +| where center of circle is at X0,Y0. | +| Alpha > 0 | +| Slope dy/dx of tangent at Xp,Yp is -alpha/beta. | +| 2) the z dimension of the helix is parameterized by gamma = dZ/dSperp | +| this is also the tangent of the pitch angle of the helix. | +| with this parameterization, (alpha,beta,gamma) rotate like a vector. | +| 3) For tracks going inward at (Xp,Yp), C, alpha, beta, and gamma change sign| +| +*/ + +template +class FastCircle { + +public: + + FastCircle(){} + FastCircle(T x1, T y1, + T x2, T y2, + T x3, T y3) { + compute(x1,y1,x2,y2,x3,y3); + } + + void compute(T x1, T y1, + T x2, T y2, + T x3, T y3); + + + T m_xp; + T m_yp; + T m_c; + T m_alpha; + T m_beta; + +}; + + +template +void FastCircle::compute(T x1, T y1, + T x2, T y2, + T x3, T y3) { + bool flip = std::abs(x3-x1) > std::abs(y3-y1); + + auto x1p = x1-x2; + auto y1p = y1-y2; + auto d12 = x1p*x1p + y1p*y1p; + auto x3p = x3-x2; + auto y3p = y3-y2; + auto d32 = x3p*x3p + y3p*y3p; + + if (flip) { + std::swap(x1p,y1p); + std::swap(x3p,y3p); + } + + auto num = x1p*y3p-y1p*x3p; // num also gives correct sign for CT + auto det = d12*y3p-d32*y1p; + if( std::abs(det)==0 ) { + // and why we flip???? + } + auto ct = num/det; + auto sn = det>0 ? T(1.) : T(-1.); + auto st2 = (d12*x3p-d32*x1p)/det; + auto seq = T(1.) 
+st2*st2; + auto al2 = sn/std::sqrt(seq); + auto be2 = -st2*al2; + ct *= T(2.)*al2; + + if (flip) { + std::swap(x1p,y1p); + std::swap(al2,be2); + al2 = -al2; + be2 = -be2; + ct = -ct; + } + + m_xp = x1; + m_yp = y1; + m_c= ct; + m_alpha = al2 - ct*x1p; + m_beta = be2 - ct*y1p; + +} + + + +// compute curvature given two points (and origin) +float fastDPHI(float ri, float ro, float dphi) { + + /* + x3=0 y1=0 x1=0; + y3=ro + */ + + // auto x2 = ri*dphi; + // auto y2 = ri*(1.f-0.5f*dphi*dphi); + + + /* + auto x1p = x1-x2; + auto y1p = y1-y2; + auto d12 = x1p*x1p + y1p*y1p; + auto x3p = x3-x2; + auto y3p = y3-y2; + auto d32 = x3p*x3p + y3p*y3p; + */ + + /* + auto x1p = -x2; + auto y1p = -y2; + auto d12 = ri*ri; + auto x3p = -x2; + auto y3p = ro-y2; + auto d32 = ri*ri + ro*ro - 2.f*ro*y2; + */ + + + // auto rat = (ro -2.f*y2); + // auto det = ro - ri - (ro - 2.f*ri -0.5f*ro)*dphi*dphi; + + //auto det2 = (ro-ri)*(ro-ri) -2.*(ro-ri)*(ro - 2.f*ri -0.5f*ro)*dphi*dphi; + // auto seq = det2 + dphi*dphi*(ro-2.f*ri)*(ro-2.f*ri); // *rat2; + // auto seq = (ro-ri)*(ro-ri) + dphi*dphi*ri*ro; + + // and little by little simplifing and removing higher over terms + // we get + auto r2 = (ro-ri)*(ro-ri)/(dphi*dphi) + ri*ro; + + + // d2 = (ro-ri)*(ro-ri)/(4.f*r2 -ri*ro); + // return -2.f*dphi/std::sqrt(seq); + + return -1.f/std::sqrt(r2/4.f); + +} + + + +#include + +template +bool equal(T a, T b) { + // return float(a-b)==0; + return std::abs(float(a-b)) < std::abs(0.01f*a); +} + + + +int n=0; +void go(float ri, float ro, float dphi, bool print=false) { + ++n; + float x3 = 0.f, y3 = ro; + float x2 = ri*sin(dphi); + float y2 = ri*cos(dphi); + + + FastCircle c(0,0, + x2,y2, + x3,y3); + + auto cc = fastDPHI(ri,ro,dphi); + if (print) std::cout << c.m_c << ' ' << cc << std::endl; + assert(equal(c.m_c,cc)); + + +} + +int main() { + + + go(4.,7.,0.1, true); + + for (float r1=2; r1<15; r1+=1) + for (float dr=0.5; dr<10; dr+=0.5) + for (float dphi=0.02; dphi<0.2; dphi+=0.2) + go(r1,r1+dr,dphi); + + std::cout << "done " << n << std::endl; + return 0; +}; + From 9b538fcc12a13cd07494c96e58747d0f73597616 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 13 Sep 2018 14:49:03 +0200 Subject: [PATCH 023/102] Optimise gpuPixelDoublets::doubletsFromHisto() kernel (cms-patatrack#167) Pre-compute few constants that could not be declared constexpr. Reduce temporary buffer size. Reduce the block size of the calls to gpuPixelDoublets::getDoubletsFromHisto() from 256 to 64, to make better usage of the GPU processors. 
--- .../PixelTriplets/plugins/gpuPixelDoublets.h | 274 +++++++++--------- 1 file changed, 144 insertions(+), 130 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index e46627bf2c322..31844f39f9727 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -20,157 +20,171 @@ namespace gpuPixelDoublets { template __device__ - void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, - int16_t const * __restrict__ iphi, Hist const * __restrict__ hist, uint32_t const * __restrict__ offsets, + void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, + uint32_t nPairs, + GPUCACell * cells, + uint32_t * nCells, + int16_t const * __restrict__ iphi, + Hist const * __restrict__ hist, + uint32_t const * __restrict__ offsets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, - GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, - int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) { - + GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, + int16_t const * __restrict__ phicuts, + float const * __restrict__ minz, + float const * __restrict__ maxz, + float const * __restrict__ maxr) + { auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; - // to be optimized later - uint32_t innerLayerCumulativeSize[64]; - assert(nPairs<=64); + // nPairsMax to be optimized later (originally was 64). + // If it should be much bigger, consider using a block-wide parallel prefix scan, + // e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html + const int nPairsMax = 16; + assert(nPairs <= nPairsMax); + uint32_t innerLayerCumulativeSize[nPairsMax]; innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); - for (uint32_t i=1; i=innerLayerCumulativeSize[pairLayerId++]); --pairLayerId; // move to lower_bound ?? - - assert(pairLayerId=innerLayerCumulativeSize[pairLayerId-1]); - - uint8_t inner = layerPairs[2*pairLayerId]; - uint8_t outer = layerPairs[2*pairLayerId+1]; - assert(outer>inner); - - auto i = (0==pairLayerId) ? 
j : j-innerLayerCumulativeSize[pairLayerId-1]; - i += offsets[inner]; - - // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); - - assert(i>=offsets[inner]); - assert(i maxz[pairLayerId] || - abs(hh.zg_d[j]-mez) < minz[pairLayerId] || - hh.rg_d[j]-mer > maxr[pairLayerId]; - }; - - constexpr float z0cut = 12.f; - constexpr float hardPtCut = 0.5f; - constexpr float minRadius = hardPtCut * 87.f; - constexpr float minRadius2T4 = 4.f*minRadius*minRadius; - auto ptcut = [&](int j) { - auto r2t4 = minRadius2T4; - auto ri = mer; - auto ro = hh.rg_d[j]; - auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])),abs(int16_t(iphi[j]-mep)) ) ); - return dphi*dphi*(r2t4 -ri*ro) > (ro-ri)*(ro-ri); - }; - auto z0cutoff = [&](int j) { - auto zo = hh.zg_d[j]; - auto ro = hh.rg_d[j]; - auto dr = ro-mer; - return dr > maxr[pairLayerId] || - dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; - }; - - auto iphicut = phicuts[pairLayerId]; - - auto kl = hist[outer].bin(int16_t(mep-iphicut)); - auto kh = hist[outer].bin(int16_t(mep+iphicut)); - auto incr = [](auto & k) { return k = (k+1)%Hist::nbins();}; - int tot = 0; - int nmin = 0; - auto khh = kh; - incr(khh); - - int tooMany=0; - for (auto kk=kl; kk!=khh; incr(kk)) { - if (kk!=kl && kk!=kh) nmin+=hist[outer].size(kk); - for (auto p=hist[outer].begin(kk); p=offsets[outer]); - assert(oi iphicut) - continue; - if (z0cutoff(oi) || ptcut(oi)) continue; - auto ind = atomicInc(nCells,MaxNumOfDoublets); - // int layerPairId, int doubletId, int innerHitId,int outerHitId) - cells[ind].init(hh,pairLayerId,ind,i,oi); - isOuterHitOfCell[oi].push_back(ind); - if (isOuterHitOfCell[oi].full()) ++tooMany; - ++tot; + auto idx = blockIdx.x * blockDim.x + threadIdx.x; + for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) { + + uint32_t pairLayerId=0; + while (j >= innerLayerCumulativeSize[pairLayerId++]); + --pairLayerId; // move to lower_bound ?? + + assert(pairLayerId < nPairs); + assert(j < innerLayerCumulativeSize[pairLayerId]); + assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]); + + uint8_t inner = layerPairs[2*pairLayerId]; + uint8_t outer = layerPairs[2*pairLayerId+1]; + assert(outer > inner); + + auto i = (0 == pairLayerId) ? 
j : j-innerLayerCumulativeSize[pairLayerId-1]; + i += offsets[inner]; + + // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); + + assert(i >= offsets[inner]); + assert(i < offsets[inner+1]); + + // found hit corresponding to our cuda thread, now do the job + auto mep = iphi[i]; + auto mez = hh.zg_d[i]; + auto mer = hh.rg_d[i]; + + constexpr float z0cut = 12.f; // cm + constexpr float hardPtCut = 0.5f; // GeV + constexpr float minRadius = hardPtCut * 87.78f; // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) + constexpr float minRadius2T4 = 4.f*minRadius*minRadius; + auto ptcut = [&](int j) { + auto r2t4 = minRadius2T4; + auto ri = mer; + auto ro = hh.rg_d[j]; + auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) ); + return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri); + }; + auto z0cutoff = [&](int j) { + auto zo = hh.zg_d[j]; + auto ro = hh.rg_d[j]; + auto dr = ro-mer; + return dr > maxr[pairLayerId] || + dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; + }; + + auto iphicut = phicuts[pairLayerId]; + + auto kl = hist[outer].bin(int16_t(mep-iphicut)); + auto kh = hist[outer].bin(int16_t(mep+iphicut)); + auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; + int tot = 0; + int nmin = 0; + auto khh = kh; + incr(khh); + + int tooMany=0; + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nmin += hist[outer].size(kk); + for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) { + auto oi=*p; + assert(oi>=offsets[outer]); + assert(oi iphicut) + continue; + if (z0cutoff(oi) || ptcut(oi)) continue; + auto ind = atomicInc(nCells, MaxNumOfDoublets); + // int layerPairId, int doubletId, int innerHitId, int outerHitId) + cells[ind].init(hh, pairLayerId, ind, i, oi); + isOuterHitOfCell[oi].push_back(ind); + if (isOuterHitOfCell[oi].full()) ++tooMany; + ++tot; + } } - } - if (tooMany>0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner,outer, kl,kh,nmin,tot); + if (tooMany > 0) + printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot); - if (hist[outer].nspills>0) - printf("spill bin to be checked in %d %d\n",outer,hist[outer].nspills); + if (hist[outer].nspills > 0) + printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills); - // if (0==hist[outer].nspills) assert(tot>=nmin); - // look in spill bin as well.... + // if (0==hist[outer].nspills) assert(tot>=nmin); + // look in spill bin as well.... - - } // loop in block... + } // loop in block... 
} - __global__ - void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell) { - - uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3 - // ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 - ,0,7 ,1,7 ,2,7 ,7,8 ,8,9 - ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 - }; + constexpr auto getDoubletsFromHistoMaxBlockSize = 64; - const int16_t phi0p05 = phi2short(0.05); - const int16_t phi0p06 = phi2short(0.06); - const int16_t phi0p07 = phi2short(0.07); + __global__ + __launch_bounds__(getDoubletsFromHistoMaxBlockSize) + void getDoubletsFromHisto(GPUCACell * cells, + uint32_t * nCells, + siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, + GPU::VecArray * isOuterHitOfCell) + { + constexpr int nPairs = 13; + constexpr const uint8_t layerPairs[2*nPairs] = { + 0, 1, 1, 2, 2, 3, + // 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, + 0, 7, 1, 7, 2, 7, 7, 8, 8, 9, + 0, 4, 1, 4, 2, 4, 4, 5, 5, 6 + }; - int16_t const phicuts[13] { phi0p05, phi0p05, phi0p06 - ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 - ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 - }; + constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); + constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); + constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); - float const minz[13] = { 0., 0., 0. - ,0., 0., 0., 0., 0. - ,0., 0., 0., 0., 0. - }; + constexpr const int16_t phicuts[nPairs] { + phi0p05, phi0p05, phi0p06, + phi0p07, phi0p06, phi0p06, phi0p05, phi0p05, + phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 + }; - float const maxz[13] = { 20.,15.,12. - ,30.,20.,20., 50., 50. - ,30.,20.,20., 50., 50. - }; + float const minz[nPairs] = { + 0., 0., 0., + 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0. + }; - float const maxr[13] = { 20., 20., 20. - ,9., 7., 6., 5., 5. - ,9., 7., 6., 5., 5. - }; + float const maxz[nPairs] = { + 20., 15., 12., + 30., 20., 20., 50., 50., + 30., 20., 20., 50., 50. + }; + float const maxr[nPairs] = { + 20., 20., 20., + 9., 7., 6., 5., 5., + 9., 7., 6., 5., 5. + }; auto const & __restrict__ hh = *hhp; - doubletsFromHisto(layerPairs, 13, cells, nCells, - hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d, + doubletsFromHisto(layerPairs, nPairs, cells, nCells, + hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d, hh, isOuterHitOfCell, phicuts, minz, maxz, maxr); } From 0d80e179aee5ab11ef379ec3b40a10e1e0c50664 Mon Sep 17 00:00:00 2001 From: Marco Rovere Date: Tue, 25 Sep 2018 14:48:01 +0200 Subject: [PATCH 024/102] Add Rieman fit to the CA (cms-patatrack#169) Also, add back the stand-alone GPU fit test. 
--- .../PixelTrackFitting/test/BuildFile.xml | 5 + .../PixelTrackFitting/test/testEigenGPU.cu | 265 ++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index 21e227ea3e7e7..d6beb57b862b8 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -11,6 +11,11 @@ + + + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu new file mode 100644 index 0000000000000..7b1125eebc312 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -0,0 +1,265 @@ +#include + +#include +#include + +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "test_common.h" + +using namespace Eigen; + +__global__ +void kernelFullFit(Rfit::Matrix3xNd * hits, + Rfit::Matrix3Nd * hits_cov, + double B, + bool errors, + Rfit::circle_fit * circle_fit_resultsGPU, + Rfit::line_fit * line_fit_resultsGPU) { + + printf("hits size: %d,%d\n", hits->rows(), hits->cols()); + Rfit::printIt(hits, "KernelFulFit - input hits: "); + Vector4d fast_fit = Rfit::Fast_fit(*hits); + + u_int n = hits->cols(); + Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); + + Rfit::Matrix2xNd hits2D_local = (hits->block(0,0,2,n)).eval(); + Rfit::Matrix2Nd hits_cov2D_local = (hits_cov->block(0, 0, 2 * n, 2 * n)).eval(); + Rfit::printIt(&hits2D_local, "kernelFullFit - hits2D_local: "); + Rfit::printIt(&hits_cov2D_local, "kernelFullFit - hits_cov2D_local: "); + /* + printf("kernelFullFit - hits address: %p\n", hits); + printf("kernelFullFit - hits_cov address: %p\n", hits_cov); + printf("kernelFullFit - hits_cov2D address: %p\n", &hits2D_local); + printf("kernelFullFit - hits_cov2D_local address: %p\n", &hits_cov2D_local); + */ + /* At some point I gave up and locally construct block on the stack, so that + the next invocation to Rfit::Circle_fit works properly. Failing to do so + implied basically an empty collection of hits and covariances. That could + have been partially fixed if values of the passed in matrices would have + been printed on screen since that, maybe, triggered internally the real + creations of the blocks. To be understood and compared against the myriad + of compilation warnings we have. 
+ */ + (*circle_fit_resultsGPU) = + Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), + fast_fit, rad, B, errors); + /* + (*circle_fit_resultsGPU) = + Rfit::Circle_fit(hits2D_local, hits_cov2D_local, + fast_fit, rad, B, errors, scattering); + */ + (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, errors); + + return; +} + +__global__ +void kernelFastFit(Rfit::Matrix3xNd * hits, Vector4d * results) { + (*results) = Rfit::Fast_fit(*hits); +} + +__global__ +void kernelCircleFit(Rfit::Matrix3xNd * hits, + Rfit::Matrix3Nd * hits_cov, Vector4d * fast_fit_input, double B, + Rfit::circle_fit * circle_fit_resultsGPU) { + u_int n = hits->cols(); + Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); + +#if TEST_DEBUG + printf("fast_fit_input(0): %f\n", (*fast_fit_input)(0)); + printf("fast_fit_input(1): %f\n", (*fast_fit_input)(1)); + printf("fast_fit_input(2): %f\n", (*fast_fit_input)(2)); + printf("fast_fit_input(3): %f\n", (*fast_fit_input)(3)); + printf("rad(0,0): %f\n", rad(0,0)); + printf("rad(1,1): %f\n", rad(1,1)); + printf("rad(2,2): %f\n", rad(2,2)); + printf("hits_cov(0,0): %f\n", (*hits_cov)(0,0)); + printf("hits_cov(1,1): %f\n", (*hits_cov)(1,1)); + printf("hits_cov(2,2): %f\n", (*hits_cov)(2,2)); + printf("hits_cov(11,11): %f\n", (*hits_cov)(11,11)); + printf("B: %f\n", B); +#endif + (*circle_fit_resultsGPU) = + Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), + *fast_fit_input, rad, B, false); +} + +__global__ +void kernelLineFit(Rfit::Matrix3xNd * hits, + Rfit::Matrix3Nd * hits_cov, + Rfit::circle_fit * circle_fit, + Vector4d * fast_fit, + Rfit::line_fit * line_fit) +{ + (*line_fit) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit, *fast_fit, true); +} + +void fillHitsAndHitsCov(Rfit::Matrix3xNd & hits, Rfit::Matrix3Nd & hits_cov) { + hits << 1.98645, 4.72598, 7.65632, 11.3151, + 2.18002, 4.88864, 7.75845, 11.3134, + 2.46338, 6.99838, 11.808, 17.793; + hits_cov(0,0) = 7.14652e-06; + hits_cov(1,1) = 2.15789e-06; + hits_cov(2,2) = 1.63328e-06; + hits_cov(3,3) = 6.27919e-06; + hits_cov(4,4) = 6.10348e-06; + hits_cov(5,5) = 2.08211e-06; + hits_cov(6,6) = 1.61672e-06; + hits_cov(7,7) = 6.28081e-06; + hits_cov(8,8) = 5.184e-05; + hits_cov(9,9) = 1.444e-05; + hits_cov(10,10) = 6.25e-06; + hits_cov(11,11) = 3.136e-05; + hits_cov(0,4) = hits_cov(4,0) = -5.60077e-06; + hits_cov(1,5) = hits_cov(5,1) = -1.11936e-06; + hits_cov(2,6) = hits_cov(6,2) = -6.24945e-07; + hits_cov(3,7) = hits_cov(7,3) = -5.28e-06; +} + +void testFit() { + constexpr double B = 0.0113921; + Rfit::Matrix3xNd hits(3,4); + Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); + Rfit::Matrix3xNd * hitsGPU = new Rfit::Matrix3xNd(3,4); + Rfit::Matrix3Nd * hits_covGPU = nullptr; + Vector4d * fast_fit_resultsGPU = new Vector4d(); + Vector4d * fast_fit_resultsGPUret = new Vector4d(); + Rfit::circle_fit * circle_fit_resultsGPU = new Rfit::circle_fit(); + Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); + + fillHitsAndHitsCov(hits, hits_cov); + + // FAST_FIT_CPU + Vector4d fast_fit_results = Rfit::Fast_fit(hits); +#if TEST_DEBUG + std::cout << "Generated hits:\n" << hits << std::endl; +#endif + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; + + // FAST_FIT GPU + cudaMalloc((void**)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4))); + cudaMalloc((void**)&fast_fit_resultsGPU, sizeof(Vector4d)); + cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), 
cudaMemcpyHostToDevice); + + kernelFastFit<<<1, 1>>>(hitsGPU, fast_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, sizeof(Vector4d), cudaMemcpyDeviceToHost); + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << *fast_fit_resultsGPUret << std::endl; + assert(isEqualFuzzy(fast_fit_results, (*fast_fit_resultsGPUret))); + + // CIRCLE_FIT CPU + u_int n = hits.cols(); + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + + Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), + hits_cov.block(0, 0, 2 * n, 2 * n), + fast_fit_results, rad, B, false); + std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; + + // CIRCLE_FIT GPU + cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12))); + cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit)); + cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice); + + kernelCircleFit<<<1,1>>>(hitsGPU, hits_covGPU, + fast_fit_resultsGPU, B, circle_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, + sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); + std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); + + // LINE_FIT CPU + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, fast_fit_results, true); + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; + + // LINE_FIT GPU + Rfit::line_fit * line_fit_resultsGPU = nullptr; + Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); + + cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit)); + + kernelLineFit<<<1,1>>>(hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); + std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); +} + +void testFitOneGo(bool errors, double epsilon=1e-6) { + constexpr double B = 0.0113921; + Rfit::Matrix3xNd hits(3,4); + Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); + + fillHitsAndHitsCov(hits, hits_cov); + + // FAST_FIT_CPU + Vector4d fast_fit_results = Rfit::Fast_fit(hits); + // CIRCLE_FIT CPU + u_int n = hits.cols(); + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + + Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), + hits_cov.block(0, 0, 2 * n, 2 * n), + fast_fit_results, rad, B, errors); + // LINE_FIT CPU + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, + fast_fit_results, errors); + + // FIT GPU + std::cout << "GPU FIT" << std::endl; + Rfit::Matrix3xNd * hitsGPU = nullptr; // new Rfit::Matrix3xNd(3,4); + Rfit::Matrix3Nd * hits_covGPU = nullptr; + Rfit::line_fit * line_fit_resultsGPU = nullptr; + Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); + Rfit::circle_fit * circle_fit_resultsGPU = nullptr; // new Rfit::circle_fit(); + Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); + + cudaCheck(cudaMalloc((void **)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4)))); + cudaCheck(cudaMalloc((void **)&hits_covGPU, 
sizeof(Rfit::Matrix3Nd(12,12)))); + cudaCheck(cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit))); + cudaCheck(cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit))); + cudaCheck(cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice)); + + kernelFullFit<<<1, 1>>>(hitsGPU, hits_covGPU, B, errors, + circle_fit_resultsGPU, line_fit_resultsGPU); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost)); + + std::cout << "Fitted values (CircleFit) CPU:\n" << circle_fit_results.par << std::endl; + std::cout << "Fitted values (LineFit): CPU\n" << line_fit_results.par << std::endl; + std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; + std::cout << "Fitted values (LineFit): GPU\n" << line_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par, epsilon)); + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, epsilon)); + + cudaCheck(cudaFree(hitsGPU)); + cudaCheck(cudaFree(hits_covGPU)); + cudaCheck(cudaFree(line_fit_resultsGPU)); + cudaCheck(cudaFree(circle_fit_resultsGPU)); + delete line_fit_resultsGPUret; + delete circle_fit_resultsGPUret; + + cudaDeviceReset(); +} + +int main (int argc, char * argv[]) { +// testFit(); + std::cout << "TEST FIT, NO ERRORS" << std::endl; + testFitOneGo(false); + + std::cout << "TEST FIT, ERRORS AND SCATTER" << std::endl; + testFitOneGo(true, 1e-5); + + return 0; +} + From 6d9630c6e58f56b0e480dd3567322dc6a0230d7e Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 26 Sep 2018 17:53:25 +0200 Subject: [PATCH 025/102] Introduce Cluster Charge Cut, optimize Histogram (bucket sorting) (cms-patatrack#171) --- .../PixelTriplets/plugins/GPUCACell.h | 124 +++++++++--------- .../PixelTriplets/plugins/gpuPixelDoublets.h | 38 +++--- 2 files changed, 80 insertions(+), 82 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 5995d286fc38d..772b802282d31 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -6,49 +6,49 @@ #include -#include "GPUHitsAndDoublets.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" struct Quadruplet { - int hitId[4]; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + hindex_type hitId[4]; }; + class GPUCACell { public: + + using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + GPUCACell() = default; +#ifdef __CUDACC__ - __host__ __device__ - void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, - int layerPairId, int doubletId, int innerHitId,int outerHitId) + __device__ __forceinline__ + void init(Hits const & hh, + int layerPairId, int doubletId, + hindex_type innerHitId, hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; theDoubletId = doubletId; theLayerPairId = 
layerPairId; - theInnerX = hh.xg_d[innerHitId]; - theOuterX = hh.xg_d[outerHitId]; - - theInnerY = hh.yg_d[innerHitId]; - theOuterY = hh.yg_d[outerHitId]; - - theInnerZ = hh.zg_d[innerHitId]; - theOuterZ = hh.zg_d[outerHitId]; - theInnerR = hh.rg_d[innerHitId]; - theOuterR = hh.rg_d[outerHitId]; + theInnerZ = __ldg(hh.zg_d+innerHitId); + theInnerR = __ldg(hh.rg_d+innerHitId); theOuterNeighbors.reset(); } - constexpr float get_inner_x() const { return theInnerX; } - constexpr float get_outer_x() const { return theOuterX; } - constexpr float get_inner_y() const { return theInnerY; } - constexpr float get_outer_y() const { return theOuterY; } - constexpr float get_inner_z() const { return theInnerZ; } - constexpr float get_outer_z() const { return theOuterZ; } - constexpr float get_inner_r() const { return theInnerR; } - constexpr float get_outer_r() const { return theOuterR; } + __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); } + constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } @@ -56,37 +56,42 @@ class GPUCACell { return theOuterHitId; } - constexpr void print_cell() const { + + __device__ + void print_cell() const { printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " "%d, innerradius %f, outerRadius %f \n", - theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId, - theInnerR, theOuterR); + theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId + ); } - __host__ __device__ - bool check_alignment_and_tag( - const GPUCACell *cells, unsigned int innerCellId, const float ptmin, + + __device__ + bool check_alignment(Hits const & hh, + GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, - const float phiCut, const float hardPtCut) + const float phiCut, const float hardPtCut) const { - auto ro = get_outer_r(); - auto zo = get_outer_z(); - const auto &otherCell = cells[innerCellId]; + auto ri = get_inner_r(hh); + auto zi = get_inner_z(hh); + + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); - auto r1 = otherCell.get_inner_r(); - auto z1 = otherCell.get_inner_z(); - bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut); + auto r1 = otherCell.get_inner_r(hh); + auto z1 = otherCell.get_inner_z(hh); + bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, thetaCut); return (aligned && - haveSimilarCurvature(cells, innerCellId, ptmin, region_origin_x, + haveSimilarCurvature(hh, otherCell, ptmin, region_origin_x, region_origin_y, region_origin_radius, phiCut, hardPtCut)); } - - constexpr bool 
areAlignedRZ(float r1, float z1, float ro, float zo, + __device__ __forceinline__ + static bool areAlignedRZ(float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, - const float thetaCut) const { + const float thetaCut) { float radius_diff = std::abs(r1 - ro); float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo); @@ -96,27 +101,26 @@ class GPUCACell { // radius_diff later float tan_12_13_half_mul_distance_13_squared = - fabs(z1 * (get_inner_r() - ro) + get_inner_z() * (ro - r1) + zo * (r1 - get_inner_r())); + fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - constexpr bool - haveSimilarCurvature(const GPUCACell *cells, unsigned int innerCellId, + __device__ + bool + haveSimilarCurvature(Hits const & hh, GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float phiCut, const float hardPtCut) const { - const auto &otherCell = cells[innerCellId]; - - auto x1 = otherCell.get_inner_x(); - auto y1 = otherCell.get_inner_y(); + auto x1 = otherCell.get_inner_x(hh); + auto y1 = otherCell.get_inner_y(hh); - auto x2 = get_inner_x(); - auto y2 = get_inner_y(); + auto x2 = get_inner_x(hh); + auto y2 = get_inner_y(hh); - auto x3 = get_outer_x(); - auto y3 = get_outer_y(); + auto x3 = get_outer_x(hh); + auto y3 = get_outer_y(hh); float distance_13_squared = (x1 - x3) * (x1 - x3) + (y1 - y3) * (y1 - y3); float tan_12_13_half_mul_distance_13_squared = @@ -139,7 +143,7 @@ class GPUCACell { return distance_13_beamspot_squared < (region_origin_radius + phiCut) * (region_origin_radius + phiCut); - } + } // 87 cm/GeV = 1/(3.8T * 0.3) @@ -186,13 +190,13 @@ class GPUCACell { // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. 
-#ifdef __CUDACC__ +// #ifdef __CUDACC__ __device__ inline void find_ntuplets( - const GPUCACell *cells, + GPUCACell const * __restrict__ cells, GPU::SimpleVector *foundNtuplets, - GPU::VecArray &tmpNtuplet, + GPU::VecArray &tmpNtuplet, const unsigned int minHitsPerNtuplet) const { // the building process for a track ends if: @@ -231,16 +235,10 @@ class GPUCACell { int theLayerPairId; private: - unsigned int theInnerHitId; - unsigned int theOuterHitId; - float theInnerX; - float theOuterX; - float theInnerY; - float theOuterY; float theInnerZ; - float theOuterZ; float theInnerR; - float theOuterR; + hindex_type theInnerHitId; + hindex_type theOuterHitId; }; #endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 31844f39f9727..61d048637585c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -20,12 +20,13 @@ namespace gpuPixelDoublets { template __device__ + __forceinline__ void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, int16_t const * __restrict__ iphi, - Hist const * __restrict__ hist, + Hist const & __restrict__ hist, uint32_t const * __restrict__ offsets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, @@ -63,6 +64,8 @@ namespace gpuPixelDoublets { uint8_t outer = layerPairs[2*pairLayerId+1]; assert(outer > inner); + auto hoff = Hist::histOff(outer); + auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; i += offsets[inner]; @@ -73,8 +76,8 @@ namespace gpuPixelDoublets { // found hit corresponding to our cuda thread, now do the job auto mep = iphi[i]; - auto mez = hh.zg_d[i]; - auto mer = hh.rg_d[i]; + auto mez = __ldg(hh.zg_d+i); + auto mer = __ldg(hh.rg_d+i); constexpr float z0cut = 12.f; // cm constexpr float hardPtCut = 0.5f; // GeV @@ -83,13 +86,13 @@ namespace gpuPixelDoublets { auto ptcut = [&](int j) { auto r2t4 = minRadius2T4; auto ri = mer; - auto ro = hh.rg_d[j]; + auto ro = __ldg(hh.rg_d+j); auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) ); return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zg_d[j]; - auto ro = hh.rg_d[j]; + auto zo = __ldg(hh.zg_d+j); + auto ro = __ldg(hh.rg_d+j); auto dr = ro-mer; return dr > maxr[pairLayerId] || dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; @@ -97,8 +100,8 @@ namespace gpuPixelDoublets { auto iphicut = phicuts[pairLayerId]; - auto kl = hist[outer].bin(int16_t(mep-iphicut)); - auto kh = hist[outer].bin(int16_t(mep+iphicut)); + auto kl = Hist::bin(int16_t(mep-iphicut)); + auto kh = Hist::bin(int16_t(mep+iphicut)); auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; int tot = 0; int nmin = 0; @@ -108,9 +111,11 @@ namespace gpuPixelDoublets { int tooMany=0; for (auto kk = kl; kk != khh; incr(kk)) { if (kk != kl && kk != kh) - nmin += hist[outer].size(kk); - for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) { - auto oi=*p; + nmin += hist.size(kk+hoff); + auto const * __restrict__ p = hist.begin(kk+hoff); + auto const * __restrict__ e = hist.end(kk+hoff); + for (;p < e; ++p) { + auto oi=__ldg(p); assert(oi>=offsets[outer]); assert(oi 0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, 
nmin, tot); - if (hist[outer].nspills > 0) - printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills); - - // if (0==hist[outer].nspills) assert(tot>=nmin); - // look in spill bin as well.... - } // loop in block... } constexpr auto getDoubletsFromHistoMaxBlockSize = 64; + constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; __global__ - __launch_bounds__(getDoubletsFromHistoMaxBlockSize) + __launch_bounds__(getDoubletsFromHistoMaxBlockSize,getDoubletsFromHistoMinBlocksPerMP) void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, @@ -184,7 +184,7 @@ namespace gpuPixelDoublets { auto const & __restrict__ hh = *hhp; doubletsFromHisto(layerPairs, nPairs, cells, nCells, - hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d, + hh.iphi_d, *hh.hist_d, hh.hitsLayerStart_d, hh, isOuterHitOfCell, phicuts, minz, maxz, maxr); } From 79fcd953688144eccd6b276ff423db24fce59200 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 27 Sep 2018 02:11:05 +0200 Subject: [PATCH 026/102] Clean up Riemann fit in CA (cms-patatrack#178) Reduce the number of blocks used to launch the Riemann fit kernels within the CA. Rename the kernels to avoid the ambiguiity with the standalone Riemann fit. Work around spurious warnings in the Eigen test. --- .../test/testEigenGPUNoFit.cu | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index 2112f5f6027a5..ead2e3cc00504 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -83,7 +83,7 @@ void testMultiply() { kernelMultiply<<<1,1>>>(JGPU, CGPU, multiply_resultGPU); cudaDeviceSynchronize(); - cudaMemcpy(multiply_resultGPUret, multiply_resultGPU, + cudaMemcpy(multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost); printIt(multiply_resultGPUret); assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret))); @@ -91,7 +91,10 @@ void testMultiply() { void testInverse3x3() { std::cout << "TEST INVERSE 3x3" << std::endl; - Matrix3d m = Matrix3d::Random(); + Matrix3d m; + fillMatrix(m); + m += m.transpose().eval(); + Matrix3d m_inv = m.inverse(); Matrix3d *mGPU = nullptr; Matrix3d *mGPUret = nullptr; @@ -117,7 +120,10 @@ void testInverse3x3() { void testInverse4x4() { std::cout << "TEST INVERSE 4x4" << std::endl; - Matrix4d m = Matrix4d::Random(); + Matrix4d m; + fillMatrix(m); + m += m.transpose().eval(); + Matrix4d m_inv = m.inverse(); Matrix4d *mGPU = nullptr; Matrix4d *mGPUret = nullptr; @@ -143,9 +149,10 @@ void testInverse4x4() { void testEigenvalues() { std::cout << "TEST EIGENVALUES" << std::endl; - Matrix3d m = Matrix3d::Random(); - Matrix3d mt = m.transpose(); - m += mt; + Matrix3d m; + fillMatrix(m); + m += m.transpose().eval(); + Matrix3d * m_gpu = nullptr; Matrix3d * mgpudebug = new Matrix3d(); Eigen::SelfAdjointEigenSolver::RealVectorType *ret = new Eigen::SelfAdjointEigenSolver::RealVectorType; From f932567a5c2d59177d9f3f6e4ea7c1f6f24aef00 Mon Sep 17 00:00:00 2001 From: Marco Rovere Date: Thu, 27 Sep 2018 02:15:13 +0200 Subject: [PATCH 027/102] Riemann fit multiple scattering (cms-patatrack#174) Implement the multiple scattering treatments in the Riemann Fit. 
In particular: - modify the previous implementation of the multiple scattering in the circle fit to correctly cover both the barrel and the forward case; - implement the multiple scattering in the line fit in the S-Z plane both for the barrel and the forward case. The effective radiation length is still an approximate value since the phi angle is not taken into account (it is not known on a layer-by-layer case). Ad ad-hoc correction based on the inverse of the pt has been added, with a cut-off of 1 GeV. The pulls are ok-ish, the material could be further tuned. The Chi2 is flat on all eta range. --- .../PixelTrackFitting/interface/RiemannFit.h | 195 ++++++++++++++++-- 1 file changed, 174 insertions(+), 21 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 0eed4d7a12faf..ac5e82b542e73 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -5,6 +5,8 @@ #include #include +#include + #ifndef RFIT_DEBUG #define RFIT_DEBUG 0 #endif // RFIT_DEBUG @@ -115,6 +117,107 @@ __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) return a.x() * b.y() - a.y() * b.x(); } + +__host__ __device__ inline void computeRadLenEff(const Vector4d& fast_fit, + const double B, + double & radlen_eff, + double & theta, + bool & in_forward) { + double X_barrel = 0.015; + double X_forward = 0.05; + theta = atan(fast_fit(3)); + // atan returns values in [-pi/2, pi/2], we need [0, pi] + theta = theta < 0. ? theta + M_PI : theta; + radlen_eff = X_barrel / std::abs(sin(theta)); + in_forward = (theta <= 0.398 or theta >= 2.743); + if (in_forward) + radlen_eff = X_forward / std::abs(cos(theta)); + assert(radlen_eff > 0.); + double p_t = fast_fit(2) * B; + // We have also to correct the radiation lenght in the x-y plane. Since we + // do not know the angle of incidence of the track at this point, we + // arbitrarily set the correction proportional to the inverse of the + // transerse momentum. The cut-off is at 1 Gev, set using a single Muon Pt + // gun and verifying that, at that momentum, not additional correction is, + // in fact, needed. This is an approximation. + if (std::abs(p_t/1.) < 1.) + radlen_eff /= std::abs(p_t/1.); +} + +/*! + \brief Compute the covariance matrix along cartesian S-Z of points due to + multiple Coulomb scattering to be used in the line_fit, for the barrel + and forward cases. + + */ +__host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, + const Vector4d& fast_fit, + VectorNd const& s_arcs, + VectorNd const& z_values, + const double B) +{ +#if RFIT_DEBUG + Rfit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); +#endif + u_int n = s_arcs.rows(); + double p_t = fast_fit(2) * B; + double p_2 = p_t * p_t * (1. + 1. 
/ (fast_fit(3) * fast_fit(3))); + double radlen_eff = 0.; + double theta = 0.; + bool in_forward = false; + computeRadLenEff(fast_fit, B, radlen_eff, theta, in_forward); + + const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff; + for (u_int k = 0; k < n; ++k) + { + for (u_int l = k; l < n; ++l) + { + for (u_int i = 0; i < std::min(k, l); ++i) + { +#if RFIT_DEBUG + printf("Scatter_cov_line - B: %f\n", B); + printf("Scatter_cov_line - radlen_eff: %f, p_t: %f, p2: %f\n", radlen_eff, p_t, p_2); + printf("Scatter_cov_line - sig2:%f, theta: %f\n", sig2, theta); + printf("Scatter_cov_line - Adding to element %d, %d value %f\n", n + k, n + l, (s_arcs(k) - s_arcs(i)) * (s_arcs(l) - s_arcs(i)) * sig2 / sqr(sqr(sin(theta)))); +#endif + if (in_forward) { + cov_sz(k, l) += (z_values(k) - z_values(i)) * (z_values(l) - z_values(i)) * sig2 / sqr(sqr(cos(theta))); + cov_sz(l, k) = cov_sz(k, l); + } else { + cov_sz(n + k, n + l) += (s_arcs(k) - s_arcs(i)) * (s_arcs(l) - s_arcs(i)) * sig2 / sqr(sqr(sin(theta))); + cov_sz(n + l, n + k) = cov_sz(n + k, n + l); + } + } + } + } +#if RFIT_DEBUG + Rfit::printIt(&cov_sz, "Scatter_cov_line - cov_sz: "); +#endif + Matrix2Nd rot = MatrixXd::Zero(2 * n, 2 * n); + for (u_int i = 0; i < n; ++i) { + rot(i, i) = cos(theta); + rot(n + i, n + i) = cos(theta); + u_int j = (i + n); + // Signs seem to be wrong for the off-diagonal element, but we are + // inverting x-y in the input vector, since theta is the angle between + // the z axis and the line, and we are putting the s values, which are Y, + // in the first position. A simple sign flip will take care of it. + rot(i, j) = i < j ? sin(theta) : -sin(theta); + } + +#if RFIT_DEBUG + Rfit::printIt(&rot, "Scatter_cov_line - rot: "); +#endif + + Matrix2Nd tmp = rot*cov_sz*rot.transpose(); + // We are interested only in the errors in the rotated s -axis which, in + // our formalism, are in the upper square matrix. +#if RFIT_DEBUG + Rfit::printIt(&tmp, "Scatter_cov_line - tmp: "); +#endif + return tmp.block(0, 0, n, n); +} + /*! \brief Compute the covariance matrix (in radial coordinates) of points in the transverse plane due to multiple Coulomb scattering. @@ -141,11 +244,12 @@ __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, double B) { u_int n = p2D.cols(); - double X = 0.04; - double theta = atan(fast_fit(3)); - double radlen_eff = X * sqrt(fast_fit(3) * fast_fit(3) + 1); double p_t = fast_fit(2) * B; double p_2 = p_t * p_t * (1. + 1. 
/ (fast_fit(3) * fast_fit(3))); + double radlen_eff = 0.; + double theta = 0.; + bool in_forward = false; + computeRadLenEff(fast_fit, B, radlen_eff, theta, in_forward); MatrixNd scatter_cov_rad = MatrixXd::Zero(n, n); const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff; @@ -155,13 +259,18 @@ __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, { for (u_int i = 0; i < std::min(k, l); ++i) { + if (in_forward) { + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(cos(theta)); + } else { scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(sin(theta)); - scatter_cov_rad(l, k) = scatter_cov_rad(k, l); + } + scatter_cov_rad(l, k) = scatter_cov_rad(k, l); } } } +#if RFIT_DEBUG Rfit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); - +#endif return scatter_cov_rad; } @@ -960,9 +1069,12 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, Matrix2xNd p2D(2, n); MatrixNx5d Jx(n, 5); +#if RFIT_DEBUG + printf("Line_fit - B: %g\n", B); + printIt(&hits, "Line_fit points: "); printIt(&hits_cov, "Line_fit covs: "); - +#endif // x & associated Jacobian // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf // Slide 11 @@ -1000,16 +1112,43 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, p2D.row(1) = hits.row(2); // WEIGHT COMPUTATION + Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); VectorNd x_err2 = X_err2(hits_cov, circle, Jx, error, n); VectorNd y_err2 = hits_cov.block(2 * n, 2 * n, n, n).diagonal(); + cov_sz.block(0, 0, n, n) = x_err2.asDiagonal(); + cov_sz.block(n, n, n, n) = y_err2.asDiagonal(); +#if RFIT_DEBUG + printIt(&cov_sz, "line_fit - cov_sz:"); +#endif + MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), B); +#if RFIT_DEBUG + printIt(&cov_with_ms, "line_fit - cov_with_ms: "); +#endif + Matrix4d G, G4; + G4 = cov_with_ms.inverse(); +#if RFIT_DEBUG + printIt(&G4, "line_fit - cov_with_ms.inverse():"); +#endif + double renorm = G4.sum(); + G4 *= 1. / renorm; +#if RFIT_DEBUG + printIt(&G4, "line_fit - G4:"); +#endif + G = G4; + const VectorNd weight = Weight_circle(G); + - const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); - const VectorNd weight = err2_inv * 1. / err2_inv.sum(); + VectorNd err2_inv = cov_with_ms.diagonal(); + err2_inv = err2_inv.cwiseInverse(); +// const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); +// const VectorNd weight = err2_inv * 1. / err2_inv.sum(); +#if RFIT_DEBUG printIt(&x_err2, "Line_fit - x_err2: "); printIt(&y_err2, "Line_fit - y_err2: "); printIt(&err2_inv, "Line_fit - err2_inv: "); printIt(&weight, "Line_fit - weight: "); +#endif // COST FUNCTION @@ -1020,17 +1159,23 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, // scatter matrix S = X^T * X const Matrix2xNd X = p2D.colwise() - r0; Matrix2d A = Matrix2d::Zero(); - for (u_int i = 0; i < n; ++i) - { - A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); - } + A = X * G * X.transpose(); +// for (u_int i = 0; i < n; ++i) +// { +// A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); +// } +#if RFIT_DEBUG printIt(&A, "Line_fit - A: "); +#endif // minimize double chi2; Vector2d v = min_eigen2D(A, chi2); +#if RFIT_DEBUG printIt(&v, "Line_fit - v: "); + printf("Line_fit chi2: %e\n", chi2); +#endif // n *= (chi2>0) ? 
1 : -1; //TO FIX // This hack to be able to run on GPU where the automatic assignment to a @@ -1044,7 +1189,10 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, line.par << -v(0) / v(1), // cotan(theta)) -c * sqrt(sqr(v(0)) + sqr(v(1))) * 1. / v(1); // Zip line.chi2 = abs(chi2); +#if RFIT_DEBUG printIt(&(line.par), "Line_fit - line.par: "); + printf("Line_fit - v norm: %e\n", sqrt(v(0)*v(0) + v(1)*v(1))); +#endif // ERROR PROPAGATION if (error) @@ -1054,16 +1202,13 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, Matrix3d C; // cov(v,c) { - double norm_chernov = 0.; - for (u_int i = 0; i < n; ++i) - norm_chernov += err2_inv(i) * (v(0) * p2D(0, i) + v(1) * p2D(1, i) + c) * (v(0) * p2D(0, i) + v(1) * p2D(1, i) + c); - norm_chernov /= float(n); - // Indeed it should read: - // * compute the average error in the orthogonal direction: err2_inv.cwiseInverse().sum()/sqr(n) - // * normalize the A(0,0)+A(1,1) dividing by err2_inv.sum(), since those have been weighted - const double norm = (err2_inv.cwiseInverse().sum()) * err2_inv.sum() * 1. / sqr(n); + // The norm is taken from Chernov, properly adapted to the weights case. + double norm = v.transpose() * A * v; + norm /= weight.sum(); +#if RFIT_DEBUG + printf("Line_fit - norm: %e\n", norm); +#endif const double sig2 = 1. / (A(0, 0) + A(1, 1)) * norm; - // const double sig2 = 1. / (A(0, 0) + A(1, 1)); C(0, 0) = sig2 * v1_2; C(1, 1) = sig2 * v0_2; C(0, 1) = C(1, 0) = -sig2 * v(0) * v(1); @@ -1073,6 +1218,9 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, Matrix tmp = (r0.transpose() * C.block(0, 0, 2, 2) * r0); C(2, 2) = v0_2 * C0(0) + v1_2 * C0(1) + C0(0) * C(0, 0) + C0(1) * C(1, 1) + tmp(0, 0); } +#if RFIT_DEBUG + printIt(&C, "line_fit - C:"); +#endif Matrix J; // Jacobian of (v,c) -> (cotan(theta)),Zip) { @@ -1083,10 +1231,15 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, J << -t0, v(0) * t1, 0, -c * v(0) * t0 * t2, v0_2 * c * t1 * t2, -sqrt_ * t0; } Matrix JT = J.transpose().eval(); +#if RFIT_DEBUG + printIt(&J, "line_fit - J:"); +#endif line.cov = J * C * JT; } +#if RFIT_DEBUG printIt(&line.cov, "Line cov:"); +#endif return line; } From e1c1a7ee8a3d2e735ec680d505c372561b818d4c Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 6 Oct 2018 14:29:06 +0200 Subject: [PATCH 028/102] Suppress asserts in the GPU code, unless GPU_DEBUG is defined (cms-patatrack#186) --- RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h | 4 +++- RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h | 5 +++-- RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index ac5e82b542e73..6de1c77bbac12 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -1,11 +1,13 @@ #ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h #define RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h +#include + #include #include #include -#include +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #ifndef RFIT_DEBUG #define RFIT_DEBUG 0 diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 772b802282d31..43fcd88fa30de 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -6,9 +6,10 @@ #include -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" struct Quadruplet { using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 61d048637585c..d4b44f64573c6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -2,17 +2,17 @@ #define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h #include -#include #include #include #include #include #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "GPUCACell.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" namespace gpuPixelDoublets { From e14fd83af47c62dccd9dc6d9e2f29b807e262857 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 8 Nov 2018 09:09:11 +0100 Subject: [PATCH 029/102] Riemann fit rework (cms-patatrack#190) The Riemann Fit has been reworked so that both barrel and forward cases are naturally supported without branching. The underlying assumption is the uniform material distribution within the Pixel Tracker. The line fit has been reworked and is now using an ordinary least square fit in the S-Z plane. See the motivations and explanations inside the comments in the code. Additional changes: - code clean up - remove unused functions - fix standalone test of RiemannFit on GPU --- .../PixelTrackFitting/interface/RiemannFit.h | 510 +++++++++++------- .../test/PixelTrackRiemannFit.cc | 388 ++++++++----- .../PixelTrackFitting/test/testEigenGPU.cu | 137 +---- 3 files changed, 581 insertions(+), 454 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 6de1c77bbac12..33f8334c8b5a5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -47,7 +47,7 @@ struct circle_fit |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n |cov(X0, R)|cov(Y0, R)|cov( R, R)| - */ + */ int64_t q; //!< particle charge double chi2 = 0.0; }; @@ -78,7 +78,6 @@ struct helix_fit double chi2_line = 0.0; Vector4d fast_fit; int64_t q; //!< particle charge - // VectorXd time; // TO FIX just for profiling } __attribute__((aligned(16))); template @@ -119,31 +118,40 @@ __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) return a.x() * b.y() - a.y() * b.x(); } +/*! Compute the Radiation length in the uniform hypothesis + * + * The Pixel detector, barrel and forward, is considered as an omogeneous + * cilinder of material, whose radiation lengths has been derived from the TDR + * plot that shows that 16cm correspond to 0.06 radiation lengths. 
Therefore
+ * one radiation length corresponds to 16cm/0.06 =~ 267 cm. All radiation
+ * lengths are computed using this unique number, in both regions, barrel and
+ * endcap.
+ *
+ * NB: no angle corrections nor projections are computed inside this routine.
+ * It is therefore the responsibility of the caller to supply the proper
+ * lengths in input. These lengths are the path travelled by the particle along
+ * its trajectory, namely the so called S of the helix in 3D space.
+ *
+ * \param length_values vector of incremental distances that will be translated
+ * into radiation length equivalent. Each radiation length i is computed
+ * incrementally with respect to the previous length i-1. The first length has
+ * no reference point (i.e. it has the dca).
+ *
+ * \return incremental radiation lengths that correspond to each segment.
+ */
-__host__ __device__ inline void computeRadLenEff(const Vector4d& fast_fit,
- const double B,
- double & radlen_eff,
- double & theta,
- bool & in_forward) {
- double X_barrel = 0.015;
- double X_forward = 0.05;
- theta = atan(fast_fit(3));
- // atan returns values in [-pi/2, pi/2], we need [0, pi]
- theta = theta < 0. ? theta + M_PI : theta;
- radlen_eff = X_barrel / std::abs(sin(theta));
- in_forward = (theta <= 0.398 or theta >= 2.743);
- if (in_forward)
- radlen_eff = X_forward / std::abs(cos(theta));
- assert(radlen_eff > 0.);
- double p_t = fast_fit(2) * B;
- // We have also to correct the radiation lenght in the x-y plane. Since we
- // do not know the angle of incidence of the track at this point, we
- // arbitrarily set the correction proportional to the inverse of the
- // transerse momentum. The cut-off is at 1 Gev, set using a single Muon Pt
- // gun and verifying that, at that momentum, not additional correction is,
- // in fact, needed. This is an approximation.
- if (std::abs(p_t/1.) < 1.)
- radlen_eff /= std::abs(p_t/1.);
+__host__ __device__ inline
+void computeRadLenUniformMaterial(const VectorNd &length_values,
+ VectorNd & rad_lengths) {
+ // Radiation length of the pixel detector in the uniform assumption, with
+ // 0.06 rad_len at 16 cm
+ const double XX_0 = 16.f/(0.06);
+// const double XX_0 = 1000.*16.f/(0.06);
+ u_int n = length_values.rows();
+ rad_lengths(0) = length_values(0)/XX_0;
+ for (u_int j = 1; j < n; ++j) {
+ rad_lengths(j) = std::abs(length_values(j)-length_values(j-1))/XX_0;
+ }
 }
 /*!
@@ -151,11 +159,29 @@ void computeRadLenUniformMaterial(const VectorNd &length_values,
 multiple Coulomb scattering to be used in the line_fit, for the barrel
 and forward cases.
+ The input covariance matrix is in the variables s-z, original and
+ unrotated.
+
+ The multiple scattering component is computed in the usual linear
+ approximation, using the 3D path which is computed as the square root of
+ the squared sum of the s and z components passed in.
+
+ Internally a rotation by theta is performed and the covariance matrix
+ returned is the one in the direction orthogonal to the rotated S3D axis,
+ i.e. along the rotated Z axis.
+
+ The choice of the rotation is not arbitrary, but derived from the fact that
+ putting the horizontal axis along the S3D direction allows the usage of the
+ ordinary least squares fitting techniques with the trivial parametrization y
+ = mx + q, avoiding the pathological case with m = +/- inf, that would
+ correspond to the case at eta = 0.
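To make the last point concrete: once the points are expressed in the rotated frame, the problem really is a textbook weighted least-squares fit. Below is a minimal standalone sketch (plain Eigen, with hypothetical coordinates and variances rather than the Rfit types used in this file) of the normal-equation solution for y = q + m*x; the reworked Line_fit further below performs the same algebra, with the full covariance matrix returned by Scatter_cov_line in place of the diagonal variances.

#include <Eigen/Dense>
#include <cstdio>

int main() {
  constexpr int n = 4;
  Eigen::Matrix<double, n, 1> x, y, var;
  x   << 3.0, 7.1, 11.6, 16.2;     // rotated S3D values (assumed)
  y   << 0.12, 0.25, 0.41, 0.55;   // rotated z' values (assumed)
  var << 1e-4, 2e-4, 4e-4, 9e-4;   // variances orthogonal to S3D (assumed)

  Eigen::Matrix<double, 2, n> A;   // design matrix for y = q + m*x
  A.row(0).setOnes();
  A.row(1) = x.transpose();
  Eigen::Matrix<double, n, n> Vinv = var.cwiseInverse().asDiagonal();

  // Normal equations: cov(q,m) = (A V^-1 A^T)^-1 and (q,m) = cov * A V^-1 y
  Eigen::Matrix2d cov_qm = (A * Vinv * A.transpose()).inverse();
  Eigen::Vector2d qm = cov_qm * A * Vinv * y;
  std::printf("q = %g  m = %g\n", qm(0), qm(1));
}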
*/ + __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, const Vector4d& fast_fit, VectorNd const& s_arcs, VectorNd const& z_values, + const double theta, const double B) { #if RFIT_DEBUG @@ -164,47 +190,24 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, u_int n = s_arcs.rows(); double p_t = fast_fit(2) * B; double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); - double radlen_eff = 0.; - double theta = 0.; - bool in_forward = false; - computeRadLenEff(fast_fit, B, radlen_eff, theta, in_forward); - - const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff; - for (u_int k = 0; k < n; ++k) - { - for (u_int l = k; l < n; ++l) - { - for (u_int i = 0; i < std::min(k, l); ++i) - { -#if RFIT_DEBUG - printf("Scatter_cov_line - B: %f\n", B); - printf("Scatter_cov_line - radlen_eff: %f, p_t: %f, p2: %f\n", radlen_eff, p_t, p_2); - printf("Scatter_cov_line - sig2:%f, theta: %f\n", sig2, theta); - printf("Scatter_cov_line - Adding to element %d, %d value %f\n", n + k, n + l, (s_arcs(k) - s_arcs(i)) * (s_arcs(l) - s_arcs(i)) * sig2 / sqr(sqr(sin(theta)))); -#endif - if (in_forward) { - cov_sz(k, l) += (z_values(k) - z_values(i)) * (z_values(l) - z_values(i)) * sig2 / sqr(sqr(cos(theta))); - cov_sz(l, k) = cov_sz(k, l); - } else { - cov_sz(n + k, n + l) += (s_arcs(k) - s_arcs(i)) * (s_arcs(l) - s_arcs(i)) * sig2 / sqr(sqr(sin(theta))); - cov_sz(n + l, n + k) = cov_sz(n + k, n + l); - } - } - } - } + VectorNd rad_lengths_S(n); + // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html + // Basically, to perform cwise operations on Matrices and Vectors, you need + // to transform them into Array-like objects. + VectorNd S_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); + S_values = S_values.array().sqrt(); + computeRadLenUniformMaterial(S_values, rad_lengths_S); + VectorNd sig2_S(n); + sig2_S = .000225 / p_2 * (1.f + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); #if RFIT_DEBUG Rfit::printIt(&cov_sz, "Scatter_cov_line - cov_sz: "); #endif Matrix2Nd rot = MatrixXd::Zero(2 * n, 2 * n); for (u_int i = 0; i < n; ++i) { - rot(i, i) = cos(theta); - rot(n + i, n + i) = cos(theta); + rot(i, i) = sin(theta); + rot(n + i, n + i) = sin(theta); u_int j = (i + n); - // Signs seem to be wrong for the off-diagonal element, but we are - // inverting x-y in the input vector, since theta is the angle between - // the z axis and the line, and we are putting the s values, which are Y, - // in the first position. A simple sign flip will take care of it. - rot(i, j) = i < j ? sin(theta) : -sin(theta); + rot(i, j) = i < j ? cos(theta) : -cos(theta); } #if RFIT_DEBUG @@ -212,12 +215,23 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, #endif Matrix2Nd tmp = rot*cov_sz*rot.transpose(); - // We are interested only in the errors in the rotated s -axis which, in - // our formalism, are in the upper square matrix. + for (u_int k = 0; k < n; ++k) + { + for (u_int l = k; l < n; ++l) + { + for (u_int i = 0; i < std::min(k, l); ++i) + { + tmp(k + n, l + n) += std::abs(S_values(k) - S_values(i)) * std::abs(S_values(l) - S_values(i)) * sig2_S(i); + tmp(l + n, k + n) = tmp(k + n, l + n); + } + } + } + // We are interested only in the errors orthogonal to the rotated s-axis + // which, in our formalism, are in the lower square matrix. 
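As a toy check of the rotation just applied, here is a standalone sketch with assumed numbers, reduced to a single hit so the matrices are 2x2; the routine above does the same with 2n x 2n blocks and then keeps only the lower n x n block.

#include <Eigen/Dense>
#include <cmath>
#include <cstdio>

int main() {
  const double theta = 0.7;            // assumed inclination angle
  Eigen::Matrix2d cov_sz = Eigen::Matrix2d::Zero();
  cov_sz(0, 0) = 0.01;                 // sigma_s^2 (assumed)
  cov_sz(1, 1) = 0.04;                 // sigma_z^2 (assumed)
  Eigen::Matrix2d rot;                 // same convention as above: diagonal sin, off-diagonal +/- cos
  rot << std::sin(theta), std::cos(theta),
        -std::cos(theta), std::sin(theta);
  Eigen::Matrix2d tmp = rot * cov_sz * rot.transpose();
  // With n hits the function returns tmp.block(n, n, n, n); for n = 1 that is
  // just tmp(1, 1), the variance along the rotated Z axis (orthogonal to S3D).
  std::printf("orthogonal variance: %g\n", tmp(1, 1));
}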
#if RFIT_DEBUG Rfit::printIt(&tmp, "Scatter_cov_line - tmp: "); #endif - return tmp.block(0, 0, n, n); + return tmp.block(n, n, n, n); } /*! @@ -233,13 +247,11 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, \warning input points must be ordered radially from the detector center (from inner layer to outer ones; points on the same layer must ordered too). - \bug currently works only for points in the barrel. \details Only the tangential component is computed (the radial one is negligible). */ -// X in input TO FIX __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, const Vector4d& fast_fit, VectorNd const& rad, @@ -248,24 +260,32 @@ __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, u_int n = p2D.cols(); double p_t = fast_fit(2) * B; double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); - double radlen_eff = 0.; - double theta = 0.; - bool in_forward = false; - computeRadLenEff(fast_fit, B, radlen_eff, theta, in_forward); + double theta = atan(fast_fit(3)); + theta = theta < 0. ? theta + M_PI : theta; + VectorNd s_values(n); + VectorNd rad_lengths(n); + const Vector2d o(fast_fit(0), fast_fit(1)); + // associated Jacobian, used in weights and errors computation + for (u_int i = 0; i < n; ++i) + { // x + Vector2d p = p2D.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + const double atan2_ = atan2(cross, dot); + s_values(i) = std::abs(atan2_ * fast_fit(2)); + } + computeRadLenUniformMaterial(s_values*sqrt(1. + 1./(fast_fit(3)*fast_fit(3))), rad_lengths); MatrixNd scatter_cov_rad = MatrixXd::Zero(n, n); - const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff; + VectorNd sig2(n); + sig2 = .000225 / p_2 * (1.f + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); for (u_int k = 0; k < n; ++k) { for (u_int l = k; l < n; ++l) { for (u_int i = 0; i < std::min(k, l); ++i) { - if (in_forward) { - scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(cos(theta)); - } else { - scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(sin(theta)); - } + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i) / (sqr(sin(theta))); scatter_cov_rad(l, k) = scatter_cov_rad(k, l); } } @@ -409,23 +429,6 @@ __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) return cov_rad_inv.colwise().sum().transpose(); } -/*! - \brief Compute the points' weights' vector for the line fit (ODR). - Results from a pre-fit is needed in order to take the orthogonal (to the - line) component of the errors. - - \param x_err2 squared errors in the x axis. - \param y_err2 squared errors in the y axis. - \param tan_theta tangent of theta (angle between y axis and line). - - \return weight points' weights' vector for the line fit (ODR). -*/ - -__host__ __device__ inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y_err2, const double& tan_theta) -{ - return (1. + sqr(tan_theta)) * 1. / (x_err2 + y_err2 * sqr(tan_theta)); -} - /*! \brief Find particle q considering the sign of cross product between particles velocity (estimated by the first 2 hits) and the vector radius @@ -470,40 +473,6 @@ __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, circle.par = par_pak; } -/*! - \brief Compute the error propagation to obtain the square errors in the - x axis for the line fit. 
If errors have not been computed in the circle fit - than an'approximation is made. - Further information in attached documentation. - - \param V hits' covariance matrix. - \param circle result of the previous circle fit (only the covariance matrix - is needed) TO FIX - \param J Jacobian of the transformation producing x values. - \param error flag for error computation. - - \return x_err2 squared errors in the x axis. -*/ - -__host__ __device__ inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, - const bool error, u_int n) -{ - VectorNd x_err2(n); - for (u_int i = 0; i < n; ++i) - { - Matrix5d Cov = MatrixXd::Zero(5, 5); - if (error) - Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = V(i, i); - Cov(4, 4) = V(i + n, i + n); - Cov(3, 4) = Cov(4, 3) = V(i, i + n); - Eigen::Matrix tmp; - tmp = J.row(i) * Cov * J.row(i).transpose().eval(); - x_err2(i) = tmp(0, 0); - } - return x_err2; -} - /*! \brief Compute the eigenvector associated to the minimum eigenvalue. @@ -1004,7 +973,7 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, { const double t = 1. / h; J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, - 0, 0, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; + v(0)*v2x2_inv*t, v(1)*v2x2_inv*t, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; } printIt(&J3, "circle_fit - J3:"); @@ -1059,21 +1028,22 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, errors. */ -__host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const circle_fit& circle, - const Vector4d& fast_fit, - const double B, - const bool error = true) +__host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const circle_fit& circle, + const Vector4d& fast_fit, + const double B, + const bool error = true) { u_int n = hits.cols(); + double theta = -circle.q*atan(fast_fit(3)); + theta = theta < 0. ? theta + M_PI : theta; // PROJECTION ON THE CILINDER - Matrix2xNd p2D(2, n); - MatrixNx5d Jx(n, 5); + Matrix2xNd p2D = MatrixXd::Zero(2, n); + Eigen::Matrix Jx; #if RFIT_DEBUG printf("Line_fit - B: %g\n", B); - printIt(&hits, "Line_fit points: "); printIt(&hits_cov, "Line_fit covs: "); #endif @@ -1085,8 +1055,11 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, const Vector2d o(circle.par(0), circle.par(1)); // associated Jacobian, used in weights and errors computation + Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); for (u_int i = 0; i < n; ++i) { // x + Matrix6d Cov = MatrixXd::Zero(6, 6); + Matrix2d Cov_sz_single = MatrixXd::Zero(2, 2); Vector2d p = hits.block(0, i, 2, 1) - o; const double cross = cross2D(-o, p); const double dot = (-o).dot(p); @@ -1095,9 +1068,9 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, const double atan2_ = -circle.q * atan2(cross, dot); p2D(0, i) = atan2_ * circle.par(2); - // associated Jacobian, used in weights and errors computation + // associated Jacobian, used in weights and errors- computation const double temp0 = -circle.q * circle.par(2) * 1. 
/ (sqr(dot) + sqr(cross)); - double d_X0 = 0, d_Y0 = 0, d_R = 0.; // good approximation for big pt and eta + double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta if (error) { d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); @@ -1106,7 +1079,19 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, } const double d_x = temp0 * (o(1) * dot + o(0) * cross); const double d_y = temp0 * (-o(0) * dot + o(1) * cross); - Jx.row(i) << d_X0, d_Y0, d_R, d_x, d_y; + Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; +// Jx << d_X0, d_Y0, d_R, p(1)/p.norm(), -p(0)/p.norm(), 0, 0, 0, 0, 0, 0, 1.; + Cov.block(0, 0, 3, 3) = circle.cov; + Cov(3, 3) = hits_cov(i, i); + Cov(4, 4) = hits_cov(i + n, i + n); + Cov(5, 5) = hits_cov(i + 2*n, i + 2*n); + Cov(3, 4) = Cov(4, 3) = hits_cov(i, i + n); + Cov(3, 5) = Cov(5, 3) = hits_cov(i, i + 2*n); + Cov(4, 5) = Cov(5, 4) = hits_cov(i + n, i + 2*n); + Cov_sz_single = Jx * Cov * Jx.transpose(); + cov_sz(i, i) = Cov_sz_single(0, 0); + cov_sz(i + n, i + n) = Cov_sz_single(1, 1); + cov_sz(i, i + n) = cov_sz(i + n, i) = Cov_sz_single(0, 1); } // Math of d_{X0,Y0,R,x,y} all verified by hand @@ -1114,43 +1099,25 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, p2D.row(1) = hits.row(2); // WEIGHT COMPUTATION - Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); - VectorNd x_err2 = X_err2(hits_cov, circle, Jx, error, n); - VectorNd y_err2 = hits_cov.block(2 * n, 2 * n, n, n).diagonal(); - cov_sz.block(0, 0, n, n) = x_err2.asDiagonal(); - cov_sz.block(n, n, n, n) = y_err2.asDiagonal(); #if RFIT_DEBUG printIt(&cov_sz, "line_fit - cov_sz:"); #endif - MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), B); + MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B); #if RFIT_DEBUG printIt(&cov_with_ms, "line_fit - cov_with_ms: "); #endif - Matrix4d G, G4; - G4 = cov_with_ms.inverse(); + Matrix4d G; + G = cov_with_ms.inverse(); #if RFIT_DEBUG - printIt(&G4, "line_fit - cov_with_ms.inverse():"); + printIt(&G, "line_fit - cov_with_ms.inverse():"); #endif - double renorm = G4.sum(); - G4 *= 1. / renorm; + double renorm = G.sum(); + G *= 1. / renorm; #if RFIT_DEBUG - printIt(&G4, "line_fit - G4:"); + printIt(&G, "line_fit - G4:"); #endif - G = G4; - const VectorNd weight = Weight_circle(G); - - VectorNd err2_inv = cov_with_ms.diagonal(); - err2_inv = err2_inv.cwiseInverse(); -// const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); -// const VectorNd weight = err2_inv * 1. / err2_inv.sum(); - -#if RFIT_DEBUG - printIt(&x_err2, "Line_fit - x_err2: "); - printIt(&y_err2, "Line_fit - y_err2: "); - printIt(&err2_inv, "Line_fit - err2_inv: "); - printIt(&weight, "Line_fit - weight: "); -#endif + const VectorNd weight = Weight_circle(G); // COST FUNCTION @@ -1162,16 +1129,12 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, const Matrix2xNd X = p2D.colwise() - r0; Matrix2d A = Matrix2d::Zero(); A = X * G * X.transpose(); -// for (u_int i = 0; i < n; ++i) -// { -// A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); -// } #if RFIT_DEBUG printIt(&A, "Line_fit - A: "); #endif - // minimize + // minimize. v is normalized!! double chi2; Vector2d v = min_eigen2D(A, chi2); #if RFIT_DEBUG @@ -1179,7 +1142,6 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, printf("Line_fit chi2: %e\n", chi2); #endif - // n *= (chi2>0) ? 
1 : -1; //TO FIX // This hack to be able to run on GPU where the automatic assignment to a // double from the vector multiplication is not working. Matrix cm; @@ -1189,8 +1151,8 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, // COMPUTE LINE PARAMETER line_fit line; line.par << -v(0) / v(1), // cotan(theta)) - -c * sqrt(sqr(v(0)) + sqr(v(1))) * 1. / v(1); // Zip - line.chi2 = abs(chi2); + -c / v(1); // Zip + line.chi2 = abs(chi2*renorm); #if RFIT_DEBUG printIt(&(line.par), "Line_fit - line.par: "); printf("Line_fit - v norm: %e\n", sqrt(v(0)*v(0) + v(1)*v(1))); @@ -1206,19 +1168,21 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, { // The norm is taken from Chernov, properly adapted to the weights case. double norm = v.transpose() * A * v; - norm /= weight.sum(); +// double norm_empirical = cov_with_ms.diagonal().mean(); #if RFIT_DEBUG + printf("Chi_2: %g\n", chi2); + printf("Norm: %g\n", norm); + printf("weight.sum(): %g\n", weight.sum()); printf("Line_fit - norm: %e\n", norm); #endif - const double sig2 = 1. / (A(0, 0) + A(1, 1)) * norm; + + const double sig2 = norm/(A(0,0) + A(1,1)); C(0, 0) = sig2 * v1_2; C(1, 1) = sig2 * v0_2; - C(0, 1) = C(1, 0) = -sig2 * v(0) * v(1); - const VectorNd weight_2 = (weight).array().square(); - const Vector2d C0(weight_2.dot(x_err2), weight_2.dot(y_err2)); - C.block(0, 2, 2, 1) = C.block(2, 0, 1, 2).transpose() = -C.block(0, 0, 2, 2) * r0; - Matrix tmp = (r0.transpose() * C.block(0, 0, 2, 2) * r0); - C(2, 2) = v0_2 * C0(0) + v1_2 * C0(1) + C0(0) * C(0, 0) + C0(1) * C(1, 1) + tmp(0, 0); + C(1, 0) = C(0, 1) = -sig2 * v(0) * v(1); + C(2, 2) = sig2 * (v(0)*r0(1)-v(1)*r0(0))*(v(0)*r0(1)-v(1)*r0(0)) + (sig2/n)*(A(0,0)+A(1,1)); + C(0, 2) = C(2, 0) = sig2*(v(0)*r0(1)-v(1)*r0(0))*v(1); + C(1, 2) = C(2, 1) = - sig2*(v(0)*r0(1)-v(1)*r0(0))*v(0); } #if RFIT_DEBUG printIt(&C, "line_fit - C:"); @@ -1228,9 +1192,7 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, { const double t0 = 1. / v(1); const double t1 = sqr(t0); - const double sqrt_ = sqrt(v1_2 + v0_2); - const double t2 = 1. / sqrt_; - J << -t0, v(0) * t1, 0, -c * v(0) * t0 * t2, v0_2 * c * t1 * t2, -sqrt_ * t0; + J << -t0, v(0) * t1, 0., 0., c * t1, -t0; } Matrix JT = J.transpose().eval(); #if RFIT_DEBUG @@ -1245,6 +1207,184 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, return line; } +/*! \brief Perform an ordinary least square fit in the s-z plane to compute + * the parameters cotTheta and Zip. + * + * The fit is performed in the rotated S3D-Z' plane, following the formalism of + * Frodesen, Chapter 10, p. 259. + * + * The system has been rotated to both try to use the combined errors in s-z + * along Z', as errors in the Y direction and to avoid the patological case of + * degenerate lines with angular coefficient m = +/- inf. + * + * The rotation is using the information on the theta angle computed in the + * fast fit. The rotation is such that the S3D axis will be the X-direction, + * while the rotated Z-axis will be the Y-direction. This pretty much follows + * what is done in the same fit in the Broken Line approach. + */ + +__host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const circle_fit& circle, + const Vector4d& fast_fit, + const double B, + const bool error = true) { + auto n = hits.cols(); + double theta = -circle.q*atan(fast_fit(3)); + theta = theta < 0. ? 
theta + M_PI : theta; + + // PROJECTION ON THE CILINDER + // + // p2D will be: + // [s1, s2, s3, ..., sn] + // [z1, z2, z3, ..., zn] + // s values will be ordinary x-values + // z values will be ordinary y-values + + Matrix2xNd p2D(2, n); + Eigen::Matrix Jx; + + p2D << MatrixXd::Zero(2, n); + Jx << MatrixXd::Zero(2, 6); + +#if RFIT_DEBUG + printf("Line_fit - B: %g\n", B); + printIt(&hits, "Line_fit points: "); + printIt(&hits_cov, "Line_fit covs: "); +#endif + // x & associated Jacobian + // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf + // Slide 11 + // a ==> -o i.e. the origin of the circle in XY plane, negative + // b ==> p i.e. distances of the points wrt the origin of the circle. + const Vector2d o(circle.par(0), circle.par(1)); + + // associated Jacobian, used in weights and errors computation + Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); + Matrix6d Cov(6,6); + Matrix2d Cov_sz_single(2, 2); + for (u_int i = 0; i < n; ++i) + { + Vector2d p = hits.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + // atan2(cross, dot) give back the angle in the transverse plane so tha the + // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) + const double atan2_ = -circle.q * atan2(cross, dot); +// p2D.coeffRef(1, i) = atan2_ * circle.par(2); + p2D(0, i) = atan2_ * circle.par(2); + + // associated Jacobian, used in weights and errors- computation + const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); + double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta + if (error) + { + d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); + d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); + d_R = atan2_; + } + const double d_x = temp0 * (o(1) * dot + o(0) * cross); + const double d_y = temp0 * (-o(0) * dot + o(1) * cross); + Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; + + Cov << MatrixXd::Zero(6, 6); + Cov_sz_single << MatrixXd::Zero(2, 2); + Cov.block(0, 0, 3, 3) = circle.cov; + Cov(3, 3) = hits_cov(i, i); // x errors + Cov(4, 4) = hits_cov(i + n, i + n); // y errors + Cov(5, 5) = hits_cov(i + 2*n, i + 2*n); // z errors + Cov(3, 4) = Cov(4, 3) = hits_cov(i, i + n); // cov_xy + Cov(3, 5) = Cov(5, 3) = hits_cov(i, i + 2*n); // cov_xz + Cov(4, 5) = Cov(5, 4) = hits_cov(i + n, i + 2*n); // cov_yz + Cov_sz_single = Jx * Cov * Jx.transpose(); + cov_sz(i, i) = Cov_sz_single(0, 0); + cov_sz(i + n, i + n) = Cov_sz_single(1, 1); + cov_sz(i, i + n) = cov_sz(i + n, i) = Cov_sz_single(0, 1); + } + // Math of d_{X0,Y0,R,x,y} all verified by hand + p2D.row(1) = hits.row(2); + + // The following matrix will contain errors orthogonal to the rotated S + // component only, with the Multiple Scattering properly treated!! 
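For reference, the per-hit error propagation performed in the loop above reduces to a plain J * C * J^T sandwich. A standalone sketch, with an assumed 2x6 Jacobian and a toy 6x6 covariance of (circle parameters, hit x, y, z):

#include <Eigen/Dense>
#include <cstdio>

int main() {
  // Assumed Jacobian of (s, z) w.r.t. (X0, Y0, R, x, y, z): the last row is
  // (0, ..., 0, 1) because z is taken directly from the hit, as in the code above.
  Eigen::Matrix<double, 2, 6> J;
  J << 0.1, -0.2, 0.5, 0.8, -0.3, 0.,
       0.,   0.,  0.,  0.,   0.,  1.;
  // Toy combined covariance of circle parameters and hit coordinates.
  Eigen::Matrix<double, 6, 6> C = Eigen::Matrix<double, 6, 6>::Identity() * 1e-4;
  Eigen::Matrix2d c_sz = J * C * J.transpose();
  std::printf("var(s)=%g var(z)=%g cov(s,z)=%g\n", c_sz(0, 0), c_sz(1, 1), c_sz(0, 1));
}

The multiple scattering contribution is then folded into this covariance by the Scatter_cov_line call in the next statement.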
+ MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B); +#if RFIT_DEBUG + printIt(&cov_sz, "line_fit - cov_sz:"); + printIt(&cov_with_ms, "line_fit - cov_with_ms: "); +#endif + + // Prepare the Rotation Matrix to rotate the points + Eigen::Matrix rot = Eigen::Matrix::Zero(); + rot << sin(theta), cos(theta), -cos(theta), sin(theta); + + // Rotate Points with the shape [2, n] + Matrix2xNd p2D_rot = rot*p2D; + +#if RFIT_DEBUG + printf("Fast fit Tan(theta): %g\n", fast_fit(3)); + printf("Rotation angle: %g\n", theta); + printIt(&rot, "Rotation Matrix:"); + printIt(&p2D, "Original Hits(s,z):"); + printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); + printIt(&rot, "Rotation Matrix:"); +#endif + + // Build the A Matrix + Matrix2xNd A(2,n); + A << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values + +#if RFIT_DEBUG + printIt(&A, "A Matrix:"); +#endif + + // Build A^T V-1 A, where V-1 is the covariance of only the Y components. + MatrixNd Vy_inv = cov_with_ms.inverse(); + Eigen::Matrix Inv_Cov = A*Vy_inv*A.transpose(); + + // Compute the Covariance Matrix of the fit parameters + Eigen::Matrix Cov_params = Inv_Cov.inverse(); + + // Now Compute the Parameters in the form [2,1] + // The first component is q. + // The second component is m. + Eigen::Matrix sol = Cov_params*A*Vy_inv*p2D_rot.row(1).transpose(); + + +#if RFIT_DEBUG + printIt(&sol, "Rotated solutions:"); +#endif + + // We need now to transfer back the results in the original s-z plane + auto common_factor = 1./(sin(theta)-sol(1,0)*cos(theta)); + Matrix J = Matrix::Zero(); + J << 0., common_factor*common_factor, common_factor, sol(0,0)*cos(theta)*common_factor*common_factor; + + double m = common_factor*(sol(1,0)*sin(theta)+cos(theta)); + double q = common_factor*sol(0,0); + auto cov_mq = J * Cov_params * J.transpose(); + + VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; + double chi2 = res.transpose()*Vy_inv*res; + chi2 = chi2 / float(n); + + line_fit line; + line.par << m, q; + line.cov << cov_mq; + line.chi2 = chi2; + +#if RFIT_DEBUG + printf("Common_factor: %g\n", common_factor); + printIt(&J, "Jacobian:"); + printIt(&sol, "Rotated solutions:"); + printIt(&Cov_params, "Cov_params:"); + printIt(&cov_mq, "Rotated Covariance Matrix:"); + printIt(&(line.par), "Real Parameters:"); + printIt(&(line.cov), "Real Covariance Matrix:"); + printf("Chi2: %g\n", chi2); +#endif + + return line; +} + /*! 
\brief Helix fit by three step: -fast pre-fit (see Fast_fit() for further info); \n diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc index b27ed52473388..f71e3f082ada4 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc @@ -6,7 +6,11 @@ #include #include // unique_ptr +#include +#include + #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +//#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" using namespace std; using namespace Eigen; @@ -164,161 +168,261 @@ Matrix New_par(const Matrix& gen_par, const int& cha return new_par; } +template +void computePull(std::array & fit, const char * label, + int n_, int iteration, const Vector5d & true_par) { + Matrix score(41, iteration); + + std::string histo_name("Phi Pull"); + histo_name += label; + TH1F phi_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "dxy Pull "; + histo_name += label; + TH1F dxy_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "dz Pull "; + histo_name += label; + TH1F dz_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "Theta Pull "; + histo_name += label; + TH1F theta_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "Pt Pull "; + histo_name += label; + TH1F pt_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "Phi Error "; + histo_name += label; + TH1F phi_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "dxy error "; + histo_name += label; + TH1F dxy_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "dz error "; + histo_name += label; + TH1F dz_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "Theta error "; + histo_name += label; + TH1F theta_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "Pt error "; + histo_name += label; + TH1F pt_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + for (int x = 0; x < iteration; x++) { + // Compute PULLS information + score(0, x) = (fit[x].par(0) - true_par(0)) / sqrt(fit[x].cov(0, 0)); + score(1, x) = (fit[x].par(1) - true_par(1)) / sqrt(fit[x].cov(1, 1)); + score(2, x) = (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(2, 2)); + score(3, x) = (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(3, 3)); + score(4, x) = (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(4, 4)); + phi_pull.Fill(score(0, x)); + dxy_pull.Fill(score(1, x)); + pt_pull.Fill(score(2, x)); + theta_pull.Fill(score(3, x)); + dz_pull.Fill(score(4, x)); + phi_error.Fill(sqrt(fit[x].cov(0, 0))); + dxy_error.Fill(sqrt(fit[x].cov(1, 1))); + pt_error.Fill(sqrt(fit[x].cov(2, 2))); + theta_error.Fill(sqrt(fit[x].cov(3, 3))); + dz_error.Fill(sqrt(fit[x].cov(4, 4))); + score(5, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / (fit[x].cov(0, 1)); + score(6, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(0, 2)); + score(7, x) = + (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(1, 2)); + score(8, x) = + (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / (fit[x].cov(3, 4)); + score(9, x) = fit[x].chi2_circle; + score(25, x) = fit[x].chi2_line; + score(10, x) = sqrt(fit[x].cov(0, 0)) / fit[x].par(0) * 100; + score(13, x) = sqrt(fit[x].cov(3, 3)) / fit[x].par(3) * 100; + 
score(14, x) = sqrt(fit[x].cov(4, 4)) / fit[x].par(4) * 100; + score(15, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(3) - true_par(3)) / + sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(3, 3)); + score(16, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(3) - true_par(3)) / + sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(3, 3)); + score(17, x) = (fit[x].par(2) - true_par(2)) * (fit[x].par(3) - true_par(3)) / + sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(3, 3)); + score(18, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(4) - true_par(4)) / + sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(4, 4)); + score(19, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(4) - true_par(4)) / + sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(4, 4)); + score(20, x) = (fit[x].par(2) - true_par(2)) * (fit[x].par(4) - true_par(4)) / + sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(4, 4)); + score(21, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / + sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(1, 1)); + score(22, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / + sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(2, 2)); + score(23, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / + sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(2, 2)); + score(24, x) = (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / + sqrt(fit[x].cov(3, 3)) / sqrt(fit[x].cov(4, 4)); + score(30, x) = fit[x].par(0); + score(31, x) = fit[x].par(1); + score(32, x) = fit[x].par(2); + score(33, x) = fit[x].par(3); + score(34, x) = fit[x].par(4); + score(35, x) = sqrt(fit[x].cov(0,0)); + score(36, x) = sqrt(fit[x].cov(1,1)); + score(37, x) = sqrt(fit[x].cov(2,2)); + score(38, x) = sqrt(fit[x].cov(3,3)); + score(39, x) = sqrt(fit[x].cov(4,4)); + + } + + double phi_ = score.row(0).mean(); + double a_ = score.row(1).mean(); + double pt_ = score.row(2).mean(); + double coT_ = score.row(3).mean(); + double Zip_ = score.row(4).mean(); + std::cout << std::setprecision(5) << std::scientific << label << " AVERAGE FITTED VALUES: \n" + << "phi: " << score.row(30).mean() << " +/- " << score.row(35).mean() << " [+/-] " << sqrt(score.row(35).array().abs2().mean() - score.row(35).mean()*score.row(35).mean()) << std::endl + << "d0: " << score.row(31).mean() << " +/- " << score.row(36).mean() << " [+/-] " << sqrt(score.row(36).array().abs2().mean() - score.row(36).mean()*score.row(36).mean()) << std::endl + << "pt: " << score.row(32).mean() << " +/- " << score.row(37).mean() << " [+/-] " << sqrt(score.row(37).array().abs2().mean() - score.row(37).mean()*score.row(37).mean()) << std::endl + << "coT: " << score.row(33).mean() << " +/- " << score.row(38).mean() << " [+/-] " << sqrt(score.row(38).array().abs2().mean() - score.row(38).mean()*score.row(38).mean()) << std::endl + << "Zip: " << score.row(34).mean() << " +/- " << score.row(39).mean() << " [+/-] " << sqrt(score.row(39).array().abs2().mean() - score.row(39).mean()*score.row(39).mean()) << std::endl; + + Matrix5d correlation; + correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), + score.row(20).mean(), score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), + score.row(19).mean(), score.row(22).mean(), score.row(23).mean(), 1., score.row(17).mean(), + score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), score.row(17).mean(), 1., + score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), + score.row(24).mean(), 1.; + + cout << "\n" << label << " PULLS (mean, sigma, relative_error):\n" + << 
"phi: " << phi_ << " " + << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " + << abs(score.row(10).mean()) << "%\n" + << "a0 : " << a_ << " " + << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " + << abs(score.row(11).mean()) << "%\n" + << "pt : " << pt_ << " " + << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " + << abs(score.row(12).mean()) << "%\n" + << "coT: " << coT_ << " " + << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " + << abs(score.row(13).mean()) << "%\n" + << "Zip: " << Zip_ << " " + << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << " " + << abs(score.row(14).mean()) << "%\n\n" + << "cov(phi,a0)_: " << score.row(5).mean() << "\n" + << "cov(phi,pt)_: " << score.row(6).mean() << "\n" + << "cov(a0,pt)_: " << score.row(7).mean() << "\n" + << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" + << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" + << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" + << "correlation matrix:\n" + << correlation << "\n\n" + << endl; + + phi_pull.Fit("gaus", "Q"); + dxy_pull.Fit("gaus", "Q"); + dz_pull.Fit("gaus", "Q"); + theta_pull.Fit("gaus", "Q"); + pt_pull.Fit("gaus", "Q"); + phi_pull.Write(); + dxy_pull.Write(); + dz_pull.Write(); + theta_pull.Write(); + pt_pull.Write(); + phi_error.Write(); + dxy_error.Write(); + dz_error.Write(); + theta_error.Write(); + pt_error.Write(); +} + + void test_helix_fit() { int n_; - int iteration; - int debug2 = 0; bool return_err; const double B_field = 3.8 * c_speed / pow(10, 9) / 100; Matrix gen_par; Vector5d true_par; Vector5d err; -// while (1) { - generator.seed(1); - int debug = 0; - debug2 = 0; - std::cout << std::setprecision(6); - cout << "_________________________________________________________________________\n"; - cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration return_err debug" << endl; -// cin >> n_ >> gen_par(0) >> gen_par(1) >> gen_par(2) >> gen_par(3) >> gen_par(4) >> gen_par(5) >> -// iteration >> return_err >> debug2; - n_ = 4; - gen_par(0) = -0.1; // x - gen_par(1) = 0.1; // y - gen_par(2) = -1.; // z - gen_par(3) = 45.; // phi - gen_par(4) = 10.; // R (p_t) - gen_par(5) = 1.; // eta - iteration = 1; - return_err = 1; - debug2 = 1; - - iteration *= 10; - gen_par = New_par(gen_par, 1, B_field); - true_par = True_par(gen_par, 1, B_field); - Matrix3xNd hits; - Matrix3Nd hits_cov; - unique_ptr helix(new helix_fit[iteration]); -// helix_fit* helix = new helix_fit[iteration]; - Matrix score(41, iteration); - - for (int i = 0; i < iteration; i++) { - if (debug2 == 1 && i == (iteration - 1)) { - debug = 1; - } - hits_gen gen; - gen = Hits_gen(n_, gen_par); -// gen.hits = MatrixXd::Zero(3, 4); -// gen.hits_cov = MatrixXd::Zero(3 * 4, 3 * 4); -// gen.hits.col(0) << 1.82917642593, 2.0411875248, 7.18495464325; -// gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; -// gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; -// gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; - helix[i] = Rfit::Helix_fit(gen.hits, gen.hits_cov, B_field, return_err); + generator.seed(1); + std::cout << std::setprecision(6); + cout << "_________________________________________________________________________\n"; + cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration return_err debug" << endl; + cout << "hits: "; + cin >> n_; + cout << "x: "; + cin >> gen_par(0); + cout << "y: "; + cin >> 
gen_par(1); + cout << "z: "; + cin >> gen_par(2); + cout << "phi: "; + cin >> gen_par(3); + cout << "p_t: "; + cin >> gen_par(4); + cout << "eta: "; + cin >> gen_par(5); + // + /* + n_ = 4; + gen_par(0) = -0.1; // x + gen_par(1) = 0.1; // y + gen_par(2) = -1.; // z + gen_par(3) = 45.; // phi + gen_par(4) = 10.; // R (p_t) + gen_par(5) = 1.; // eta + iteration = 1; + */ + return_err = 1; - if (debug) - cout << std::setprecision(10) - << "phi: " << helix[i].par(0) << " +/- " << sqrt(helix[i].cov(0, 0)) << " vs " - << true_par(0) << endl - << "Tip: " << helix[i].par(1) << " +/- " << sqrt(helix[i].cov(1, 1)) << " vs " - << true_par(1) << endl - << "p_t: " << helix[i].par(2) << " +/- " << sqrt(helix[i].cov(2, 2)) << " vs " - << true_par(2) << endl - << "theta:" << helix[i].par(3) << " +/- " << sqrt(helix[i].cov(3, 3)) << " vs " - << true_par(3) << endl - << "Zip: " << helix[i].par(4) << " +/- " << sqrt(helix[i].cov(4, 4)) << " vs " - << true_par(4) << endl - << "charge:" << helix[i].q << " vs 1" << endl - << "covariance matrix:" << endl - << helix[i].cov << endl - << "Initial hits:\n" << gen.hits << endl - << "Initial Covariance:\n" << gen.hits_cov << endl; - } - - for (int x = 0; x < iteration; x++) { - // Compute PULLS information - score(0, x) = (helix[x].par(0) - true_par(0)) / sqrt(helix[x].cov(0, 0)); - score(1, x) = (helix[x].par(1) - true_par(1)) / sqrt(helix[x].cov(1, 1)); - score(2, x) = (helix[x].par(2) - true_par(2)) / sqrt(helix[x].cov(2, 2)); - score(3, x) = (helix[x].par(3) - true_par(3)) / sqrt(helix[x].cov(3, 3)); - score(4, x) = (helix[x].par(4) - true_par(4)) / sqrt(helix[x].cov(4, 4)); - score(5, x) = - (helix[x].par(0) - true_par(0)) * (helix[x].par(1) - true_par(1)) / (helix[x].cov(0, 1)); - score(6, x) = - (helix[x].par(0) - true_par(0)) * (helix[x].par(2) - true_par(2)) / (helix[x].cov(0, 2)); - score(7, x) = - (helix[x].par(1) - true_par(1)) * (helix[x].par(2) - true_par(2)) / (helix[x].cov(1, 2)); - score(8, x) = - (helix[x].par(3) - true_par(3)) * (helix[x].par(4) - true_par(4)) / (helix[x].cov(3, 4)); - score(9, x) = helix[x].chi2_circle; - score(25, x) = helix[x].chi2_line; - score(10, x) = sqrt(helix[x].cov(0, 0)) / helix[x].par(0) * 100; - score(13, x) = sqrt(helix[x].cov(3, 3)) / helix[x].par(3) * 100; - score(14, x) = sqrt(helix[x].cov(4, 4)) / helix[x].par(4) * 100; - score(15, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(3) - true_par(3)) / - sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(3, 3)); - score(16, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(3) - true_par(3)) / - sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(3, 3)); - score(17, x) = (helix[x].par(2) - true_par(2)) * (helix[x].par(3) - true_par(3)) / - sqrt(helix[x].cov(2, 2)) / sqrt(helix[x].cov(3, 3)); - score(18, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(4) - true_par(4)) / - sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(4, 4)); - score(19, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(4) - true_par(4)) / - sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(4, 4)); - score(20, x) = (helix[x].par(2) - true_par(2)) * (helix[x].par(4) - true_par(4)) / - sqrt(helix[x].cov(2, 2)) / sqrt(helix[x].cov(4, 4)); - score(21, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(1) - true_par(1)) / - sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(1, 1)); - score(22, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(2) - true_par(2)) / - sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(2, 2)); - score(23, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(2) - true_par(2)) / - 
sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(2, 2)); - score(24, x) = (helix[x].par(3) - true_par(3)) * (helix[x].par(4) - true_par(4)) / - sqrt(helix[x].cov(3, 3)) / sqrt(helix[x].cov(4, 4)); - } + const int iteration = 5000; + gen_par = New_par(gen_par, 1, B_field); + true_par = True_par(gen_par, 1, B_field); + Matrix3xNd hits; + Matrix3Nd hits_cov; + std::array helixRiemann_fit; +// std::array helixBrokenLine_fit; - double phi_ = score.row(0).mean(); - double a_ = score.row(1).mean(); - double pt_ = score.row(2).mean(); - double coT_ = score.row(3).mean(); - double Zip_ = score.row(4).mean(); - Matrix5d correlation; - correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), - score.row(20).mean(), score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), - score.row(19).mean(), score.row(22).mean(), score.row(23).mean(), 1., score.row(17).mean(), - score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), score.row(17).mean(), 1., - score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), - score.row(24).mean(), 1.; + std::cout << "\nTrue parameters: " + << "phi: " << true_par(0) << " " + << "dxy: " << true_par(1) << " " + << "pt: " << true_par(2) << " " + << "CotT: " << true_par(3) << " " + << "Zip: " << true_par(4) << " " + << std::endl; + for (int i = 0; i < iteration; i++) { + hits_gen gen; + gen = Hits_gen(n_, gen_par); + // gen.hits = MatrixXd::Zero(3, 4); + // gen.hits_cov = MatrixXd::Zero(3 * 4, 3 * 4); + // gen.hits.col(0) << 1.82917642593, 2.0411875248, 7.18495464325; + // gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; + // gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; + // gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; + helixRiemann_fit[i] = Rfit::Helix_fit(gen.hits, gen.hits_cov, B_field, return_err); +// helixBrokenLine_fit[i] = BrokenLine::Helix_fit(gen.hits, gen.hits_cov, B_field); - cout << "\nPULLS:\n" - << "phi: " << phi_ << " " - << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " - << abs(score.row(10).mean()) << "%\n" - << "a0 : " << a_ << " " - << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " - << abs(score.row(11).mean()) << "%\n" - << "pt : " << pt_ << " " - << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " - << abs(score.row(12).mean()) << "%\n" - << "coT: " << coT_ << " " - << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " - << abs(score.row(13).mean()) << "%\n" - << "Zip: " << Zip_ << " " - << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << " " - << abs(score.row(14).mean()) << "%\n\n" - << "cov(phi,a0)_: " << score.row(5).mean() << "\n" - << "cov(phi,pt)_: " << score.row(6).mean() << "\n" - << "cov(a0,pt)_: " << score.row(7).mean() << "\n" - << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" - << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" - << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" - << "correlation matrix:\n" - << correlation << "\n\n" - << endl; -// } + std::cout << std::endl; + /* + if (debug) + cout << std::setprecision(6) + << "phi: " << helixRiemann_fit[i].par(0) << " +/- " << sqrt(helixRiemann_fit[i].cov(0, 0)) << " vs " + << true_par(0) << endl + << "Tip: " << helixRiemann_fit[i].par(1) << " +/- " << sqrt(helixRiemann_fit[i].cov(1, 1)) << " vs " + << true_par(1) << endl + << "p_t: " << helixRiemann_fit[i].par(2) << " +/- " << 
sqrt(helixRiemann_fit[i].cov(2, 2)) << " vs " + << true_par(2) << endl + << "theta:" << helixRiemann_fit[i].par(3) << " +/- " << sqrt(helixRiemann_fit[i].cov(3, 3)) << " vs " + << true_par(3) << endl + << "Zip: " << helixRiemann_fit[i].par(4) << " +/- " << sqrt(helixRiemann_fit[i].cov(4, 4)) << " vs " + << true_par(4) << endl + << "charge:" << helixRiemann_fit[i].q << " vs 1" << endl + << "covariance matrix:" << endl + << helixRiemann_fit[i].cov << endl + << "Initial hits:\n" << gen.hits << endl + << "Initial Covariance:\n" << gen.hits_cov << endl; + */ + } + computePull(helixRiemann_fit, "Riemann", n_, iteration, true_par); +// computePull(helixBrokenLine_fit, "BrokenLine", n_, iteration, true_par); } int main() { + TFile f("TestFitResults.root", "RECREATE"); test_helix_fit(); + f.Close(); return 0; } diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index 7b1125eebc312..485fac34b00b2 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -29,12 +29,10 @@ void kernelFullFit(Rfit::Matrix3xNd * hits, Rfit::Matrix2Nd hits_cov2D_local = (hits_cov->block(0, 0, 2 * n, 2 * n)).eval(); Rfit::printIt(&hits2D_local, "kernelFullFit - hits2D_local: "); Rfit::printIt(&hits_cov2D_local, "kernelFullFit - hits_cov2D_local: "); - /* printf("kernelFullFit - hits address: %p\n", hits); printf("kernelFullFit - hits_cov address: %p\n", hits_cov); printf("kernelFullFit - hits_cov2D address: %p\n", &hits2D_local); printf("kernelFullFit - hits_cov2D_local address: %p\n", &hits_cov2D_local); - */ /* At some point I gave up and locally construct block on the stack, so that the next invocation to Rfit::Circle_fit works properly. Failing to do so implied basically an empty collection of hits and covariances. That could @@ -43,60 +41,20 @@ void kernelFullFit(Rfit::Matrix3xNd * hits, creations of the blocks. To be understood and compared against the myriad of compilation warnings we have. 
*/ + (*circle_fit_resultsGPU) = Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), - fast_fit, rad, B, errors); + fast_fit, rad, B, errors); /* - (*circle_fit_resultsGPU) = - Rfit::Circle_fit(hits2D_local, hits_cov2D_local, - fast_fit, rad, B, errors, scattering); - */ - (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, errors); + (*circle_fit_resultsGPU) = + Rfit::Circle_fit(hits2D_local, hits_cov2D_local, + fast_fit, rad, B, errors); + */ + (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, B, errors); return; } -__global__ -void kernelFastFit(Rfit::Matrix3xNd * hits, Vector4d * results) { - (*results) = Rfit::Fast_fit(*hits); -} - -__global__ -void kernelCircleFit(Rfit::Matrix3xNd * hits, - Rfit::Matrix3Nd * hits_cov, Vector4d * fast_fit_input, double B, - Rfit::circle_fit * circle_fit_resultsGPU) { - u_int n = hits->cols(); - Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); - -#if TEST_DEBUG - printf("fast_fit_input(0): %f\n", (*fast_fit_input)(0)); - printf("fast_fit_input(1): %f\n", (*fast_fit_input)(1)); - printf("fast_fit_input(2): %f\n", (*fast_fit_input)(2)); - printf("fast_fit_input(3): %f\n", (*fast_fit_input)(3)); - printf("rad(0,0): %f\n", rad(0,0)); - printf("rad(1,1): %f\n", rad(1,1)); - printf("rad(2,2): %f\n", rad(2,2)); - printf("hits_cov(0,0): %f\n", (*hits_cov)(0,0)); - printf("hits_cov(1,1): %f\n", (*hits_cov)(1,1)); - printf("hits_cov(2,2): %f\n", (*hits_cov)(2,2)); - printf("hits_cov(11,11): %f\n", (*hits_cov)(11,11)); - printf("B: %f\n", B); -#endif - (*circle_fit_resultsGPU) = - Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), - *fast_fit_input, rad, B, false); -} - -__global__ -void kernelLineFit(Rfit::Matrix3xNd * hits, - Rfit::Matrix3Nd * hits_cov, - Rfit::circle_fit * circle_fit, - Vector4d * fast_fit, - Rfit::line_fit * line_fit) -{ - (*line_fit) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit, *fast_fit, true); -} - void fillHitsAndHitsCov(Rfit::Matrix3xNd & hits, Rfit::Matrix3Nd & hits_cov) { hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, @@ -119,79 +77,6 @@ void fillHitsAndHitsCov(Rfit::Matrix3xNd & hits, Rfit::Matrix3Nd & hits_cov) { hits_cov(3,7) = hits_cov(7,3) = -5.28e-06; } -void testFit() { - constexpr double B = 0.0113921; - Rfit::Matrix3xNd hits(3,4); - Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); - Rfit::Matrix3xNd * hitsGPU = new Rfit::Matrix3xNd(3,4); - Rfit::Matrix3Nd * hits_covGPU = nullptr; - Vector4d * fast_fit_resultsGPU = new Vector4d(); - Vector4d * fast_fit_resultsGPUret = new Vector4d(); - Rfit::circle_fit * circle_fit_resultsGPU = new Rfit::circle_fit(); - Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); - - fillHitsAndHitsCov(hits, hits_cov); - - // FAST_FIT_CPU - Vector4d fast_fit_results = Rfit::Fast_fit(hits); -#if TEST_DEBUG - std::cout << "Generated hits:\n" << hits << std::endl; -#endif - std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; - - // FAST_FIT GPU - cudaMalloc((void**)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4))); - cudaMalloc((void**)&fast_fit_resultsGPU, sizeof(Vector4d)); - cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice); - - kernelFastFit<<<1, 1>>>(hitsGPU, fast_fit_resultsGPU); - cudaDeviceSynchronize(); - - cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, sizeof(Vector4d), cudaMemcpyDeviceToHost); - std::cout << "Fitted 
values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << *fast_fit_resultsGPUret << std::endl; - assert(isEqualFuzzy(fast_fit_results, (*fast_fit_resultsGPUret))); - - // CIRCLE_FIT CPU - u_int n = hits.cols(); - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); - - Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), - hits_cov.block(0, 0, 2 * n, 2 * n), - fast_fit_results, rad, B, false); - std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; - - // CIRCLE_FIT GPU - cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12))); - cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit)); - cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice); - - kernelCircleFit<<<1,1>>>(hitsGPU, hits_covGPU, - fast_fit_resultsGPU, B, circle_fit_resultsGPU); - cudaDeviceSynchronize(); - - cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, - sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); - std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; - assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); - - // LINE_FIT CPU - Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, fast_fit_results, true); - std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; - - // LINE_FIT GPU - Rfit::line_fit * line_fit_resultsGPU = nullptr; - Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); - - cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit)); - - kernelLineFit<<<1,1>>>(hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); - cudaDeviceSynchronize(); - - cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); - std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; - assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); -} - void testFitOneGo(bool errors, double epsilon=1e-6) { constexpr double B = 0.0113921; Rfit::Matrix3xNd hits(3,4); @@ -205,12 +90,12 @@ void testFitOneGo(bool errors, double epsilon=1e-6) { u_int n = hits.cols(); Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); - Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), + Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), hits_cov.block(0, 0, 2 * n, 2 * n), fast_fit_results, rad, B, errors); // LINE_FIT CPU Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, - fast_fit_results, errors); + fast_fit_results, B, errors); // FIT GPU std::cout << "GPU FIT" << std::endl; @@ -253,10 +138,8 @@ void testFitOneGo(bool errors, double epsilon=1e-6) { } int main (int argc, char * argv[]) { -// testFit(); - std::cout << "TEST FIT, NO ERRORS" << std::endl; - testFitOneGo(false); + cudaDeviceSetLimit(cudaLimitStackSize, 32*1024); std::cout << "TEST FIT, ERRORS AND SCATTER" << std::endl; testFitOneGo(true, 1e-5); From eef1ec8c5bb904e8b495b86a59bf7837fa9c7bdb Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 14 Nov 2018 23:56:26 +0100 Subject: [PATCH 030/102] Synchronise with CMSSW_10_4_0_pre2 --- .../PixelTrackFitting/interface/RiemannFit.h | 510 +++++++----------- .../test/PixelTrackRiemannFit.cc | 388 +++++-------- .../PixelTrackFitting/test/testEigenGPU.cu | 137 ++++- .../PixelTriplets/plugins/BuildFile.xml | 3 +- .../python/plotting/trackingPlots.py | 
16 +- 5 files changed, 469 insertions(+), 585 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 33f8334c8b5a5..6de1c77bbac12 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -47,7 +47,7 @@ struct circle_fit |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n |cov(X0, R)|cov(Y0, R)|cov( R, R)| - */ + */ int64_t q; //!< particle charge double chi2 = 0.0; }; @@ -78,6 +78,7 @@ struct helix_fit double chi2_line = 0.0; Vector4d fast_fit; int64_t q; //!< particle charge + // VectorXd time; // TO FIX just for profiling } __attribute__((aligned(16))); template @@ -118,40 +119,31 @@ __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) return a.x() * b.y() - a.y() * b.x(); } -/*! Compute the Radiation length in the uniform hypothesis - * - * The Pixel detector, barrel and forward, is considered as an omogeneous - * cilinder of material, whose radiation lengths has been derived from the TDR - * plot that shows that 16cm correspond to 0.06 radiation lengths. Therefore - * one radiation length corresponds to 16cm/0.06 =~ 267 cm. All radiation - * lengths are computed using this unique number, in both regions, barrel and - * endcap. - * - * NB: no angle corrections nor projections are computed inside this routine. - * It is therefore the responsibility of the caller to supply the proper - * lengths in input. These lenghts are the path travelled by the particle along - * its trajectory, namely the so called S of the helix in 3D space. - * - * \param length_values vector of incremental distances that will be translated - * into radiation length equivalent. Each radiation length i is computed - * incrementally with respect to the previous length i-1. The first lenght has - * no reference point (i.e. it has the dca). - * - * \return incremental radiation lengths that correspond to each segment. - */ -__host__ __device__ inline -void computeRadLenUniformMaterial(const VectorNd &length_values, - VectorNd & rad_lengths) { - // Radiation length of the pixel detector in the uniform assumption, with - // 0.06 rad_len at 16 cm - const double XX_0 = 16.f/(0.06); -// const double XX_0 = 1000.*16.f/(0.06); - u_int n = length_values.rows(); - rad_lengths(0) = length_values(0)/XX_0; - for (u_int j = 1; j < n; ++j) { - rad_lengths(j) = std::abs(length_values(j)-length_values(j-1))/XX_0; - } +__host__ __device__ inline void computeRadLenEff(const Vector4d& fast_fit, + const double B, + double & radlen_eff, + double & theta, + bool & in_forward) { + double X_barrel = 0.015; + double X_forward = 0.05; + theta = atan(fast_fit(3)); + // atan returns values in [-pi/2, pi/2], we need [0, pi] + theta = theta < 0. ? theta + M_PI : theta; + radlen_eff = X_barrel / std::abs(sin(theta)); + in_forward = (theta <= 0.398 or theta >= 2.743); + if (in_forward) + radlen_eff = X_forward / std::abs(cos(theta)); + assert(radlen_eff > 0.); + double p_t = fast_fit(2) * B; + // We have also to correct the radiation lenght in the x-y plane. Since we + // do not know the angle of incidence of the track at this point, we + // arbitrarily set the correction proportional to the inverse of the + // transerse momentum. The cut-off is at 1 Gev, set using a single Muon Pt + // gun and verifying that, at that momentum, not additional correction is, + // in fact, needed. This is an approximation. 
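  /* A quick numeric illustration with assumed values: for a barrel track with
     fast_fit(3) = tan(theta) = 1, theta is ~0.785 rad and
     radlen_eff = 0.015 / sin(0.785) ~ 0.021; if the transverse momentum
     p_t = fast_fit(2) * B is 0.5 GeV (below the 1 GeV cut-off), the
     correction below scales radlen_eff up to ~0.042. */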
+ if (std::abs(p_t/1.) < 1.) + radlen_eff /= std::abs(p_t/1.); } /*! @@ -159,29 +151,11 @@ void computeRadLenUniformMaterial(const VectorNd &length_values, multiple Coulomb scattering to be used in the line_fit, for the barrel and forward cases. - The input covariance matrix is in the variables s-z, original and - unrotated. - - The multiple scattering component is computed in the usual linear - approximation, using the 3D path which is computed as the squared root of - the squared sum of the s and z components passed in. - - Internally a rotation by theta is performed and the covariance matrix - returned is the one in the direction orthogonal to the rotated S3D axis, - i.e. along the rotated Z axis. - - The choice of the rotation is not arbitrary, but derived from the fact that - putting the horizontal axis along the S3D direction allows the usage of the - ordinary least squared fitting techiques with the trivial parametrization y - = mx + q, avoiding the patological case with m = +/- inf, that would - correspond to the case at eta = 0. */ - __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, const Vector4d& fast_fit, VectorNd const& s_arcs, VectorNd const& z_values, - const double theta, const double B) { #if RFIT_DEBUG @@ -190,24 +164,47 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, u_int n = s_arcs.rows(); double p_t = fast_fit(2) * B; double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); - VectorNd rad_lengths_S(n); - // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html - // Basically, to perform cwise operations on Matrices and Vectors, you need - // to transform them into Array-like objects. - VectorNd S_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); - S_values = S_values.array().sqrt(); - computeRadLenUniformMaterial(S_values, rad_lengths_S); - VectorNd sig2_S(n); - sig2_S = .000225 / p_2 * (1.f + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); + double radlen_eff = 0.; + double theta = 0.; + bool in_forward = false; + computeRadLenEff(fast_fit, B, radlen_eff, theta, in_forward); + + const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff; + for (u_int k = 0; k < n; ++k) + { + for (u_int l = k; l < n; ++l) + { + for (u_int i = 0; i < std::min(k, l); ++i) + { +#if RFIT_DEBUG + printf("Scatter_cov_line - B: %f\n", B); + printf("Scatter_cov_line - radlen_eff: %f, p_t: %f, p2: %f\n", radlen_eff, p_t, p_2); + printf("Scatter_cov_line - sig2:%f, theta: %f\n", sig2, theta); + printf("Scatter_cov_line - Adding to element %d, %d value %f\n", n + k, n + l, (s_arcs(k) - s_arcs(i)) * (s_arcs(l) - s_arcs(i)) * sig2 / sqr(sqr(sin(theta)))); +#endif + if (in_forward) { + cov_sz(k, l) += (z_values(k) - z_values(i)) * (z_values(l) - z_values(i)) * sig2 / sqr(sqr(cos(theta))); + cov_sz(l, k) = cov_sz(k, l); + } else { + cov_sz(n + k, n + l) += (s_arcs(k) - s_arcs(i)) * (s_arcs(l) - s_arcs(i)) * sig2 / sqr(sqr(sin(theta))); + cov_sz(n + l, n + k) = cov_sz(n + k, n + l); + } + } + } + } #if RFIT_DEBUG Rfit::printIt(&cov_sz, "Scatter_cov_line - cov_sz: "); #endif Matrix2Nd rot = MatrixXd::Zero(2 * n, 2 * n); for (u_int i = 0; i < n; ++i) { - rot(i, i) = sin(theta); - rot(n + i, n + i) = sin(theta); + rot(i, i) = cos(theta); + rot(n + i, n + i) = cos(theta); u_int j = (i + n); - rot(i, j) = i < j ? 
cos(theta) : -cos(theta); + // Signs seem to be wrong for the off-diagonal element, but we are + // inverting x-y in the input vector, since theta is the angle between + // the z axis and the line, and we are putting the s values, which are Y, + // in the first position. A simple sign flip will take care of it. + rot(i, j) = i < j ? sin(theta) : -sin(theta); } #if RFIT_DEBUG @@ -215,23 +212,12 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, #endif Matrix2Nd tmp = rot*cov_sz*rot.transpose(); - for (u_int k = 0; k < n; ++k) - { - for (u_int l = k; l < n; ++l) - { - for (u_int i = 0; i < std::min(k, l); ++i) - { - tmp(k + n, l + n) += std::abs(S_values(k) - S_values(i)) * std::abs(S_values(l) - S_values(i)) * sig2_S(i); - tmp(l + n, k + n) = tmp(k + n, l + n); - } - } - } - // We are interested only in the errors orthogonal to the rotated s-axis - // which, in our formalism, are in the lower square matrix. + // We are interested only in the errors in the rotated s -axis which, in + // our formalism, are in the upper square matrix. #if RFIT_DEBUG Rfit::printIt(&tmp, "Scatter_cov_line - tmp: "); #endif - return tmp.block(n, n, n, n); + return tmp.block(0, 0, n, n); } /*! @@ -247,11 +233,13 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, \warning input points must be ordered radially from the detector center (from inner layer to outer ones; points on the same layer must ordered too). + \bug currently works only for points in the barrel. \details Only the tangential component is computed (the radial one is negligible). */ +// X in input TO FIX __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, const Vector4d& fast_fit, VectorNd const& rad, @@ -260,32 +248,24 @@ __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, u_int n = p2D.cols(); double p_t = fast_fit(2) * B; double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); - double theta = atan(fast_fit(3)); - theta = theta < 0. ? theta + M_PI : theta; - VectorNd s_values(n); - VectorNd rad_lengths(n); - const Vector2d o(fast_fit(0), fast_fit(1)); + double radlen_eff = 0.; + double theta = 0.; + bool in_forward = false; + computeRadLenEff(fast_fit, B, radlen_eff, theta, in_forward); - // associated Jacobian, used in weights and errors computation - for (u_int i = 0; i < n; ++i) - { // x - Vector2d p = p2D.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); - const double atan2_ = atan2(cross, dot); - s_values(i) = std::abs(atan2_ * fast_fit(2)); - } - computeRadLenUniformMaterial(s_values*sqrt(1. 
+ 1./(fast_fit(3)*fast_fit(3))), rad_lengths); MatrixNd scatter_cov_rad = MatrixXd::Zero(n, n); - VectorNd sig2(n); - sig2 = .000225 / p_2 * (1.f + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); + const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff; for (u_int k = 0; k < n; ++k) { for (u_int l = k; l < n; ++l) { for (u_int i = 0; i < std::min(k, l); ++i) { - scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i) / (sqr(sin(theta))); + if (in_forward) { + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(cos(theta)); + } else { + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(sin(theta)); + } scatter_cov_rad(l, k) = scatter_cov_rad(k, l); } } @@ -429,6 +409,23 @@ __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) return cov_rad_inv.colwise().sum().transpose(); } +/*! + \brief Compute the points' weights' vector for the line fit (ODR). + Results from a pre-fit is needed in order to take the orthogonal (to the + line) component of the errors. + + \param x_err2 squared errors in the x axis. + \param y_err2 squared errors in the y axis. + \param tan_theta tangent of theta (angle between y axis and line). + + \return weight points' weights' vector for the line fit (ODR). +*/ + +__host__ __device__ inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y_err2, const double& tan_theta) +{ + return (1. + sqr(tan_theta)) * 1. / (x_err2 + y_err2 * sqr(tan_theta)); +} + /*! \brief Find particle q considering the sign of cross product between particles velocity (estimated by the first 2 hits) and the vector radius @@ -473,6 +470,40 @@ __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, circle.par = par_pak; } +/*! + \brief Compute the error propagation to obtain the square errors in the + x axis for the line fit. If errors have not been computed in the circle fit + than an'approximation is made. + Further information in attached documentation. + + \param V hits' covariance matrix. + \param circle result of the previous circle fit (only the covariance matrix + is needed) TO FIX + \param J Jacobian of the transformation producing x values. + \param error flag for error computation. + + \return x_err2 squared errors in the x axis. +*/ + +__host__ __device__ inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, + const bool error, u_int n) +{ + VectorNd x_err2(n); + for (u_int i = 0; i < n; ++i) + { + Matrix5d Cov = MatrixXd::Zero(5, 5); + if (error) + Cov.block(0, 0, 3, 3) = circle.cov; + Cov(3, 3) = V(i, i); + Cov(4, 4) = V(i + n, i + n); + Cov(3, 4) = Cov(4, 3) = V(i, i + n); + Eigen::Matrix tmp; + tmp = J.row(i) * Cov * J.row(i).transpose().eval(); + x_err2(i) = tmp(0, 0); + } + return x_err2; +} + /*! \brief Compute the eigenvector associated to the minimum eigenvalue. @@ -973,7 +1004,7 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, { const double t = 1. / h; J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, - v(0)*v2x2_inv*t, v(1)*v2x2_inv*t, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; + 0, 0, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; } printIt(&J3, "circle_fit - J3:"); @@ -1028,22 +1059,21 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, errors. 
*/ -__host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const circle_fit& circle, - const Vector4d& fast_fit, - const double B, - const bool error = true) +__host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const circle_fit& circle, + const Vector4d& fast_fit, + const double B, + const bool error = true) { u_int n = hits.cols(); - double theta = -circle.q*atan(fast_fit(3)); - theta = theta < 0. ? theta + M_PI : theta; // PROJECTION ON THE CILINDER - Matrix2xNd p2D = MatrixXd::Zero(2, n); - Eigen::Matrix Jx; + Matrix2xNd p2D(2, n); + MatrixNx5d Jx(n, 5); #if RFIT_DEBUG printf("Line_fit - B: %g\n", B); + printIt(&hits, "Line_fit points: "); printIt(&hits_cov, "Line_fit covs: "); #endif @@ -1055,11 +1085,8 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, const Vector2d o(circle.par(0), circle.par(1)); // associated Jacobian, used in weights and errors computation - Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); for (u_int i = 0; i < n; ++i) { // x - Matrix6d Cov = MatrixXd::Zero(6, 6); - Matrix2d Cov_sz_single = MatrixXd::Zero(2, 2); Vector2d p = hits.block(0, i, 2, 1) - o; const double cross = cross2D(-o, p); const double dot = (-o).dot(p); @@ -1068,9 +1095,9 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, const double atan2_ = -circle.q * atan2(cross, dot); p2D(0, i) = atan2_ * circle.par(2); - // associated Jacobian, used in weights and errors- computation + // associated Jacobian, used in weights and errors computation const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); - double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta + double d_X0 = 0, d_Y0 = 0, d_R = 0.; // good approximation for big pt and eta if (error) { d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); @@ -1079,19 +1106,7 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, } const double d_x = temp0 * (o(1) * dot + o(0) * cross); const double d_y = temp0 * (-o(0) * dot + o(1) * cross); - Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; -// Jx << d_X0, d_Y0, d_R, p(1)/p.norm(), -p(0)/p.norm(), 0, 0, 0, 0, 0, 0, 1.; - Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = hits_cov(i, i); - Cov(4, 4) = hits_cov(i + n, i + n); - Cov(5, 5) = hits_cov(i + 2*n, i + 2*n); - Cov(3, 4) = Cov(4, 3) = hits_cov(i, i + n); - Cov(3, 5) = Cov(5, 3) = hits_cov(i, i + 2*n); - Cov(4, 5) = Cov(5, 4) = hits_cov(i + n, i + 2*n); - Cov_sz_single = Jx * Cov * Jx.transpose(); - cov_sz(i, i) = Cov_sz_single(0, 0); - cov_sz(i + n, i + n) = Cov_sz_single(1, 1); - cov_sz(i, i + n) = cov_sz(i + n, i) = Cov_sz_single(0, 1); + Jx.row(i) << d_X0, d_Y0, d_R, d_x, d_y; } // Math of d_{X0,Y0,R,x,y} all verified by hand @@ -1099,26 +1114,44 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, p2D.row(1) = hits.row(2); // WEIGHT COMPUTATION + Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); + VectorNd x_err2 = X_err2(hits_cov, circle, Jx, error, n); + VectorNd y_err2 = hits_cov.block(2 * n, 2 * n, n, n).diagonal(); + cov_sz.block(0, 0, n, n) = x_err2.asDiagonal(); + cov_sz.block(n, n, n, n) = y_err2.asDiagonal(); #if RFIT_DEBUG printIt(&cov_sz, "line_fit - cov_sz:"); #endif - MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B); + MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), B); #if RFIT_DEBUG printIt(&cov_with_ms, 
"line_fit - cov_with_ms: "); #endif - Matrix4d G; - G = cov_with_ms.inverse(); + Matrix4d G, G4; + G4 = cov_with_ms.inverse(); #if RFIT_DEBUG - printIt(&G, "line_fit - cov_with_ms.inverse():"); + printIt(&G4, "line_fit - cov_with_ms.inverse():"); #endif - double renorm = G.sum(); - G *= 1. / renorm; + double renorm = G4.sum(); + G4 *= 1. / renorm; #if RFIT_DEBUG - printIt(&G, "line_fit - G4:"); + printIt(&G4, "line_fit - G4:"); #endif - + G = G4; const VectorNd weight = Weight_circle(G); + + VectorNd err2_inv = cov_with_ms.diagonal(); + err2_inv = err2_inv.cwiseInverse(); +// const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); +// const VectorNd weight = err2_inv * 1. / err2_inv.sum(); + +#if RFIT_DEBUG + printIt(&x_err2, "Line_fit - x_err2: "); + printIt(&y_err2, "Line_fit - y_err2: "); + printIt(&err2_inv, "Line_fit - err2_inv: "); + printIt(&weight, "Line_fit - weight: "); +#endif + // COST FUNCTION // compute @@ -1129,12 +1162,16 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, const Matrix2xNd X = p2D.colwise() - r0; Matrix2d A = Matrix2d::Zero(); A = X * G * X.transpose(); +// for (u_int i = 0; i < n; ++i) +// { +// A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); +// } #if RFIT_DEBUG printIt(&A, "Line_fit - A: "); #endif - // minimize. v is normalized!! + // minimize double chi2; Vector2d v = min_eigen2D(A, chi2); #if RFIT_DEBUG @@ -1142,6 +1179,7 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, printf("Line_fit chi2: %e\n", chi2); #endif + // n *= (chi2>0) ? 1 : -1; //TO FIX // This hack to be able to run on GPU where the automatic assignment to a // double from the vector multiplication is not working. Matrix cm; @@ -1151,8 +1189,8 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, // COMPUTE LINE PARAMETER line_fit line; line.par << -v(0) / v(1), // cotan(theta)) - -c / v(1); // Zip - line.chi2 = abs(chi2*renorm); + -c * sqrt(sqr(v(0)) + sqr(v(1))) * 1. / v(1); // Zip + line.chi2 = abs(chi2); #if RFIT_DEBUG printIt(&(line.par), "Line_fit - line.par: "); printf("Line_fit - v norm: %e\n", sqrt(v(0)*v(0) + v(1)*v(1))); @@ -1168,21 +1206,19 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, { // The norm is taken from Chernov, properly adapted to the weights case. double norm = v.transpose() * A * v; -// double norm_empirical = cov_with_ms.diagonal().mean(); + norm /= weight.sum(); #if RFIT_DEBUG - printf("Chi_2: %g\n", chi2); - printf("Norm: %g\n", norm); - printf("weight.sum(): %g\n", weight.sum()); printf("Line_fit - norm: %e\n", norm); #endif - - const double sig2 = norm/(A(0,0) + A(1,1)); + const double sig2 = 1. 
/ (A(0, 0) + A(1, 1)) * norm; C(0, 0) = sig2 * v1_2; C(1, 1) = sig2 * v0_2; - C(1, 0) = C(0, 1) = -sig2 * v(0) * v(1); - C(2, 2) = sig2 * (v(0)*r0(1)-v(1)*r0(0))*(v(0)*r0(1)-v(1)*r0(0)) + (sig2/n)*(A(0,0)+A(1,1)); - C(0, 2) = C(2, 0) = sig2*(v(0)*r0(1)-v(1)*r0(0))*v(1); - C(1, 2) = C(2, 1) = - sig2*(v(0)*r0(1)-v(1)*r0(0))*v(0); + C(0, 1) = C(1, 0) = -sig2 * v(0) * v(1); + const VectorNd weight_2 = (weight).array().square(); + const Vector2d C0(weight_2.dot(x_err2), weight_2.dot(y_err2)); + C.block(0, 2, 2, 1) = C.block(2, 0, 1, 2).transpose() = -C.block(0, 0, 2, 2) * r0; + Matrix tmp = (r0.transpose() * C.block(0, 0, 2, 2) * r0); + C(2, 2) = v0_2 * C0(0) + v1_2 * C0(1) + C0(0) * C(0, 0) + C0(1) * C(1, 1) + tmp(0, 0); } #if RFIT_DEBUG printIt(&C, "line_fit - C:"); @@ -1192,7 +1228,9 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, { const double t0 = 1. / v(1); const double t1 = sqr(t0); - J << -t0, v(0) * t1, 0., 0., c * t1, -t0; + const double sqrt_ = sqrt(v1_2 + v0_2); + const double t2 = 1. / sqrt_; + J << -t0, v(0) * t1, 0, -c * v(0) * t0 * t2, v0_2 * c * t1 * t2, -sqrt_ * t0; } Matrix JT = J.transpose().eval(); #if RFIT_DEBUG @@ -1207,184 +1245,6 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, return line; } -/*! \brief Perform an ordinary least square fit in the s-z plane to compute - * the parameters cotTheta and Zip. - * - * The fit is performed in the rotated S3D-Z' plane, following the formalism of - * Frodesen, Chapter 10, p. 259. - * - * The system has been rotated to both try to use the combined errors in s-z - * along Z', as errors in the Y direction and to avoid the patological case of - * degenerate lines with angular coefficient m = +/- inf. - * - * The rotation is using the information on the theta angle computed in the - * fast fit. The rotation is such that the S3D axis will be the X-direction, - * while the rotated Z-axis will be the Y-direction. This pretty much follows - * what is done in the same fit in the Broken Line approach. - */ - -__host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const circle_fit& circle, - const Vector4d& fast_fit, - const double B, - const bool error = true) { - auto n = hits.cols(); - double theta = -circle.q*atan(fast_fit(3)); - theta = theta < 0. ? theta + M_PI : theta; - - // PROJECTION ON THE CILINDER - // - // p2D will be: - // [s1, s2, s3, ..., sn] - // [z1, z2, z3, ..., zn] - // s values will be ordinary x-values - // z values will be ordinary y-values - - Matrix2xNd p2D(2, n); - Eigen::Matrix Jx; - - p2D << MatrixXd::Zero(2, n); - Jx << MatrixXd::Zero(2, 6); - -#if RFIT_DEBUG - printf("Line_fit - B: %g\n", B); - printIt(&hits, "Line_fit points: "); - printIt(&hits_cov, "Line_fit covs: "); -#endif - // x & associated Jacobian - // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf - // Slide 11 - // a ==> -o i.e. the origin of the circle in XY plane, negative - // b ==> p i.e. distances of the points wrt the origin of the circle. 
- const Vector2d o(circle.par(0), circle.par(1)); - - // associated Jacobian, used in weights and errors computation - Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); - Matrix6d Cov(6,6); - Matrix2d Cov_sz_single(2, 2); - for (u_int i = 0; i < n; ++i) - { - Vector2d p = hits.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); - // atan2(cross, dot) give back the angle in the transverse plane so tha the - // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) - const double atan2_ = -circle.q * atan2(cross, dot); -// p2D.coeffRef(1, i) = atan2_ * circle.par(2); - p2D(0, i) = atan2_ * circle.par(2); - - // associated Jacobian, used in weights and errors- computation - const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); - double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta - if (error) - { - d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); - d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); - d_R = atan2_; - } - const double d_x = temp0 * (o(1) * dot + o(0) * cross); - const double d_y = temp0 * (-o(0) * dot + o(1) * cross); - Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; - - Cov << MatrixXd::Zero(6, 6); - Cov_sz_single << MatrixXd::Zero(2, 2); - Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = hits_cov(i, i); // x errors - Cov(4, 4) = hits_cov(i + n, i + n); // y errors - Cov(5, 5) = hits_cov(i + 2*n, i + 2*n); // z errors - Cov(3, 4) = Cov(4, 3) = hits_cov(i, i + n); // cov_xy - Cov(3, 5) = Cov(5, 3) = hits_cov(i, i + 2*n); // cov_xz - Cov(4, 5) = Cov(5, 4) = hits_cov(i + n, i + 2*n); // cov_yz - Cov_sz_single = Jx * Cov * Jx.transpose(); - cov_sz(i, i) = Cov_sz_single(0, 0); - cov_sz(i + n, i + n) = Cov_sz_single(1, 1); - cov_sz(i, i + n) = cov_sz(i + n, i) = Cov_sz_single(0, 1); - } - // Math of d_{X0,Y0,R,x,y} all verified by hand - p2D.row(1) = hits.row(2); - - // The following matrix will contain errors orthogonal to the rotated S - // component only, with the Multiple Scattering properly treated!! - MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B); -#if RFIT_DEBUG - printIt(&cov_sz, "line_fit - cov_sz:"); - printIt(&cov_with_ms, "line_fit - cov_with_ms: "); -#endif - - // Prepare the Rotation Matrix to rotate the points - Eigen::Matrix rot = Eigen::Matrix::Zero(); - rot << sin(theta), cos(theta), -cos(theta), sin(theta); - - // Rotate Points with the shape [2, n] - Matrix2xNd p2D_rot = rot*p2D; - -#if RFIT_DEBUG - printf("Fast fit Tan(theta): %g\n", fast_fit(3)); - printf("Rotation angle: %g\n", theta); - printIt(&rot, "Rotation Matrix:"); - printIt(&p2D, "Original Hits(s,z):"); - printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); - printIt(&rot, "Rotation Matrix:"); -#endif - - // Build the A Matrix - Matrix2xNd A(2,n); - A << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values - -#if RFIT_DEBUG - printIt(&A, "A Matrix:"); -#endif - - // Build A^T V-1 A, where V-1 is the covariance of only the Y components. - MatrixNd Vy_inv = cov_with_ms.inverse(); - Eigen::Matrix Inv_Cov = A*Vy_inv*A.transpose(); - - // Compute the Covariance Matrix of the fit parameters - Eigen::Matrix Cov_params = Inv_Cov.inverse(); - - // Now Compute the Parameters in the form [2,1] - // The first component is q. - // The second component is m. 
- Eigen::Matrix sol = Cov_params*A*Vy_inv*p2D_rot.row(1).transpose(); - - -#if RFIT_DEBUG - printIt(&sol, "Rotated solutions:"); -#endif - - // We need now to transfer back the results in the original s-z plane - auto common_factor = 1./(sin(theta)-sol(1,0)*cos(theta)); - Matrix J = Matrix::Zero(); - J << 0., common_factor*common_factor, common_factor, sol(0,0)*cos(theta)*common_factor*common_factor; - - double m = common_factor*(sol(1,0)*sin(theta)+cos(theta)); - double q = common_factor*sol(0,0); - auto cov_mq = J * Cov_params * J.transpose(); - - VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; - double chi2 = res.transpose()*Vy_inv*res; - chi2 = chi2 / float(n); - - line_fit line; - line.par << m, q; - line.cov << cov_mq; - line.chi2 = chi2; - -#if RFIT_DEBUG - printf("Common_factor: %g\n", common_factor); - printIt(&J, "Jacobian:"); - printIt(&sol, "Rotated solutions:"); - printIt(&Cov_params, "Cov_params:"); - printIt(&cov_mq, "Rotated Covariance Matrix:"); - printIt(&(line.par), "Real Parameters:"); - printIt(&(line.cov), "Real Covariance Matrix:"); - printf("Chi2: %g\n", chi2); -#endif - - return line; -} - /*! \brief Helix fit by three step: -fast pre-fit (see Fast_fit() for further info); \n diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc index f71e3f082ada4..b27ed52473388 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc @@ -6,11 +6,7 @@ #include #include // unique_ptr -#include -#include - #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" -//#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" using namespace std; using namespace Eigen; @@ -168,261 +164,161 @@ Matrix New_par(const Matrix& gen_par, const int& cha return new_par; } -template -void computePull(std::array & fit, const char * label, - int n_, int iteration, const Vector5d & true_par) { - Matrix score(41, iteration); - - std::string histo_name("Phi Pull"); - histo_name += label; - TH1F phi_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); - histo_name = "dxy Pull "; - histo_name += label; - TH1F dxy_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); - histo_name = "dz Pull "; - histo_name += label; - TH1F dz_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); - histo_name = "Theta Pull "; - histo_name += label; - TH1F theta_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); - histo_name = "Pt Pull "; - histo_name += label; - TH1F pt_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); - histo_name = "Phi Error "; - histo_name += label; - TH1F phi_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); - histo_name = "dxy error "; - histo_name += label; - TH1F dxy_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); - histo_name = "dz error "; - histo_name += label; - TH1F dz_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); - histo_name = "Theta error "; - histo_name += label; - TH1F theta_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); - histo_name = "Pt error "; - histo_name += label; - TH1F pt_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); - for (int x = 0; x < iteration; x++) { - // Compute PULLS information - score(0, x) = (fit[x].par(0) - true_par(0)) / sqrt(fit[x].cov(0, 0)); - score(1, x) = (fit[x].par(1) - true_par(1)) / sqrt(fit[x].cov(1, 1)); - score(2, x) = (fit[x].par(2) 
- true_par(2)) / sqrt(fit[x].cov(2, 2)); - score(3, x) = (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(3, 3)); - score(4, x) = (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(4, 4)); - phi_pull.Fill(score(0, x)); - dxy_pull.Fill(score(1, x)); - pt_pull.Fill(score(2, x)); - theta_pull.Fill(score(3, x)); - dz_pull.Fill(score(4, x)); - phi_error.Fill(sqrt(fit[x].cov(0, 0))); - dxy_error.Fill(sqrt(fit[x].cov(1, 1))); - pt_error.Fill(sqrt(fit[x].cov(2, 2))); - theta_error.Fill(sqrt(fit[x].cov(3, 3))); - dz_error.Fill(sqrt(fit[x].cov(4, 4))); - score(5, x) = - (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / (fit[x].cov(0, 1)); - score(6, x) = - (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(0, 2)); - score(7, x) = - (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(1, 2)); - score(8, x) = - (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / (fit[x].cov(3, 4)); - score(9, x) = fit[x].chi2_circle; - score(25, x) = fit[x].chi2_line; - score(10, x) = sqrt(fit[x].cov(0, 0)) / fit[x].par(0) * 100; - score(13, x) = sqrt(fit[x].cov(3, 3)) / fit[x].par(3) * 100; - score(14, x) = sqrt(fit[x].cov(4, 4)) / fit[x].par(4) * 100; - score(15, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(3) - true_par(3)) / - sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(3, 3)); - score(16, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(3) - true_par(3)) / - sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(3, 3)); - score(17, x) = (fit[x].par(2) - true_par(2)) * (fit[x].par(3) - true_par(3)) / - sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(3, 3)); - score(18, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(4) - true_par(4)) / - sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(4, 4)); - score(19, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(4) - true_par(4)) / - sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(4, 4)); - score(20, x) = (fit[x].par(2) - true_par(2)) * (fit[x].par(4) - true_par(4)) / - sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(4, 4)); - score(21, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / - sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(1, 1)); - score(22, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / - sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(2, 2)); - score(23, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / - sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(2, 2)); - score(24, x) = (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / - sqrt(fit[x].cov(3, 3)) / sqrt(fit[x].cov(4, 4)); - score(30, x) = fit[x].par(0); - score(31, x) = fit[x].par(1); - score(32, x) = fit[x].par(2); - score(33, x) = fit[x].par(3); - score(34, x) = fit[x].par(4); - score(35, x) = sqrt(fit[x].cov(0,0)); - score(36, x) = sqrt(fit[x].cov(1,1)); - score(37, x) = sqrt(fit[x].cov(2,2)); - score(38, x) = sqrt(fit[x].cov(3,3)); - score(39, x) = sqrt(fit[x].cov(4,4)); - - } - - double phi_ = score.row(0).mean(); - double a_ = score.row(1).mean(); - double pt_ = score.row(2).mean(); - double coT_ = score.row(3).mean(); - double Zip_ = score.row(4).mean(); - std::cout << std::setprecision(5) << std::scientific << label << " AVERAGE FITTED VALUES: \n" - << "phi: " << score.row(30).mean() << " +/- " << score.row(35).mean() << " [+/-] " << sqrt(score.row(35).array().abs2().mean() - score.row(35).mean()*score.row(35).mean()) << std::endl - << "d0: " << score.row(31).mean() << " +/- " << score.row(36).mean() << " [+/-] " << sqrt(score.row(36).array().abs2().mean() - score.row(36).mean()*score.row(36).mean()) << std::endl - 
<< "pt: " << score.row(32).mean() << " +/- " << score.row(37).mean() << " [+/-] " << sqrt(score.row(37).array().abs2().mean() - score.row(37).mean()*score.row(37).mean()) << std::endl - << "coT: " << score.row(33).mean() << " +/- " << score.row(38).mean() << " [+/-] " << sqrt(score.row(38).array().abs2().mean() - score.row(38).mean()*score.row(38).mean()) << std::endl - << "Zip: " << score.row(34).mean() << " +/- " << score.row(39).mean() << " [+/-] " << sqrt(score.row(39).array().abs2().mean() - score.row(39).mean()*score.row(39).mean()) << std::endl; - - Matrix5d correlation; - correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), - score.row(20).mean(), score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), - score.row(19).mean(), score.row(22).mean(), score.row(23).mean(), 1., score.row(17).mean(), - score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), score.row(17).mean(), 1., - score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), - score.row(24).mean(), 1.; - - cout << "\n" << label << " PULLS (mean, sigma, relative_error):\n" - << "phi: " << phi_ << " " - << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " - << abs(score.row(10).mean()) << "%\n" - << "a0 : " << a_ << " " - << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " - << abs(score.row(11).mean()) << "%\n" - << "pt : " << pt_ << " " - << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " - << abs(score.row(12).mean()) << "%\n" - << "coT: " << coT_ << " " - << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " - << abs(score.row(13).mean()) << "%\n" - << "Zip: " << Zip_ << " " - << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << " " - << abs(score.row(14).mean()) << "%\n\n" - << "cov(phi,a0)_: " << score.row(5).mean() << "\n" - << "cov(phi,pt)_: " << score.row(6).mean() << "\n" - << "cov(a0,pt)_: " << score.row(7).mean() << "\n" - << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" - << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" - << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" - << "correlation matrix:\n" - << correlation << "\n\n" - << endl; - - phi_pull.Fit("gaus", "Q"); - dxy_pull.Fit("gaus", "Q"); - dz_pull.Fit("gaus", "Q"); - theta_pull.Fit("gaus", "Q"); - pt_pull.Fit("gaus", "Q"); - phi_pull.Write(); - dxy_pull.Write(); - dz_pull.Write(); - theta_pull.Write(); - pt_pull.Write(); - phi_error.Write(); - dxy_error.Write(); - dz_error.Write(); - theta_error.Write(); - pt_error.Write(); -} - - void test_helix_fit() { int n_; + int iteration; + int debug2 = 0; bool return_err; const double B_field = 3.8 * c_speed / pow(10, 9) / 100; Matrix gen_par; Vector5d true_par; Vector5d err; - generator.seed(1); - std::cout << std::setprecision(6); - cout << "_________________________________________________________________________\n"; - cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration return_err debug" << endl; - cout << "hits: "; - cin >> n_; - cout << "x: "; - cin >> gen_par(0); - cout << "y: "; - cin >> gen_par(1); - cout << "z: "; - cin >> gen_par(2); - cout << "phi: "; - cin >> gen_par(3); - cout << "p_t: "; - cin >> gen_par(4); - cout << "eta: "; - cin >> gen_par(5); - // - /* - n_ = 4; - gen_par(0) = -0.1; // x - gen_par(1) = 0.1; // y - gen_par(2) = -1.; // z - gen_par(3) = 45.; // phi - gen_par(4) = 10.; // R (p_t) - gen_par(5) = 1.; // eta - 
iteration = 1; - */ - return_err = 1; +// while (1) { + generator.seed(1); + int debug = 0; + debug2 = 0; + std::cout << std::setprecision(6); + cout << "_________________________________________________________________________\n"; + cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration return_err debug" << endl; +// cin >> n_ >> gen_par(0) >> gen_par(1) >> gen_par(2) >> gen_par(3) >> gen_par(4) >> gen_par(5) >> +// iteration >> return_err >> debug2; + n_ = 4; + gen_par(0) = -0.1; // x + gen_par(1) = 0.1; // y + gen_par(2) = -1.; // z + gen_par(3) = 45.; // phi + gen_par(4) = 10.; // R (p_t) + gen_par(5) = 1.; // eta + iteration = 1; + return_err = 1; + debug2 = 1; - const int iteration = 5000; - gen_par = New_par(gen_par, 1, B_field); - true_par = True_par(gen_par, 1, B_field); - Matrix3xNd hits; - Matrix3Nd hits_cov; - std::array helixRiemann_fit; -// std::array helixBrokenLine_fit; + iteration *= 10; + gen_par = New_par(gen_par, 1, B_field); + true_par = True_par(gen_par, 1, B_field); + Matrix3xNd hits; + Matrix3Nd hits_cov; + unique_ptr helix(new helix_fit[iteration]); +// helix_fit* helix = new helix_fit[iteration]; + Matrix score(41, iteration); - std::cout << "\nTrue parameters: " - << "phi: " << true_par(0) << " " - << "dxy: " << true_par(1) << " " - << "pt: " << true_par(2) << " " - << "CotT: " << true_par(3) << " " - << "Zip: " << true_par(4) << " " - << std::endl; - for (int i = 0; i < iteration; i++) { - hits_gen gen; - gen = Hits_gen(n_, gen_par); - // gen.hits = MatrixXd::Zero(3, 4); - // gen.hits_cov = MatrixXd::Zero(3 * 4, 3 * 4); - // gen.hits.col(0) << 1.82917642593, 2.0411875248, 7.18495464325; - // gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; - // gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; - // gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; - helixRiemann_fit[i] = Rfit::Helix_fit(gen.hits, gen.hits_cov, B_field, return_err); -// helixBrokenLine_fit[i] = BrokenLine::Helix_fit(gen.hits, gen.hits_cov, B_field); + for (int i = 0; i < iteration; i++) { + if (debug2 == 1 && i == (iteration - 1)) { + debug = 1; + } + hits_gen gen; + gen = Hits_gen(n_, gen_par); +// gen.hits = MatrixXd::Zero(3, 4); +// gen.hits_cov = MatrixXd::Zero(3 * 4, 3 * 4); +// gen.hits.col(0) << 1.82917642593, 2.0411875248, 7.18495464325; +// gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; +// gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; +// gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; + helix[i] = Rfit::Helix_fit(gen.hits, gen.hits_cov, B_field, return_err); - std::cout << std::endl; - /* - if (debug) - cout << std::setprecision(6) - << "phi: " << helixRiemann_fit[i].par(0) << " +/- " << sqrt(helixRiemann_fit[i].cov(0, 0)) << " vs " - << true_par(0) << endl - << "Tip: " << helixRiemann_fit[i].par(1) << " +/- " << sqrt(helixRiemann_fit[i].cov(1, 1)) << " vs " - << true_par(1) << endl - << "p_t: " << helixRiemann_fit[i].par(2) << " +/- " << sqrt(helixRiemann_fit[i].cov(2, 2)) << " vs " - << true_par(2) << endl - << "theta:" << helixRiemann_fit[i].par(3) << " +/- " << sqrt(helixRiemann_fit[i].cov(3, 3)) << " vs " - << true_par(3) << endl - << "Zip: " << helixRiemann_fit[i].par(4) << " +/- " << sqrt(helixRiemann_fit[i].cov(4, 4)) << " vs " - << true_par(4) << endl - << "charge:" << helixRiemann_fit[i].q << " vs 1" << endl - << "covariance matrix:" << endl - << helixRiemann_fit[i].cov << endl - << "Initial hits:\n" << gen.hits << endl - << "Initial Covariance:\n" << gen.hits_cov << endl; - 
*/ - } - computePull(helixRiemann_fit, "Riemann", n_, iteration, true_par); -// computePull(helixBrokenLine_fit, "BrokenLine", n_, iteration, true_par); + if (debug) + cout << std::setprecision(10) + << "phi: " << helix[i].par(0) << " +/- " << sqrt(helix[i].cov(0, 0)) << " vs " + << true_par(0) << endl + << "Tip: " << helix[i].par(1) << " +/- " << sqrt(helix[i].cov(1, 1)) << " vs " + << true_par(1) << endl + << "p_t: " << helix[i].par(2) << " +/- " << sqrt(helix[i].cov(2, 2)) << " vs " + << true_par(2) << endl + << "theta:" << helix[i].par(3) << " +/- " << sqrt(helix[i].cov(3, 3)) << " vs " + << true_par(3) << endl + << "Zip: " << helix[i].par(4) << " +/- " << sqrt(helix[i].cov(4, 4)) << " vs " + << true_par(4) << endl + << "charge:" << helix[i].q << " vs 1" << endl + << "covariance matrix:" << endl + << helix[i].cov << endl + << "Initial hits:\n" << gen.hits << endl + << "Initial Covariance:\n" << gen.hits_cov << endl; + } + + for (int x = 0; x < iteration; x++) { + // Compute PULLS information + score(0, x) = (helix[x].par(0) - true_par(0)) / sqrt(helix[x].cov(0, 0)); + score(1, x) = (helix[x].par(1) - true_par(1)) / sqrt(helix[x].cov(1, 1)); + score(2, x) = (helix[x].par(2) - true_par(2)) / sqrt(helix[x].cov(2, 2)); + score(3, x) = (helix[x].par(3) - true_par(3)) / sqrt(helix[x].cov(3, 3)); + score(4, x) = (helix[x].par(4) - true_par(4)) / sqrt(helix[x].cov(4, 4)); + score(5, x) = + (helix[x].par(0) - true_par(0)) * (helix[x].par(1) - true_par(1)) / (helix[x].cov(0, 1)); + score(6, x) = + (helix[x].par(0) - true_par(0)) * (helix[x].par(2) - true_par(2)) / (helix[x].cov(0, 2)); + score(7, x) = + (helix[x].par(1) - true_par(1)) * (helix[x].par(2) - true_par(2)) / (helix[x].cov(1, 2)); + score(8, x) = + (helix[x].par(3) - true_par(3)) * (helix[x].par(4) - true_par(4)) / (helix[x].cov(3, 4)); + score(9, x) = helix[x].chi2_circle; + score(25, x) = helix[x].chi2_line; + score(10, x) = sqrt(helix[x].cov(0, 0)) / helix[x].par(0) * 100; + score(13, x) = sqrt(helix[x].cov(3, 3)) / helix[x].par(3) * 100; + score(14, x) = sqrt(helix[x].cov(4, 4)) / helix[x].par(4) * 100; + score(15, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(3) - true_par(3)) / + sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(3, 3)); + score(16, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(3) - true_par(3)) / + sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(3, 3)); + score(17, x) = (helix[x].par(2) - true_par(2)) * (helix[x].par(3) - true_par(3)) / + sqrt(helix[x].cov(2, 2)) / sqrt(helix[x].cov(3, 3)); + score(18, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(4) - true_par(4)) / + sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(4, 4)); + score(19, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(4) - true_par(4)) / + sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(4, 4)); + score(20, x) = (helix[x].par(2) - true_par(2)) * (helix[x].par(4) - true_par(4)) / + sqrt(helix[x].cov(2, 2)) / sqrt(helix[x].cov(4, 4)); + score(21, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(1) - true_par(1)) / + sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(1, 1)); + score(22, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(2) - true_par(2)) / + sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(2, 2)); + score(23, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(2) - true_par(2)) / + sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(2, 2)); + score(24, x) = (helix[x].par(3) - true_par(3)) * (helix[x].par(4) - true_par(4)) / + sqrt(helix[x].cov(3, 3)) / sqrt(helix[x].cov(4, 4)); + } + + double phi_ = score.row(0).mean(); + double a_ 
= score.row(1).mean(); + double pt_ = score.row(2).mean(); + double coT_ = score.row(3).mean(); + double Zip_ = score.row(4).mean(); + Matrix5d correlation; + correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), + score.row(20).mean(), score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), + score.row(19).mean(), score.row(22).mean(), score.row(23).mean(), 1., score.row(17).mean(), + score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), score.row(17).mean(), 1., + score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), + score.row(24).mean(), 1.; + + cout << "\nPULLS:\n" + << "phi: " << phi_ << " " + << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " + << abs(score.row(10).mean()) << "%\n" + << "a0 : " << a_ << " " + << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " + << abs(score.row(11).mean()) << "%\n" + << "pt : " << pt_ << " " + << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " + << abs(score.row(12).mean()) << "%\n" + << "coT: " << coT_ << " " + << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " + << abs(score.row(13).mean()) << "%\n" + << "Zip: " << Zip_ << " " + << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << " " + << abs(score.row(14).mean()) << "%\n\n" + << "cov(phi,a0)_: " << score.row(5).mean() << "\n" + << "cov(phi,pt)_: " << score.row(6).mean() << "\n" + << "cov(a0,pt)_: " << score.row(7).mean() << "\n" + << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" + << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" + << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" + << "correlation matrix:\n" + << correlation << "\n\n" + << endl; +// } } int main() { - TFile f("TestFitResults.root", "RECREATE"); test_helix_fit(); - f.Close(); return 0; } diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index 485fac34b00b2..7b1125eebc312 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -29,10 +29,12 @@ void kernelFullFit(Rfit::Matrix3xNd * hits, Rfit::Matrix2Nd hits_cov2D_local = (hits_cov->block(0, 0, 2 * n, 2 * n)).eval(); Rfit::printIt(&hits2D_local, "kernelFullFit - hits2D_local: "); Rfit::printIt(&hits_cov2D_local, "kernelFullFit - hits_cov2D_local: "); + /* printf("kernelFullFit - hits address: %p\n", hits); printf("kernelFullFit - hits_cov address: %p\n", hits_cov); printf("kernelFullFit - hits_cov2D address: %p\n", &hits2D_local); printf("kernelFullFit - hits_cov2D_local address: %p\n", &hits_cov2D_local); + */ /* At some point I gave up and locally construct block on the stack, so that the next invocation to Rfit::Circle_fit works properly. Failing to do so implied basically an empty collection of hits and covariances. That could @@ -41,20 +43,60 @@ void kernelFullFit(Rfit::Matrix3xNd * hits, creations of the blocks. To be understood and compared against the myriad of compilation warnings we have. 
*/ - (*circle_fit_resultsGPU) = Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), - fast_fit, rad, B, errors); + fast_fit, rad, B, errors); /* - (*circle_fit_resultsGPU) = - Rfit::Circle_fit(hits2D_local, hits_cov2D_local, - fast_fit, rad, B, errors); - */ - (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, B, errors); + (*circle_fit_resultsGPU) = + Rfit::Circle_fit(hits2D_local, hits_cov2D_local, + fast_fit, rad, B, errors, scattering); + */ + (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, errors); return; } +__global__ +void kernelFastFit(Rfit::Matrix3xNd * hits, Vector4d * results) { + (*results) = Rfit::Fast_fit(*hits); +} + +__global__ +void kernelCircleFit(Rfit::Matrix3xNd * hits, + Rfit::Matrix3Nd * hits_cov, Vector4d * fast_fit_input, double B, + Rfit::circle_fit * circle_fit_resultsGPU) { + u_int n = hits->cols(); + Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); + +#if TEST_DEBUG + printf("fast_fit_input(0): %f\n", (*fast_fit_input)(0)); + printf("fast_fit_input(1): %f\n", (*fast_fit_input)(1)); + printf("fast_fit_input(2): %f\n", (*fast_fit_input)(2)); + printf("fast_fit_input(3): %f\n", (*fast_fit_input)(3)); + printf("rad(0,0): %f\n", rad(0,0)); + printf("rad(1,1): %f\n", rad(1,1)); + printf("rad(2,2): %f\n", rad(2,2)); + printf("hits_cov(0,0): %f\n", (*hits_cov)(0,0)); + printf("hits_cov(1,1): %f\n", (*hits_cov)(1,1)); + printf("hits_cov(2,2): %f\n", (*hits_cov)(2,2)); + printf("hits_cov(11,11): %f\n", (*hits_cov)(11,11)); + printf("B: %f\n", B); +#endif + (*circle_fit_resultsGPU) = + Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), + *fast_fit_input, rad, B, false); +} + +__global__ +void kernelLineFit(Rfit::Matrix3xNd * hits, + Rfit::Matrix3Nd * hits_cov, + Rfit::circle_fit * circle_fit, + Vector4d * fast_fit, + Rfit::line_fit * line_fit) +{ + (*line_fit) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit, *fast_fit, true); +} + void fillHitsAndHitsCov(Rfit::Matrix3xNd & hits, Rfit::Matrix3Nd & hits_cov) { hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, @@ -77,25 +119,98 @@ void fillHitsAndHitsCov(Rfit::Matrix3xNd & hits, Rfit::Matrix3Nd & hits_cov) { hits_cov(3,7) = hits_cov(7,3) = -5.28e-06; } -void testFitOneGo(bool errors, double epsilon=1e-6) { +void testFit() { constexpr double B = 0.0113921; Rfit::Matrix3xNd hits(3,4); Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); + Rfit::Matrix3xNd * hitsGPU = new Rfit::Matrix3xNd(3,4); + Rfit::Matrix3Nd * hits_covGPU = nullptr; + Vector4d * fast_fit_resultsGPU = new Vector4d(); + Vector4d * fast_fit_resultsGPUret = new Vector4d(); + Rfit::circle_fit * circle_fit_resultsGPU = new Rfit::circle_fit(); + Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); fillHitsAndHitsCov(hits, hits_cov); // FAST_FIT_CPU Vector4d fast_fit_results = Rfit::Fast_fit(hits); +#if TEST_DEBUG + std::cout << "Generated hits:\n" << hits << std::endl; +#endif + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; + + // FAST_FIT GPU + cudaMalloc((void**)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4))); + cudaMalloc((void**)&fast_fit_resultsGPU, sizeof(Vector4d)); + cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice); + + kernelFastFit<<<1, 1>>>(hitsGPU, fast_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, sizeof(Vector4d), 
cudaMemcpyDeviceToHost); + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << *fast_fit_resultsGPUret << std::endl; + assert(isEqualFuzzy(fast_fit_results, (*fast_fit_resultsGPUret))); + // CIRCLE_FIT CPU u_int n = hits.cols(); Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), + hits_cov.block(0, 0, 2 * n, 2 * n), + fast_fit_results, rad, B, false); + std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; + + // CIRCLE_FIT GPU + cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12))); + cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit)); + cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice); + + kernelCircleFit<<<1,1>>>(hitsGPU, hits_covGPU, + fast_fit_resultsGPU, B, circle_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, + sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); + std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); + + // LINE_FIT CPU + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, fast_fit_results, true); + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; + + // LINE_FIT GPU + Rfit::line_fit * line_fit_resultsGPU = nullptr; + Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); + + cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit)); + + kernelLineFit<<<1,1>>>(hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); + std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); +} + +void testFitOneGo(bool errors, double epsilon=1e-6) { + constexpr double B = 0.0113921; + Rfit::Matrix3xNd hits(3,4); + Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); + + fillHitsAndHitsCov(hits, hits_cov); + + // FAST_FIT_CPU + Vector4d fast_fit_results = Rfit::Fast_fit(hits); + // CIRCLE_FIT CPU + u_int n = hits.cols(); + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + + Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), hits_cov.block(0, 0, 2 * n, 2 * n), fast_fit_results, rad, B, errors); // LINE_FIT CPU Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, - fast_fit_results, B, errors); + fast_fit_results, errors); // FIT GPU std::cout << "GPU FIT" << std::endl; @@ -138,8 +253,10 @@ void testFitOneGo(bool errors, double epsilon=1e-6) { } int main (int argc, char * argv[]) { +// testFit(); + std::cout << "TEST FIT, NO ERRORS" << std::endl; + testFitOneGo(false); - cudaDeviceSetLimit(cudaLimitStackSize, 32*1024); std::cout << "TEST FIT, ERRORS AND SCATTER" << std::endl; testFitOneGo(true, 1e-5); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 8956caca42899..77cb6c4da68a4 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,5 +1,6 @@ + @@ -10,7 +11,7 @@ + - diff --git 
a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index ac81473c843cb..f14650a8d92ff 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -1223,6 +1223,12 @@ def _trackingFolders(lastDirName="Track"): + _makeMVAPlots(3) \ + _makeMVAPlots(3, hp=True) # add more if needed +_buildingExtendedPlots = [ + _pulls, + _resolutionsEta, + _resolutionsPt, + _tuning, +] _extendedPlots = [ _extDistPtEtaPhi, _extDistDxyDzBS, @@ -1279,7 +1285,7 @@ def _trackingFolders(lastDirName="Track"): ] plotter = Plotter() plotterExt = Plotter() -def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, onlyForElectron=False, onlyForConversion=False, onlyForBHadron=False, seeding=False, rawSummary=False, highPuritySummary=True): +def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, onlyForElectron=False, onlyForConversion=False, onlyForBHadron=False, seeding=False, building=False, rawSummary=False, highPuritySummary=True): folders = _trackingFolders(lastDirName) # to keep backward compatibility, this set of plots has empty name limiters = dict(onlyForPileup=onlyForPileup, onlyForElectron=onlyForElectron, onlyForConversion=onlyForConversion, onlyForBHadron=onlyForBHadron) @@ -1290,7 +1296,11 @@ def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, only _trackingSubFoldersFallbackSLHC_Phase1PU140, _trackingSubFoldersFallbackFromPV, _trackingSubFoldersFallbackConversion]) plotter.append(name, folders, TrackingPlotFolder(*algoPlots, **commonForTPF), **common) - plotterExt.append(name, folders, TrackingPlotFolder(*_extendedPlots, **commonForTPF), **common) + extendedPlots = [] + if building: + extendedPlots.extend(_buildingExtendedPlots) + extendedPlots.extend(_extendedPlots) + plotterExt.append(name, folders, TrackingPlotFolder(*extendedPlots, **commonForTPF), **common) summaryName = "" if name != "": @@ -1330,7 +1340,7 @@ def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, only _appendTrackingPlots("TrackFromPVAllTP", "fromPVAllTP", _simBasedPlots+_recoBasedPlots, onlyForPileup=True) _appendTrackingPlots("TrackFromPVAllTP2", "fromPVAllTP2", _simBasedPlots+_recoBasedPlots, onlyForPileup=True) _appendTrackingPlots("TrackSeeding", "seeding", _seedingBuildingPlots, seeding=True) -_appendTrackingPlots("TrackBuilding", "building", _seedingBuildingPlots) +_appendTrackingPlots("TrackBuilding", "building", _seedingBuildingPlots, building=True) _appendTrackingPlots("TrackConversion", "conversion", _simBasedPlots+_recoBasedPlots, onlyForConversion=True, rawSummary=True, highPuritySummary=False) _appendTrackingPlots("TrackGsf", "gsf", _simBasedPlots+_recoBasedPlots, onlyForElectron=True, rawSummary=True, highPuritySummary=False) _appendTrackingPlots("TrackBHadron", "bhadron", _simBasedPlots+_recoBasedPlots, onlyForBHadron=True) From ffa2d9502823602aa69ac0daffbe726b02c79383 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 28 Nov 2018 18:06:28 +0100 Subject: [PATCH 031/102] Migrate tracker local reconstruction and pixel tracking to Tasks (backport #25163) (cms-patatrack#202) Backport "Migrate tracker local reconstruction and pixel tracking to Tasks" (#25163) to the Patatrack branch: - migrate RecoLocalTracker_cff to Tasks; - migrate RecoPixelVertexing_cff to Tasks; - keeping sequences to avoid massive migration (for now). 
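For reference, the pattern applied in the diff below is, in a minimal sketch (the module names here are placeholders, not taken from the release):

    import FWCore.ParameterSet.Config as cms
    # hypothetical producers standing in for the real reconstruction modules
    fooProducer = cms.EDProducer("FooProducer")
    barProducer = cms.EDProducer("BarProducer")
    # a Task holds the modules without imposing an execution order;
    # the framework schedules them on demand
    fooTask = cms.Task(fooProducer, barProducer)
    # keeping a Sequence that wraps the Task preserves the old name,
    # so existing configurations that still use the Sequence keep working
    fooSequence = cms.Sequence(fooTask)
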
--- .../python/PixelTracks_cff.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index 16803a957c928..728b3fec47f39 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -67,24 +67,26 @@ ) trackingLowPU.toModify(pixelTracks, SeedingHitSets = "pixelTracksHitTriplets") -pixelTracksSequence = cms.Sequence( - pixelTracksTrackingRegions + - pixelFitterByHelixProjections + - pixelTrackFilterByKinematics + - pixelTracksSeedLayers + - pixelTracksHitDoublets + - pixelTracksHitQuadruplets + +pixelTracksTask = cms.Task( + pixelTracksTrackingRegions, + pixelFitterByHelixProjections, + pixelTrackFilterByKinematics, + pixelTracksSeedLayers, + pixelTracksHitDoublets, + pixelTracksHitQuadruplets, pixelTracks ) -_pixelTracksSequence_lowPU = pixelTracksSequence.copy() -_pixelTracksSequence_lowPU.replace(pixelTracksHitQuadruplets, pixelTracksHitTriplets) -trackingLowPU.toReplaceWith(pixelTracksSequence, _pixelTracksSequence_lowPU) +_pixelTracksTask_lowPU = pixelTracksTask.copy() +_pixelTracksTask_lowPU.replace(pixelTracksHitQuadruplets, pixelTracksHitTriplets) +trackingLowPU.toReplaceWith(pixelTracksTask, _pixelTracksTask_lowPU) # Use Riemann fit and substitute previous Fitter producer with the Riemann one from Configuration.ProcessModifiers.riemannFit_cff import riemannFit from Configuration.ProcessModifiers.riemannFitGPU_cff import riemannFitGPU riemannFit.toModify(pixelTracks, Fitter = "pixelFitterByRiemannParaboloid") riemannFitGPU.toModify(pixelTracks, runOnGPU = True) -_pixelTracksSequence_riemannFit = pixelTracksSequence.copy() -_pixelTracksSequence_riemannFit.replace(pixelFitterByHelixProjections, pixelFitterByRiemannParaboloid) -riemannFit.toReplaceWith(pixelTracksSequence, _pixelTracksSequence_riemannFit) +_pixelTracksTask_riemannFit = pixelTracksTask.copy() +_pixelTracksTask_riemannFit.replace(pixelFitterByHelixProjections, pixelFitterByRiemannParaboloid) +riemannFit.toReplaceWith(pixelTracksTask, _pixelTracksTask_riemannFit) + +pixelTracksSequence = cms.Sequence(pixelTracksTask) From adea719e1ca5b5be2bc1fad37f04c725b2097a4e Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 29 Nov 2018 02:17:50 -0600 Subject: [PATCH 032/102] Fix MTV validation of initialStepPreSplitting tracks and add B-hadron MTV variation to pixel track validation sequence (cms-patatrack#199) - add B-hadron MTV variation to pixel track validation sequence - fix MTV validation of initialStepPreSplitting tracks --- .../python/PostProcessorTracker_cfi.py | 4 +- .../RecoTrack/python/TrackValidation_cff.py | 276 ++++++++++-------- Validation/RecoTrack/python/plotting/html.py | 4 + .../python/plotting/trackingPlots.py | 6 +- 4 files changed, 164 insertions(+), 126 deletions(-) diff --git a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py index a926b19d4321a..6b5a19f799035 100644 --- a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py +++ b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py @@ -270,9 +270,9 @@ def _addNoFlow(module): ) postProcessorTrackTrackingOnly = postProcessorTrack.clone() -postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackSeeding/*", "Tracking/PixelTrack/*", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*"]) 
+postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackSeeding/*", "Tracking/PixelTrack/*", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*", "Tracking/PixelTrackBHadron/*"]) postProcessorTrackSummaryTrackingOnly = postProcessorTrackSummary.clone() -postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackSeeding", "Tracking/PixelTrack", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*"]) +postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackSeeding", "Tracking/PixelTrack", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*", "Tracking/PixelTrackBHadron/*"]) postProcessorTrackSequenceTrackingOnly = cms.Sequence( postProcessorTrackTrackingOnly+ diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index a90d22e8518cf..bc84c87cf191f 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -33,8 +33,10 @@ _trackProd = [] locals()["_algos"+_postfix] = ["generalTracks"] + _cfg.iterationAlgos(_postfix) + ["duplicateMerge"] - locals()["_seedProducers"+_postfix] = _seedProd + _cfg.seedProducers(_postfix) - locals()["_trackProducers"+_postfix] = _trackProd + _cfg.trackProducers(_postfix) + locals()["_seedProducersPreSplitting"+_postfix] = _seedProd + locals()["_trackProducersPreSplitting"+_postfix] = _trackProd + locals()["_seedProducers"+_postfix] = _cfg.seedProducers(_postfix) + locals()["_trackProducers"+_postfix] = _cfg.trackProducers(_postfix) if _eraName != "trackingPhase2PU140": locals()["_electronSeedProducers"+_postfix] = ["tripletElectronSeeds", "pixelPairElectronSeeds", "stripPairElectronSeeds"] @@ -61,7 +63,7 @@ def _algoToSelector(algo): def _addSelectorsByAlgo(algos, modDict): names = [] - seq = cms.Sequence() + task = cms.Task() for algo in algos: if algo == "generalTracks": continue @@ -72,10 +74,10 @@ def _addSelectorsByAlgo(algos, modDict): else: mod = modDict[modName] names.append(modName) - seq += mod - return (names, seq) + task.add(mod) + return (names, task) def _addSelectorsByHp(algos, modDict): - seq = cms.Sequence() + task = cms.Task() names = [] for algo in algos: modName = _algoToSelector(algo) @@ -89,10 +91,10 @@ def _addSelectorsByHp(algos, modDict): else: mod = modDict[modNameHp] names.append(modNameHp) - seq += mod - return (names, seq) + task.add(mod) + return (names, task) def _addSelectorsBySrc(modules, midfix, src, modDict): - seq = cms.Sequence() + task = cms.Task() names = [] for modName in modules: modNameNew = modName.replace("cutsRecoTracks", "cutsRecoTracks"+midfix) @@ -102,10 +104,10 @@ def _addSelectorsBySrc(modules, midfix, src, modDict): else: mod = modDict[modNameNew] names.append(modNameNew) - seq += mod - return (names, seq) + task.add(mod) + return (names, task) def _addSelectorsByOriginalAlgoMask(modules, midfix, algoParam,modDict): - seq = cms.Sequence() + task = cms.Task() names = [] for modName in modules: if modName[-2:] == "Hp": @@ -120,11 +122,11 @@ def _addSelectorsByOriginalAlgoMask(modules, midfix, algoParam,modDict): else: mod = modDict[modNameNew] names.append(modNameNew) - seq += mod - return (names, seq) + task.add(mod) + return (names, task) def _addSeedToTrackProducers(seedProducers,modDict): names = [] - seq = cms.Sequence() + task = cms.Task() for seed in seedProducers: modName = "seedTracks"+seed if modName not in modDict: @@ -133,8 +135,8 @@ def _addSeedToTrackProducers(seedProducers,modDict): else: mod = 
modDict[modName] names.append(modName) - seq += mod - return (names, seq) + task.add(mod) + return (names, task) _relevantEras = _cfg.allEras() _relevantErasAndFastSim = _relevantEras + [("fastSim", "_fastSim", fastSim)] @@ -146,9 +148,9 @@ def _translateArgs(args, postfix, modDict): else: ret.append(modDict[arg+postfix]) return ret -def _sequenceForEachEra(function, args, names, sequence, modDict, plainArgs=[], modifySequence=None, includeFastSim=False): - if sequence[0] != "_": - raise Exception("Sequence name is expected to begin with _") +def _taskForEachEra(function, args, names, task, modDict, plainArgs=[], modifyTask=None, includeFastSim=False): + if task[0] != "_": + raise Exception("Task name is expected to begin with _") _eras = _relevantErasAndFastSim if includeFastSim else _relevantEras for eraName, postfix, _era in _eras: @@ -156,23 +158,23 @@ def _sequenceForEachEra(function, args, names, sequence, modDict, plainArgs=[], _args.extend(plainArgs) ret = function(*_args, modDict=modDict) if len(ret) != 2: - raise Exception("_sequenceForEachEra is expected to return 2 values, but function returned %d" % len(ret)) + raise Exception("_taskForEachEra is expected to return 2 values, but function returned %d" % len(ret)) modDict[names+postfix] = ret[0] - modDict[sequence+postfix] = ret[1] + modDict[task+postfix] = ret[1] - # The sequence of the first era will be the default one - defaultSequenceName = sequence+_eras[0][0] - defaultSequence = modDict[defaultSequenceName] - modDict[defaultSequenceName[1:]] = defaultSequence # remove leading underscore + # The task of the first era will be the default one + defaultTaskName = task+_eras[0][0] + defaultTask = modDict[defaultTaskName] + modDict[defaultTaskName[1:]] = defaultTask # remove leading underscore - # Optionally modify sequences before applying the era - if modifySequence is not None: + # Optionally modify task before applying the era + if modifyTask is not None: for eraName, postfix, _era in _eras: - modifySequence(modDict[sequence+postfix]) + modifyTask(modDict[task+postfix]) # Apply eras for _eraName, _postfix, _era in _eras[1:]: - _era.toReplaceWith(defaultSequence, modDict[sequence+_postfix]) + _era.toReplaceWith(defaultTask, modDict[task+_postfix]) def _setForEra(module, eraName, era, **kwargs): if eraName == "": for key, value in six.iteritems(kwargs): @@ -242,10 +244,10 @@ def _getMVASelectors(postfix): locals()["_mvaSelectors"+_postfix] = _getMVASelectors(_postfix) # Validation iterative steps -_sequenceForEachEra(_addSelectorsByAlgo, args=["_algos"], names="_selectorsByAlgo", sequence="_tracksValidationSelectorsByAlgo", modDict=globals()) +_taskForEachEra(_addSelectorsByAlgo, args=["_algos"], names="_selectorsByAlgo", task="_tracksValidationSelectorsByAlgo", modDict=globals()) # high purity -_sequenceForEachEra(_addSelectorsByHp, args=["_algos"], names="_selectorsByAlgoHp", sequence="_tracksValidationSelectorsByAlgoHp", modDict=globals()) +_taskForEachEra(_addSelectorsByHp, args=["_algos"], names="_selectorsByAlgoHp", task="_tracksValidationSelectorsByAlgoHp", modDict=globals()) # by originalAlgo for _eraName, _postfix, _era in _relevantEras: @@ -254,9 +256,9 @@ def _getMVASelectors(postfix): locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix] = [n for n in locals()["_selectorsByAlgoAndHp"+_postfix] if n not in ["generalTracks", "cutsRecoTracksHp"]] # For ByOriginalAlgo locals()["_selectorsByAlgoAndHpNoGenTkDupMerge"+_postfix] = [n for n in locals()["_selectorsByAlgoAndHpNoGenTk"+_postfix] if n not in 
["cutsRecoTracksDuplicateMerge", "cutsRecoTracksDuplicateMergeHp"]] -_sequenceForEachEra(_addSelectorsByOriginalAlgoMask, modDict = globals(), +_taskForEachEra(_addSelectorsByOriginalAlgoMask, modDict = globals(), args = ["_selectorsByAlgoAndHpNoGenTkDupMerge"], plainArgs = ["ByOriginalAlgo", "originalAlgorithm"], - names = "_selectorsByOriginalAlgo", sequence = "_tracksValidationSelectorsByOriginalAlgo") + names = "_selectorsByOriginalAlgo", task = "_tracksValidationSelectorsByOriginalAlgo") for _eraName, _postfix, _era in _relevantEras: @@ -296,11 +298,11 @@ def _getMVASelectors(postfix): # select tracks with pT > 0.9 GeV (for upgrade fake rates) generalTracksPt09 = cutsRecoTracks_cfi.cutsRecoTracks.clone(ptMin=0.9) # and then the selectors -_sequenceForEachEra(_addSelectorsBySrc, modDict=globals(), - args=[["_generalTracksHp"]], - plainArgs=["Pt09", "generalTracksPt09"], - names="_selectorsPt09", sequence="_tracksValidationSelectorsPt09", - modifySequence=lambda seq:seq.insert(0, generalTracksPt09)) +_taskForEachEra(_addSelectorsBySrc, modDict=globals(), + args=[["_generalTracksHp"]], + plainArgs=["Pt09", "generalTracksPt09"], + names="_selectorsPt09", task="_tracksValidationSelectorsPt09", + modifyTask=lambda task:task.add(generalTracksPt09)) # select tracks from the PV from CommonTools.RecoAlgos.TrackWithVertexRefSelector_cfi import trackWithVertexRefSelector as _trackWithVertexRefSelector @@ -317,20 +319,20 @@ def _getMVASelectors(postfix): rhoVtx = 1e10, # intentionally no dxy cut ) # and then the selectors -_sequenceForEachEra(_addSelectorsBySrc, modDict=globals(), +_taskForEachEra(_addSelectorsBySrc, modDict=globals(), args=[["_generalTracksHp"]], plainArgs=["FromPV", "generalTracksFromPV"], - names="_selectorsFromPV", sequence="_tracksValidationSelectorsFromPV", - modifySequence=lambda seq: seq.insert(0, generalTracksFromPV)) + names="_selectorsFromPV", task="_tracksValidationSelectorsFromPV", + modifyTask=lambda task: task.add(generalTracksFromPV)) # select tracks with pT > 0.9 GeV from the PV generalTracksFromPVPt09 = generalTracksPt09.clone(src="generalTracksFromPV") # and then the selectors -_sequenceForEachEra(_addSelectorsBySrc, modDict=globals(), - args=[["_generalTracksHp"]], - plainArgs=["FromPVPt09", "generalTracksFromPVPt09"], - names="_selectorsFromPVPt09", sequence="_tracksValidationSelectorsFromPVPt09", - modifySequence=lambda seq: seq.insert(0, generalTracksFromPVPt09)) +_taskForEachEra(_addSelectorsBySrc, modDict=globals(), + args=[["_generalTracksHp"]], + plainArgs=["FromPVPt09", "generalTracksFromPVPt09"], + names="_selectorsFromPVPt09", task="_tracksValidationSelectorsFromPVPt09", + modifyTask=lambda task: task.add(generalTracksFromPVPt09)) ## Select conversion TrackingParticles, and define the corresponding associator trackingParticlesConversion = _trackingParticleConversionRefSelector.clone() @@ -447,6 +449,15 @@ def _getMVASelectors(postfix): _setForEra(trackValidatorAllTPEffic, _eraName, _era, label = ["generalTracks", locals()["_generalTracksHp"+_postfix]]) # Built tracks, in the standard sequence mainly for monitoring the track selection MVA +tpClusterProducerPreSplitting = tpClusterProducer.clone(pixelClusterSrc = "siPixelClustersPreSplitting") +quickTrackAssociatorByHitsPreSplitting = quickTrackAssociatorByHits.clone(cluster2TPSrc = "tpClusterProducerPreSplitting") +tpClusterProducerHeterogeneousPreSplitting = tpClusterProducerHeterogeneous.clone( + pixelClusterSrc = "siPixelClustersPreSplitting" +) +from Configuration.ProcessModifiers.gpu_cff import 
gpu +gpu.toReplaceWith(tpClusterProducerPreSplitting, tpClusterProducerConverter.clone( + src = "tpClusterProducerHeterogeneousPreSplitting" +)) _trackValidatorSeedingBuilding = trackValidator.clone( # common for built tracks and seeds (in trackingOnly) associators = ["quickTrackAssociatorByHits"], UseAssociators = True, @@ -459,11 +470,17 @@ def _getMVASelectors(postfix): dirName = "Tracking/TrackBuilding/", doMVAPlots = True, ) +trackValidatorBuildingPreSplitting = trackValidatorBuilding.clone( + associators = ["quickTrackAssociatorByHitsPreSplitting"], + doMVAPlots = False, + doSummaryPlots = False, +) for _eraName, _postfix, _era in _relevantErasAndFastSim: _setForEra(trackValidatorBuilding, _eraName, _era, label = locals()["_trackProducers"+_postfix]) fastSim.toModify(trackValidatorBuilding, doMVAPlots=False) for _eraName, _postfix, _era in _relevantEras: _setForEra(trackValidatorBuilding, _eraName, _era, mvaLabels = locals()["_mvaSelectors"+_postfix]) + _setForEra(trackValidatorBuildingPreSplitting, _eraName, _era, label = locals()["_trackProducersPreSplitting"+_postfix]) # For conversions @@ -534,31 +551,34 @@ def _uniqueFirstLayers(layerList): # the track selectors -tracksValidationSelectors = cms.Sequence( - tracksValidationSelectorsByAlgo + - tracksValidationSelectorsByAlgoHp + - tracksValidationSelectorsByOriginalAlgo + - cutsRecoTracksBtvLike + - ak4JetTracksAssociatorExplicitAll + +tracksValidationSelectors = cms.Task( + tracksValidationSelectorsByAlgo, + tracksValidationSelectorsByAlgoHp, + tracksValidationSelectorsByOriginalAlgo, + cutsRecoTracksBtvLike, + ak4JetTracksAssociatorExplicitAll, cutsRecoTracksAK4PFJets ) -tracksValidationTruth = cms.Sequence( - tpClusterProducer + - quickTrackAssociatorByHits + - trackingParticleRecoTrackAsssociation + - VertexAssociatorByPositionAndTracks + +tracksValidationTruth = cms.Task( + tpClusterProducer, + tpClusterProducerHeterogeneousPreSplitting, + tpClusterProducerPreSplitting, + quickTrackAssociatorByHits, + quickTrackAssociatorByHitsPreSplitting, + trackingParticleRecoTrackAsssociation, + VertexAssociatorByPositionAndTracks, trackingParticleNumberOfLayersProducer ) fastSim.toModify(tracksValidationTruth, lambda x: x.remove(tpClusterProducer)) -tracksPreValidation = cms.Sequence( - tracksValidationSelectors + - tracksValidationSelectorsPt09 + - tracksValidationSelectorsFromPV + - tracksValidationSelectorsFromPVPt09 + - tracksValidationTruth + - cms.ignore(trackingParticlesSignal) + - cms.ignore(trackingParticlesElectron) + +tracksPreValidation = cms.Task( + tracksValidationSelectors, + tracksValidationSelectorsPt09, + tracksValidationSelectorsFromPV, + tracksValidationSelectorsFromPVPt09, + tracksValidationTruth, + trackingParticlesSignal, + trackingParticlesElectron, trackingParticlesConversion ) fastSim.toReplaceWith(tracksPreValidation, tracksPreValidation.copyAndExclude([ @@ -567,17 +587,19 @@ def _uniqueFirstLayers(layerList): ])) tracksValidation = cms.Sequence( - tracksPreValidation + trackValidator + trackValidatorTPPtLess09 + trackValidatorFromPV + trackValidatorFromPVAllTP + trackValidatorAllTPEffic + trackValidatorBuilding + + trackValidatorBuildingPreSplitting + trackValidatorConversion + - trackValidatorGsfTracks + trackValidatorGsfTracks, + tracksPreValidation ) fastSim.toReplaceWith(tracksValidation, tracksValidation.copyAndExclude([ + trackValidatorBuildingPreSplitting, trackValidatorConversion, trackValidatorGsfTracks, ])) @@ -585,27 +607,27 @@ def _uniqueFirstLayers(layerList): ### Then define stuff for standalone 
mode (i.e. MTV with RECO+DIGI input) # Select by originalAlgo and algoMask -_sequenceForEachEra(_addSelectorsByOriginalAlgoMask, modDict = globals(), - args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["ByAlgoMask", "algorithmMaskContains"], - names = "_selectorsByAlgoMask", sequence = "_tracksValidationSelectorsByAlgoMaskStandalone") +_taskForEachEra(_addSelectorsByOriginalAlgoMask, modDict = globals(), + args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["ByAlgoMask", "algorithmMaskContains"], + names = "_selectorsByAlgoMask", task = "_tracksValidationSelectorsByAlgoMaskStandalone") # Select pT>0.9 by iteration # Need to avoid generalTracks+HP because those are already included in the standard validator -_sequenceForEachEra(_addSelectorsBySrc, modDict = globals(), - args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["Pt09", "generalTracksPt09"], - names = "_selectorsPt09Standalone", sequence = "_tracksValidationSelectorsPt09Standalone") +_taskForEachEra(_addSelectorsBySrc, modDict = globals(), + args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["Pt09", "generalTracksPt09"], + names = "_selectorsPt09Standalone", task = "_tracksValidationSelectorsPt09Standalone") # Select fromPV by iteration # Need to avoid generalTracks+HP because those are already included in the standard validator -_sequenceForEachEra(_addSelectorsBySrc, modDict = globals(), - args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["FromPV", "generalTracksFromPV"], - names = "_selectorsFromPVStandalone", sequence = "_tracksValidationSelectorsFromPVStandalone") +_taskForEachEra(_addSelectorsBySrc, modDict = globals(), + args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["FromPV", "generalTracksFromPV"], + names = "_selectorsFromPVStandalone", task = "_tracksValidationSelectorsFromPVStandalone") # Select pt>0.9 and fromPV by iteration # Need to avoid generalTracks+HP because those are already included in the standard validator -_sequenceForEachEra(_addSelectorsBySrc, modDict = globals(), - args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["FromPVPt09", "generalTracksFromPVPt09"], - names = "_selectorsFromPVPt09Standalone", sequence = "_tracksValidationSelectorsFromPVPt09Standalone") +_taskForEachEra(_addSelectorsBySrc, modDict = globals(), + args = ["_selectorsByAlgoAndHpNoGenTk"], plainArgs = ["FromPVPt09", "generalTracksFromPVPt09"], + names = "_selectorsFromPVPt09Standalone", task = "_tracksValidationSelectorsFromPVPt09Standalone") # MTV instances trackValidatorStandalone = trackValidator.clone() @@ -632,13 +654,13 @@ def _uniqueFirstLayers(layerList): # sequences tracksPreValidationStandalone = tracksPreValidation.copy() -tracksPreValidationStandalone += trackingParticlesBHadron +tracksPreValidationStandalone.add(trackingParticlesBHadron) fastSim.toReplaceWith(tracksPreValidationStandalone, tracksPreValidation) -tracksValidationSelectorsStandalone = cms.Sequence( - tracksValidationSelectorsByAlgoMaskStandalone + - tracksValidationSelectorsPt09Standalone + - tracksValidationSelectorsFromPVStandalone + +tracksValidationSelectorsStandalone = cms.Task( + tracksValidationSelectorsByAlgoMaskStandalone, + tracksValidationSelectorsPt09Standalone, + tracksValidationSelectorsFromPVStandalone, tracksValidationSelectorsFromPVPt09Standalone ) @@ -659,16 +681,18 @@ def _uniqueFirstLayers(layerList): tracksValidationStandalone = cms.Sequence( ak4PFL1FastL2L3CorrectorChain + - tracksPreValidationStandalone + - tracksValidationSelectorsStandalone + - trackValidatorsStandalone + trackValidatorsStandalone, + 
tracksPreValidationStandalone, + tracksValidationSelectorsStandalone ) ### TrackingOnly mode (i.e. MTV with DIGI input + tracking-only reconstruction) # selectors tracksValidationSelectorsTrackingOnly = tracksValidationSelectors.copyAndExclude([ak4JetTracksAssociatorExplicitAll,cutsRecoTracksAK4PFJets]) # selectors using track information only (i.e. no PF) -_sequenceForEachEra(_addSeedToTrackProducers, args=["_seedProducers"], names="_seedSelectors", sequence="_tracksValidationSeedSelectorsTrackingOnly", includeFastSim=True, modDict=globals()) +_taskForEachEra(_addSeedToTrackProducers, args=["_seedProducers"], names="_seedSelectors", task="_tracksValidationSeedSelectorsTrackingOnly", includeFastSim=True, modDict=globals()) +_taskForEachEra(_addSeedToTrackProducers, args=["_seedProducersPreSplitting"], names="_seedSelectorsPreSplitting", task="_tracksValidationSeedSelectorsPreSplittingTrackingOnly", modDict=globals()) +tracksValidationSeedSelectorsTrackingOnly.add(tracksValidationSeedSelectorsPreSplittingTrackingOnly) # MTV instances trackValidatorTrackingOnly = trackValidatorStandalone.clone(label = [ x for x in trackValidatorStandalone.label if x != "cutsRecoTracksAK4PFJets"] ) @@ -678,8 +702,16 @@ def _uniqueFirstLayers(layerList): label = _seedSelectors, doSeedPlots = True, ) +trackValidatorSeedingPreSplittingTrackingOnly = trackValidatorSeedingTrackingOnly.clone( + associators = ["quickTrackAssociatorByHitsPreSplitting"], + label = _seedSelectorsPreSplitting, + doSummaryPlots = False, + +) for _eraName, _postfix, _era in _relevantErasAndFastSim: _setForEra(trackValidatorSeedingTrackingOnly, _eraName, _era, label = locals()["_seedSelectors"+_postfix]) +for _eraName, _postfix, _era in _relevantEras: + _setForEra(trackValidatorSeedingPreSplittingTrackingOnly, _eraName, _era, label = locals()["_seedSelectorsPreSplitting"+_postfix]) trackValidatorConversionTrackingOnly = trackValidatorConversion.clone(label = [x for x in trackValidatorConversion.label if x not in ["ckfInOutTracksFromConversions", "ckfOutInTracksFromConversions"]]) @@ -693,41 +725,32 @@ def _uniqueFirstLayers(layerList): trackValidatorsTrackingOnly = _trackValidatorsBase.copy() trackValidatorsTrackingOnly.replace(trackValidatorStandalone, trackValidatorTrackingOnly) trackValidatorsTrackingOnly += trackValidatorSeedingTrackingOnly +trackValidatorsTrackingOnly += trackValidatorSeedingPreSplittingTrackingOnly trackValidatorsTrackingOnly += trackValidatorBuilding +trackValidatorsTrackingOnly += trackValidatorBuildingPreSplitting trackValidatorsTrackingOnly.replace(trackValidatorConversionStandalone, trackValidatorConversionTrackingOnly) trackValidatorsTrackingOnly.remove(trackValidatorGsfTracks) trackValidatorsTrackingOnly.replace(trackValidatorBHadronStandalone, trackValidatorBHadronTrackingOnly) -fastSim.toModify(trackValidatorsTrackingOnly, lambda x: x.remove(trackValidatorConversionTrackingOnly)) -fastSim.toModify(trackValidatorsTrackingOnly, lambda x: x.remove(trackValidatorBHadronTrackingOnly)) +fastSim.toReplaceWith(trackValidatorsTrackingOnly, trackValidatorsTrackingOnly.copyAndExclude([ + trackValidatorBuildingPreSplitting, + trackValidatorSeedingPreSplittingTrackingOnly, + trackValidatorConversionTrackingOnly, + trackValidatorBHadronTrackingOnly +])) tracksValidationTrackingOnly = cms.Sequence( - tracksPreValidationTrackingOnly + - tracksValidationSelectorsStandalone + - tracksValidationSeedSelectorsTrackingOnly + - trackValidatorsTrackingOnly + trackValidatorsTrackingOnly, + tracksPreValidationTrackingOnly, + 
tracksValidationSelectorsStandalone, + tracksValidationSeedSelectorsTrackingOnly ) ### Pixel tracking only mode (placeholder for now) - -tpClusterProducerHeterogeneousPixelTrackingOnly = tpClusterProducerHeterogeneous.clone( - pixelClusterSrc = "siPixelClustersPreSplitting" -) -tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone( - pixelClusterSrc = "siPixelClustersPreSplitting" -) -from Configuration.ProcessModifiers.gpu_cff import gpu -gpu.toReplaceWith(tpClusterProducerPixelTrackingOnly, tpClusterProducerConverter.clone( - src = "tpClusterProducerHeterogeneousPixelTrackingOnly" -)) - -quickTrackAssociatorByHitsPixelTrackingOnly = quickTrackAssociatorByHits.clone( - cluster2TPSrc = "tpClusterProducerPixelTrackingOnly" -) trackingParticlePixelTrackAsssociation = trackingParticleRecoTrackAsssociation.clone( label_tr = "pixelTracks", - associator = "quickTrackAssociatorByHitsPixelTrackingOnly", + associator = "quickTrackAssociatorByHitsPreSplitting", ) PixelVertexAssociatorByPositionAndTracks = VertexAssociatorByPositionAndTracks.clone( trackAssociation = "trackingParticlePixelTrackAsssociation" @@ -770,25 +793,32 @@ def _uniqueFirstLayers(layerList): doSimPlots = False, doSimTrackPlots = False, ) +trackValidatorBHadronPixelTrackingOnly = trackValidatorPixelTrackingOnly.clone( + dirName = "Tracking/PixelTrackBHadron/", + label_tp_effic = "trackingParticlesBHadron", + label_tp_effic_refvector = True, + doSimPlots = True, + doRecoTrackPlots = False, # Fake rate is defined wrt. all TPs, and that is already included in trackValidator + dodEdxPlots = False, +) tracksValidationTruthPixelTrackingOnly = tracksValidationTruth.copy() -tracksValidationTruthPixelTrackingOnly.replace(tpClusterProducer, tpClusterProducerPixelTrackingOnly) -tracksValidationTruthPixelTrackingOnly.replace(quickTrackAssociatorByHits, quickTrackAssociatorByHitsPixelTrackingOnly) tracksValidationTruthPixelTrackingOnly.replace(trackingParticleRecoTrackAsssociation, trackingParticlePixelTrackAsssociation) tracksValidationTruthPixelTrackingOnly.replace(VertexAssociatorByPositionAndTracks, PixelVertexAssociatorByPositionAndTracks) +tracksValidationTruthPixelTrackingOnly.add(trackingParticlesBHadron) -_tracksValidationTruthPixelTrackingOnlyGPU = tracksValidationTruthPixelTrackingOnly.copy() -_tracksValidationTruthPixelTrackingOnlyGPU.insert(0, tpClusterProducerHeterogeneousPixelTrackingOnly) -gpu.toReplaceWith(tracksValidationTruthPixelTrackingOnly, _tracksValidationTruthPixelTrackingOnlyGPU) - +tracksPreValidationPixelTrackingOnly = cms.Task( + tracksValidationTruthPixelTrackingOnly, + trackingParticlesSignal, + pixelTracksFromPV, +) tracksValidationPixelTrackingOnly = cms.Sequence( - tracksValidationTruthPixelTrackingOnly + - cms.ignore(trackingParticlesSignal) + - pixelTracksFromPV + trackValidatorPixelTrackingOnly + trackValidatorFromPVPixelTrackingOnly + - trackValidatorFromPVAllTPPixelTrackingOnly + trackValidatorFromPVAllTPPixelTrackingOnly + + trackValidatorBHadronPixelTrackingOnly, + tracksPreValidationPixelTrackingOnly ) @@ -799,8 +829,8 @@ def _uniqueFirstLayers(layerList): ) tracksValidationLite = cms.Sequence( cutsRecoTracksHp + - tracksValidationTruth + - trackValidatorLite + trackValidatorLite, + tracksValidationTruth ) ## customization for timing diff --git a/Validation/RecoTrack/python/plotting/html.py b/Validation/RecoTrack/python/plotting/html.py index d3f593f6a7586..1cac97b736941 100644 --- a/Validation/RecoTrack/python/plotting/html.py +++ b/Validation/RecoTrack/python/plotting/html.py @@ -63,6 +63,8 
@@ def _allToBTV(s): return s.replace("All", "BTV-like") def _ptCut(s): return s.replace("Tracks", "Tracks pT > 0.9 GeV").replace("tracks", "tracks pT > 0.9 GeV") +def _allToPixel(s): + return s.replace("All", "Pixel") def _toPixel(s): return s.replace("Tracks", "Pixel tracks") _trackQualityNameOrder = collections.OrderedDict([ @@ -198,6 +200,7 @@ def _toPixel(s): ("pixel", "Pixel tracks"), ("pixelFromPV", _toPixel(_fromPVName)), ("pixelFromPVAllTP", _toPixel(_fromPVAllTPName)), + ("pixelbhadron", _allToPixel(_bhadronName)), # These are for vertices ("genvertex", "Gen vertices"), ("pixelVertices", "Pixel vertices"), @@ -248,6 +251,7 @@ def _sectionNameLegend(): "bhadron_btvLike": _bhadronLegend.replace("All tracks", _btvLegend), "pixelFromPV": _fromPVLegend, "pixelFromPVAllTP": _fromPVAllTPLegend, + "pixelbhadron": _bhadronLegend, } class Table: diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index f14650a8d92ff..a14dbe41dfc9b 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -572,6 +572,8 @@ def _trackingSubFoldersFallbackFromPV(subfolder): return subfolder.replace("trackingParticleRecoAsssociation", "trackingParticleRecoAsssociationSignal") def _trackingSubFoldersFallbackConversion(subfolder): return subfolder.replace("quickAssociatorByHits", "quickAssociatorByHitsConversion") +def _trackingSubFoldersFallbackPreSplitting(subfolder): + return subfolder.replace("quickAssociatorByHits", "quickAssociatorByHitsPreSplitting") # Additional "quality" flags than highPurity. In a separate list to # allow customization. @@ -1294,7 +1296,8 @@ def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, only ], **limiters) common = dict(fallbackDqmSubFolders=[ _trackingSubFoldersFallbackSLHC_Phase1PU140, - _trackingSubFoldersFallbackFromPV, _trackingSubFoldersFallbackConversion]) + _trackingSubFoldersFallbackFromPV, _trackingSubFoldersFallbackConversion, + _trackingSubFoldersFallbackPreSplitting]) plotter.append(name, folders, TrackingPlotFolder(*algoPlots, **commonForTPF), **common) extendedPlots = [] if building: @@ -1357,6 +1360,7 @@ def _appendPixelTrackingPlots(lastDirName, name): _appendPixelTrackingPlots("PixelTrack", "pixel") _appendPixelTrackingPlots("PixelTrackFromPV", "pixelFromPV") _appendPixelTrackingPlots("PixelTrackFromPVAllTP", "pixelFromPVAllTP") +_appendPixelTrackingPlots("PixelTrackBHadron", "pixelbhadron") # MiniAOD From 15e668c73ca8e858ce11d7fa14ba48cb4894c842 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 28 Nov 2018 18:41:33 +0100 Subject: [PATCH 033/102] Address code style and quality issues (cms-patatrack#203) Cleaned up by clang-tidy 7.0.0. Enabled checks: - boost-use-to-string - misc-uniqueptr-reset-release - modernize-deprecated-headers - modernize-make-shared - modernize-use-bool-literals - modernize-use-equals-delete - modernize-use-nullptr - modernize-use-override - performance-unnecessary-copy-initialization - readability-container-size-empty - readability-redundant-string-cstr - readability-static-definition-in-anonymous-namespace - readability-uniqueptr-delete-release See http://releases.llvm.org/7.0.0/tools/clang/tools/extra/docs/clang-tidy/index.html for details. 
--- .../PixelTrackFitting/test/PixelTrackRiemannFit.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc index b27ed52473388..77b5d1bebe6b6 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc @@ -190,7 +190,7 @@ void test_helix_fit() { gen_par(4) = 10.; // R (p_t) gen_par(5) = 1.; // eta iteration = 1; - return_err = 1; + return_err = true; debug2 = 1; iteration *= 10; From 9e6f88d45ae0cdb054ec1d163a35889e306849f6 Mon Sep 17 00:00:00 2001 From: Marco Rovere Date: Tue, 13 Nov 2018 15:02:45 +0100 Subject: [PATCH 034/102] Recover pre-10.4.x Riemann fit rework (cms-patatrack#190) --- .../PixelTrackFitting/interface/RiemannFit.h | 510 +++++++++++------- 1 file changed, 325 insertions(+), 185 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 6de1c77bbac12..33f8334c8b5a5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -47,7 +47,7 @@ struct circle_fit |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n |cov(X0, R)|cov(Y0, R)|cov( R, R)| - */ + */ int64_t q; //!< particle charge double chi2 = 0.0; }; @@ -78,7 +78,6 @@ struct helix_fit double chi2_line = 0.0; Vector4d fast_fit; int64_t q; //!< particle charge - // VectorXd time; // TO FIX just for profiling } __attribute__((aligned(16))); template @@ -119,31 +118,40 @@ __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) { return a.x() * b.y() - a.y() * b.x(); } +/*! Compute the Radiation length in the uniform hypothesis + * + * The Pixel detector, barrel and forward, is considered as a homogeneous + * cylinder of material, whose radiation length has been derived from the TDR + * plot that shows that 16cm correspond to 0.06 radiation lengths. Therefore + * one radiation length corresponds to 16cm/0.06 =~ 267 cm. All radiation + * lengths are computed using this unique number, in both regions, barrel and + * endcap. + * + * NB: no angle corrections nor projections are computed inside this routine. + * It is therefore the responsibility of the caller to supply the proper + * lengths in input. These lengths are the path travelled by the particle along + * its trajectory, namely the so called S of the helix in 3D space. + * + * \param length_values vector of incremental distances that will be translated + * into radiation length equivalent. Each radiation length i is computed + * incrementally with respect to the previous length i-1. The first length has + * no reference point (i.e. it has the dca). + * + * \return incremental radiation lengths that correspond to each segment. + */ -__host__ __device__ inline void computeRadLenEff(const Vector4d& fast_fit, - const double B, - double & radlen_eff, - double & theta, - bool & in_forward) { - double X_barrel = 0.015; - double X_forward = 0.05; - theta = atan(fast_fit(3)); - // atan returns values in [-pi/2, pi/2], we need [0, pi] - theta = theta < 0. ?
theta + M_PI : theta; - radlen_eff = X_barrel / std::abs(sin(theta)); - in_forward = (theta <= 0.398 or theta >= 2.743); - if (in_forward) - radlen_eff = X_forward / std::abs(cos(theta)); - assert(radlen_eff > 0.); - double p_t = fast_fit(2) * B; - // We have also to correct the radiation lenght in the x-y plane. Since we - // do not know the angle of incidence of the track at this point, we - // arbitrarily set the correction proportional to the inverse of the - // transerse momentum. The cut-off is at 1 Gev, set using a single Muon Pt - // gun and verifying that, at that momentum, not additional correction is, - // in fact, needed. This is an approximation. - if (std::abs(p_t/1.) < 1.) - radlen_eff /= std::abs(p_t/1.); +__host__ __device__ inline +void computeRadLenUniformMaterial(const VectorNd &length_values, + VectorNd & rad_lengths) { + // Radiation length of the pixel detector in the uniform assumption, with + // 0.06 rad_len at 16 cm + const double XX_0 = 16.f/(0.06); +// const double XX_0 = 1000.*16.f/(0.06); + u_int n = length_values.rows(); + rad_lengths(0) = length_values(0)/XX_0; + for (u_int j = 1; j < n; ++j) { + rad_lengths(j) = std::abs(length_values(j)-length_values(j-1))/XX_0; + } } /*! @@ -151,11 +159,29 @@ multiple Coulomb scattering to be used in the line_fit, for the barrel and forward cases. + The input covariance matrix is in the variables s-z, original and + unrotated. + + The multiple scattering component is computed in the usual linear + approximation, using the 3D path which is computed as the square root of + the squared sum of the s and z components passed in. + + Internally a rotation by theta is performed and the covariance matrix + returned is the one in the direction orthogonal to the rotated S3D axis, + i.e. along the rotated Z axis. + + The choice of the rotation is not arbitrary, but derived from the fact that + putting the horizontal axis along the S3D direction allows the usage of the + ordinary least squares fitting techniques with the trivial parametrization y + = mx + q, avoiding the pathological case with m = +/- inf, that would + correspond to the case at eta = 0. */ + __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, const Vector4d& fast_fit, VectorNd const& s_arcs, VectorNd const& z_values, + const double theta, const double B) { #if RFIT_DEBUG @@ -164,47 +190,24 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, u_int n = s_arcs.rows(); double p_t = fast_fit(2) * B; double p_2 = p_t * p_t * (1. + 1.
/ (fast_fit(3) * fast_fit(3))); - double radlen_eff = 0.; - double theta = 0.; - bool in_forward = false; - computeRadLenEff(fast_fit, B, radlen_eff, theta, in_forward); - - const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff; - for (u_int k = 0; k < n; ++k) - { - for (u_int l = k; l < n; ++l) - { - for (u_int i = 0; i < std::min(k, l); ++i) - { -#if RFIT_DEBUG - printf("Scatter_cov_line - B: %f\n", B); - printf("Scatter_cov_line - radlen_eff: %f, p_t: %f, p2: %f\n", radlen_eff, p_t, p_2); - printf("Scatter_cov_line - sig2:%f, theta: %f\n", sig2, theta); - printf("Scatter_cov_line - Adding to element %d, %d value %f\n", n + k, n + l, (s_arcs(k) - s_arcs(i)) * (s_arcs(l) - s_arcs(i)) * sig2 / sqr(sqr(sin(theta)))); -#endif - if (in_forward) { - cov_sz(k, l) += (z_values(k) - z_values(i)) * (z_values(l) - z_values(i)) * sig2 / sqr(sqr(cos(theta))); - cov_sz(l, k) = cov_sz(k, l); - } else { - cov_sz(n + k, n + l) += (s_arcs(k) - s_arcs(i)) * (s_arcs(l) - s_arcs(i)) * sig2 / sqr(sqr(sin(theta))); - cov_sz(n + l, n + k) = cov_sz(n + k, n + l); - } - } - } - } + VectorNd rad_lengths_S(n); + // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html + // Basically, to perform cwise operations on Matrices and Vectors, you need + // to transform them into Array-like objects. + VectorNd S_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); + S_values = S_values.array().sqrt(); + computeRadLenUniformMaterial(S_values, rad_lengths_S); + VectorNd sig2_S(n); + sig2_S = .000225 / p_2 * (1.f + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); #if RFIT_DEBUG Rfit::printIt(&cov_sz, "Scatter_cov_line - cov_sz: "); #endif Matrix2Nd rot = MatrixXd::Zero(2 * n, 2 * n); for (u_int i = 0; i < n; ++i) { - rot(i, i) = cos(theta); - rot(n + i, n + i) = cos(theta); + rot(i, i) = sin(theta); + rot(n + i, n + i) = sin(theta); u_int j = (i + n); - // Signs seem to be wrong for the off-diagonal element, but we are - // inverting x-y in the input vector, since theta is the angle between - // the z axis and the line, and we are putting the s values, which are Y, - // in the first position. A simple sign flip will take care of it. - rot(i, j) = i < j ? sin(theta) : -sin(theta); + rot(i, j) = i < j ? cos(theta) : -cos(theta); } #if RFIT_DEBUG @@ -212,12 +215,23 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, #endif Matrix2Nd tmp = rot*cov_sz*rot.transpose(); - // We are interested only in the errors in the rotated s -axis which, in - // our formalism, are in the upper square matrix. + for (u_int k = 0; k < n; ++k) + { + for (u_int l = k; l < n; ++l) + { + for (u_int i = 0; i < std::min(k, l); ++i) + { + tmp(k + n, l + n) += std::abs(S_values(k) - S_values(i)) * std::abs(S_values(l) - S_values(i)) * sig2_S(i); + tmp(l + n, k + n) = tmp(k + n, l + n); + } + } + } + // We are interested only in the errors orthogonal to the rotated s-axis + // which, in our formalism, are in the lower square matrix. #if RFIT_DEBUG Rfit::printIt(&tmp, "Scatter_cov_line - tmp: "); #endif - return tmp.block(0, 0, n, n); + return tmp.block(n, n, n, n); } /*! @@ -233,13 +247,11 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, \warning input points must be ordered radially from the detector center (from inner layer to outer ones; points on the same layer must ordered too). - \bug currently works only for points in the barrel. 
\details Only the tangential component is computed (the radial one is negligible). */ -// X in input TO FIX __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, const Vector4d& fast_fit, VectorNd const& rad, @@ -248,24 +260,32 @@ __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, u_int n = p2D.cols(); double p_t = fast_fit(2) * B; double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); - double radlen_eff = 0.; - double theta = 0.; - bool in_forward = false; - computeRadLenEff(fast_fit, B, radlen_eff, theta, in_forward); + double theta = atan(fast_fit(3)); + theta = theta < 0. ? theta + M_PI : theta; + VectorNd s_values(n); + VectorNd rad_lengths(n); + const Vector2d o(fast_fit(0), fast_fit(1)); + // associated Jacobian, used in weights and errors computation + for (u_int i = 0; i < n; ++i) + { // x + Vector2d p = p2D.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + const double atan2_ = atan2(cross, dot); + s_values(i) = std::abs(atan2_ * fast_fit(2)); + } + computeRadLenUniformMaterial(s_values*sqrt(1. + 1./(fast_fit(3)*fast_fit(3))), rad_lengths); MatrixNd scatter_cov_rad = MatrixXd::Zero(n, n); - const double sig2 = .000225 / p_2 * sqr(1 + 0.038 * log(radlen_eff)) * radlen_eff; + VectorNd sig2(n); + sig2 = .000225 / p_2 * (1.f + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); for (u_int k = 0; k < n; ++k) { for (u_int l = k; l < n; ++l) { for (u_int i = 0; i < std::min(k, l); ++i) { - if (in_forward) { - scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(cos(theta)); - } else { - scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2 / sqr(sin(theta)); - } + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i) / (sqr(sin(theta))); scatter_cov_rad(l, k) = scatter_cov_rad(k, l); } } @@ -409,23 +429,6 @@ __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) return cov_rad_inv.colwise().sum().transpose(); } -/*! - \brief Compute the points' weights' vector for the line fit (ODR). - Results from a pre-fit is needed in order to take the orthogonal (to the - line) component of the errors. - - \param x_err2 squared errors in the x axis. - \param y_err2 squared errors in the y axis. - \param tan_theta tangent of theta (angle between y axis and line). - - \return weight points' weights' vector for the line fit (ODR). -*/ - -__host__ __device__ inline VectorNd Weight_line(const ArrayNd& x_err2, const ArrayNd& y_err2, const double& tan_theta) -{ - return (1. + sqr(tan_theta)) * 1. / (x_err2 + y_err2 * sqr(tan_theta)); -} - /*! \brief Find particle q considering the sign of cross product between particles velocity (estimated by the first 2 hits) and the vector radius @@ -470,40 +473,6 @@ __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, circle.par = par_pak; } -/*! - \brief Compute the error propagation to obtain the square errors in the - x axis for the line fit. If errors have not been computed in the circle fit - than an'approximation is made. - Further information in attached documentation. - - \param V hits' covariance matrix. - \param circle result of the previous circle fit (only the covariance matrix - is needed) TO FIX - \param J Jacobian of the transformation producing x values. - \param error flag for error computation. - - \return x_err2 squared errors in the x axis. 
-*/ - -__host__ __device__ inline VectorNd X_err2(const Matrix3Nd& V, const circle_fit& circle, const MatrixNx5d& J, - const bool error, u_int n) -{ - VectorNd x_err2(n); - for (u_int i = 0; i < n; ++i) - { - Matrix5d Cov = MatrixXd::Zero(5, 5); - if (error) - Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = V(i, i); - Cov(4, 4) = V(i + n, i + n); - Cov(3, 4) = Cov(4, 3) = V(i, i + n); - Eigen::Matrix tmp; - tmp = J.row(i) * Cov * J.row(i).transpose().eval(); - x_err2(i) = tmp(0, 0); - } - return x_err2; -} - /*! \brief Compute the eigenvector associated to the minimum eigenvalue. @@ -1004,7 +973,7 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, { const double t = 1. / h; J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, - 0, 0, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; + v(0)*v2x2_inv*t, v(1)*v2x2_inv*t, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; } printIt(&J3, "circle_fit - J3:"); @@ -1059,21 +1028,22 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, errors. */ -__host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const circle_fit& circle, - const Vector4d& fast_fit, - const double B, - const bool error = true) +__host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const circle_fit& circle, + const Vector4d& fast_fit, + const double B, + const bool error = true) { u_int n = hits.cols(); + double theta = -circle.q*atan(fast_fit(3)); + theta = theta < 0. ? theta + M_PI : theta; // PROJECTION ON THE CILINDER - Matrix2xNd p2D(2, n); - MatrixNx5d Jx(n, 5); + Matrix2xNd p2D = MatrixXd::Zero(2, n); + Eigen::Matrix Jx; #if RFIT_DEBUG printf("Line_fit - B: %g\n", B); - printIt(&hits, "Line_fit points: "); printIt(&hits_cov, "Line_fit covs: "); #endif @@ -1085,8 +1055,11 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, const Vector2d o(circle.par(0), circle.par(1)); // associated Jacobian, used in weights and errors computation + Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); for (u_int i = 0; i < n; ++i) { // x + Matrix6d Cov = MatrixXd::Zero(6, 6); + Matrix2d Cov_sz_single = MatrixXd::Zero(2, 2); Vector2d p = hits.block(0, i, 2, 1) - o; const double cross = cross2D(-o, p); const double dot = (-o).dot(p); @@ -1095,9 +1068,9 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, const double atan2_ = -circle.q * atan2(cross, dot); p2D(0, i) = atan2_ * circle.par(2); - // associated Jacobian, used in weights and errors computation + // associated Jacobian, used in weights and errors- computation const double temp0 = -circle.q * circle.par(2) * 1. 
/ (sqr(dot) + sqr(cross)); - double d_X0 = 0, d_Y0 = 0, d_R = 0.; // good approximation for big pt and eta + double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta if (error) { d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); @@ -1106,7 +1079,19 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, } const double d_x = temp0 * (o(1) * dot + o(0) * cross); const double d_y = temp0 * (-o(0) * dot + o(1) * cross); - Jx.row(i) << d_X0, d_Y0, d_R, d_x, d_y; + Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; +// Jx << d_X0, d_Y0, d_R, p(1)/p.norm(), -p(0)/p.norm(), 0, 0, 0, 0, 0, 0, 1.; + Cov.block(0, 0, 3, 3) = circle.cov; + Cov(3, 3) = hits_cov(i, i); + Cov(4, 4) = hits_cov(i + n, i + n); + Cov(5, 5) = hits_cov(i + 2*n, i + 2*n); + Cov(3, 4) = Cov(4, 3) = hits_cov(i, i + n); + Cov(3, 5) = Cov(5, 3) = hits_cov(i, i + 2*n); + Cov(4, 5) = Cov(5, 4) = hits_cov(i + n, i + 2*n); + Cov_sz_single = Jx * Cov * Jx.transpose(); + cov_sz(i, i) = Cov_sz_single(0, 0); + cov_sz(i + n, i + n) = Cov_sz_single(1, 1); + cov_sz(i, i + n) = cov_sz(i + n, i) = Cov_sz_single(0, 1); } // Math of d_{X0,Y0,R,x,y} all verified by hand @@ -1114,43 +1099,25 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, p2D.row(1) = hits.row(2); // WEIGHT COMPUTATION - Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); - VectorNd x_err2 = X_err2(hits_cov, circle, Jx, error, n); - VectorNd y_err2 = hits_cov.block(2 * n, 2 * n, n, n).diagonal(); - cov_sz.block(0, 0, n, n) = x_err2.asDiagonal(); - cov_sz.block(n, n, n, n) = y_err2.asDiagonal(); #if RFIT_DEBUG printIt(&cov_sz, "line_fit - cov_sz:"); #endif - MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), B); + MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B); #if RFIT_DEBUG printIt(&cov_with_ms, "line_fit - cov_with_ms: "); #endif - Matrix4d G, G4; - G4 = cov_with_ms.inverse(); + Matrix4d G; + G = cov_with_ms.inverse(); #if RFIT_DEBUG - printIt(&G4, "line_fit - cov_with_ms.inverse():"); + printIt(&G, "line_fit - cov_with_ms.inverse():"); #endif - double renorm = G4.sum(); - G4 *= 1. / renorm; + double renorm = G.sum(); + G *= 1. / renorm; #if RFIT_DEBUG - printIt(&G4, "line_fit - G4:"); + printIt(&G, "line_fit - G4:"); #endif - G = G4; - const VectorNd weight = Weight_circle(G); - - VectorNd err2_inv = cov_with_ms.diagonal(); - err2_inv = err2_inv.cwiseInverse(); -// const VectorNd err2_inv = Weight_line(x_err2, y_err2, fast_fit(3)); -// const VectorNd weight = err2_inv * 1. / err2_inv.sum(); - -#if RFIT_DEBUG - printIt(&x_err2, "Line_fit - x_err2: "); - printIt(&y_err2, "Line_fit - y_err2: "); - printIt(&err2_inv, "Line_fit - err2_inv: "); - printIt(&weight, "Line_fit - weight: "); -#endif + const VectorNd weight = Weight_circle(G); // COST FUNCTION @@ -1162,16 +1129,12 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, const Matrix2xNd X = p2D.colwise() - r0; Matrix2d A = Matrix2d::Zero(); A = X * G * X.transpose(); -// for (u_int i = 0; i < n; ++i) -// { -// A += err2_inv(i) * (X.col(i) * X.col(i).transpose()); -// } #if RFIT_DEBUG printIt(&A, "Line_fit - A: "); #endif - // minimize + // minimize. v is normalized!! double chi2; Vector2d v = min_eigen2D(A, chi2); #if RFIT_DEBUG @@ -1179,7 +1142,6 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, printf("Line_fit chi2: %e\n", chi2); #endif - // n *= (chi2>0) ? 
1 : -1; //TO FIX // This hack to be able to run on GPU where the automatic assignment to a // double from the vector multiplication is not working. Matrix cm; @@ -1189,8 +1151,8 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, // COMPUTE LINE PARAMETER line_fit line; line.par << -v(0) / v(1), // cotan(theta)) - -c * sqrt(sqr(v(0)) + sqr(v(1))) * 1. / v(1); // Zip - line.chi2 = abs(chi2); + -c / v(1); // Zip + line.chi2 = abs(chi2*renorm); #if RFIT_DEBUG printIt(&(line.par), "Line_fit - line.par: "); printf("Line_fit - v norm: %e\n", sqrt(v(0)*v(0) + v(1)*v(1))); @@ -1206,19 +1168,21 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, { // The norm is taken from Chernov, properly adapted to the weights case. double norm = v.transpose() * A * v; - norm /= weight.sum(); +// double norm_empirical = cov_with_ms.diagonal().mean(); #if RFIT_DEBUG + printf("Chi_2: %g\n", chi2); + printf("Norm: %g\n", norm); + printf("weight.sum(): %g\n", weight.sum()); printf("Line_fit - norm: %e\n", norm); #endif - const double sig2 = 1. / (A(0, 0) + A(1, 1)) * norm; + + const double sig2 = norm/(A(0,0) + A(1,1)); C(0, 0) = sig2 * v1_2; C(1, 1) = sig2 * v0_2; - C(0, 1) = C(1, 0) = -sig2 * v(0) * v(1); - const VectorNd weight_2 = (weight).array().square(); - const Vector2d C0(weight_2.dot(x_err2), weight_2.dot(y_err2)); - C.block(0, 2, 2, 1) = C.block(2, 0, 1, 2).transpose() = -C.block(0, 0, 2, 2) * r0; - Matrix tmp = (r0.transpose() * C.block(0, 0, 2, 2) * r0); - C(2, 2) = v0_2 * C0(0) + v1_2 * C0(1) + C0(0) * C(0, 0) + C0(1) * C(1, 1) + tmp(0, 0); + C(1, 0) = C(0, 1) = -sig2 * v(0) * v(1); + C(2, 2) = sig2 * (v(0)*r0(1)-v(1)*r0(0))*(v(0)*r0(1)-v(1)*r0(0)) + (sig2/n)*(A(0,0)+A(1,1)); + C(0, 2) = C(2, 0) = sig2*(v(0)*r0(1)-v(1)*r0(0))*v(1); + C(1, 2) = C(2, 1) = - sig2*(v(0)*r0(1)-v(1)*r0(0))*v(0); } #if RFIT_DEBUG printIt(&C, "line_fit - C:"); @@ -1228,9 +1192,7 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, { const double t0 = 1. / v(1); const double t1 = sqr(t0); - const double sqrt_ = sqrt(v1_2 + v0_2); - const double t2 = 1. / sqrt_; - J << -t0, v(0) * t1, 0, -c * v(0) * t0 * t2, v0_2 * c * t1 * t2, -sqrt_ * t0; + J << -t0, v(0) * t1, 0., 0., c * t1, -t0; } Matrix JT = J.transpose().eval(); #if RFIT_DEBUG @@ -1245,6 +1207,184 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, return line; } +/*! \brief Perform an ordinary least square fit in the s-z plane to compute + * the parameters cotTheta and Zip. + * + * The fit is performed in the rotated S3D-Z' plane, following the formalism of + * Frodesen, Chapter 10, p. 259. + * + * The system has been rotated to both try to use the combined errors in s-z + * along Z', as errors in the Y direction and to avoid the patological case of + * degenerate lines with angular coefficient m = +/- inf. + * + * The rotation is using the information on the theta angle computed in the + * fast fit. The rotation is such that the S3D axis will be the X-direction, + * while the rotated Z-axis will be the Y-direction. This pretty much follows + * what is done in the same fit in the Broken Line approach. + */ + +__host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const circle_fit& circle, + const Vector4d& fast_fit, + const double B, + const bool error = true) { + auto n = hits.cols(); + double theta = -circle.q*atan(fast_fit(3)); + theta = theta < 0. ? 
theta + M_PI : theta; + + // PROJECTION ON THE CYLINDER + // + // p2D will be: + // [s1, s2, s3, ..., sn] + // [z1, z2, z3, ..., zn] + // s values will be ordinary x-values + // z values will be ordinary y-values + + Matrix2xNd p2D(2, n); + Eigen::Matrix Jx; + + p2D << MatrixXd::Zero(2, n); + Jx << MatrixXd::Zero(2, 6); + +#if RFIT_DEBUG + printf("Line_fit - B: %g\n", B); + printIt(&hits, "Line_fit points: "); + printIt(&hits_cov, "Line_fit covs: "); +#endif + // x & associated Jacobian + // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf + // Slide 11 + // a ==> -o i.e. the origin of the circle in XY plane, negative + // b ==> p i.e. distances of the points wrt the origin of the circle. + const Vector2d o(circle.par(0), circle.par(1)); + + // associated Jacobian, used in weights and errors computation + Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); + Matrix6d Cov(6,6); + Matrix2d Cov_sz_single(2, 2); + for (u_int i = 0; i < n; ++i) + { + Vector2d p = hits.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + // atan2(cross, dot) gives back the angle in the transverse plane so that the + // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) + const double atan2_ = -circle.q * atan2(cross, dot); +// p2D.coeffRef(1, i) = atan2_ * circle.par(2); + p2D(0, i) = atan2_ * circle.par(2); + + // associated Jacobian, used in weights and errors computation + const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); + double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta + if (error) + { + d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); + d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); + d_R = atan2_; + } + const double d_x = temp0 * (o(1) * dot + o(0) * cross); + const double d_y = temp0 * (-o(0) * dot + o(1) * cross); + Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; + + Cov << MatrixXd::Zero(6, 6); + Cov_sz_single << MatrixXd::Zero(2, 2); + Cov.block(0, 0, 3, 3) = circle.cov; + Cov(3, 3) = hits_cov(i, i); // x errors + Cov(4, 4) = hits_cov(i + n, i + n); // y errors + Cov(5, 5) = hits_cov(i + 2*n, i + 2*n); // z errors + Cov(3, 4) = Cov(4, 3) = hits_cov(i, i + n); // cov_xy + Cov(3, 5) = Cov(5, 3) = hits_cov(i, i + 2*n); // cov_xz + Cov(4, 5) = Cov(5, 4) = hits_cov(i + n, i + 2*n); // cov_yz + Cov_sz_single = Jx * Cov * Jx.transpose(); + cov_sz(i, i) = Cov_sz_single(0, 0); + cov_sz(i + n, i + n) = Cov_sz_single(1, 1); + cov_sz(i, i + n) = cov_sz(i + n, i) = Cov_sz_single(0, 1); + } + // Math of d_{X0,Y0,R,x,y} all verified by hand + p2D.row(1) = hits.row(2); + + // The following matrix will contain errors orthogonal to the rotated S + // component only, with the Multiple Scattering properly treated!!
+ MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B); +#if RFIT_DEBUG + printIt(&cov_sz, "line_fit - cov_sz:"); + printIt(&cov_with_ms, "line_fit - cov_with_ms: "); +#endif + + // Prepare the Rotation Matrix to rotate the points + Eigen::Matrix rot = Eigen::Matrix::Zero(); + rot << sin(theta), cos(theta), -cos(theta), sin(theta); + + // Rotate Points with the shape [2, n] + Matrix2xNd p2D_rot = rot*p2D; + +#if RFIT_DEBUG + printf("Fast fit Tan(theta): %g\n", fast_fit(3)); + printf("Rotation angle: %g\n", theta); + printIt(&rot, "Rotation Matrix:"); + printIt(&p2D, "Original Hits(s,z):"); + printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); + printIt(&rot, "Rotation Matrix:"); +#endif + + // Build the A Matrix + Matrix2xNd A(2,n); + A << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values + +#if RFIT_DEBUG + printIt(&A, "A Matrix:"); +#endif + + // Build A^T V-1 A, where V-1 is the covariance of only the Y components. + MatrixNd Vy_inv = cov_with_ms.inverse(); + Eigen::Matrix Inv_Cov = A*Vy_inv*A.transpose(); + + // Compute the Covariance Matrix of the fit parameters + Eigen::Matrix Cov_params = Inv_Cov.inverse(); + + // Now Compute the Parameters in the form [2,1] + // The first component is q. + // The second component is m. + Eigen::Matrix sol = Cov_params*A*Vy_inv*p2D_rot.row(1).transpose(); + + +#if RFIT_DEBUG + printIt(&sol, "Rotated solutions:"); +#endif + + // We need now to transfer back the results in the original s-z plane + auto common_factor = 1./(sin(theta)-sol(1,0)*cos(theta)); + Matrix J = Matrix::Zero(); + J << 0., common_factor*common_factor, common_factor, sol(0,0)*cos(theta)*common_factor*common_factor; + + double m = common_factor*(sol(1,0)*sin(theta)+cos(theta)); + double q = common_factor*sol(0,0); + auto cov_mq = J * Cov_params * J.transpose(); + + VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; + double chi2 = res.transpose()*Vy_inv*res; + chi2 = chi2 / float(n); + + line_fit line; + line.par << m, q; + line.cov << cov_mq; + line.chi2 = chi2; + +#if RFIT_DEBUG + printf("Common_factor: %g\n", common_factor); + printIt(&J, "Jacobian:"); + printIt(&sol, "Rotated solutions:"); + printIt(&Cov_params, "Cov_params:"); + printIt(&cov_mq, "Rotated Covariance Matrix:"); + printIt(&(line.par), "Real Parameters:"); + printIt(&(line.cov), "Real Covariance Matrix:"); + printf("Chi2: %g\n", chi2); +#endif + + return line; +} + /*! \brief Helix fit by three step: -fast pre-fit (see Fast_fit() for further info); \n From 917c412976a7ae01a2fd0424c1dcabae469488a2 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 8 Jan 2019 18:29:08 +0100 Subject: [PATCH 035/102] Full workflow from raw data to pixel tracks and vertices on GPUs (cms-patatrack#216) Port and optimise the full workflow from pixel raw data to pixel tracks and vertices to GPUs. Clean the pixel n-tuplets with the "fishbone" algorithm (only on GPUs). Other changes: - recover the Riemann fit updates lost during the merge with CMSSW 10.4.x; - speed up clustering and track fitting; - minor bug fix to avoid trivial regression with the optimized fit. 
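The optimised fit kept by this patch uses the rotated s-z formalism introduced above: the hits are projected to (s, z), rotated by the fast-fit angle theta, fitted with a plain weighted least squares, and the solution is mapped back to (cot(theta), Zip). A standalone sketch of that solve, with made-up points and a diagonal weight matrix standing in for the full multiple-scattering covariance:

#include <Eigen/Dense>
#include <cmath>
#include <cstdio>

int main() {
  // Made-up (s, z) coordinates of four hits already projected onto the cylinder.
  Eigen::Matrix<double, 2, 4> p2D;
  p2D << 3.0, 7.1, 11.9, 16.2,   // s values
         1.0, 2.4,  4.0,  5.5;   // z values

  const double theta = 0.9;      // rotation angle from the fast fit (invented)
  Eigen::Matrix2d rot;
  rot << std::sin(theta), std::cos(theta),
        -std::cos(theta), std::sin(theta);

  // Rotate into the (S3D, Z') frame so the fitted line is far from vertical.
  const Eigen::Matrix<double, 2, 4> p2D_rot = rot * p2D;

  // Design matrix A = [1; s_rot] and a diagonal stand-in for the inverse covariance.
  Eigen::Matrix<double, 2, 4> A;
  A << Eigen::Matrix<double, 1, 4>::Ones(), p2D_rot.row(0);
  const Eigen::Matrix4d Vy_inv = Eigen::Vector4d(1.0, 1.0, 0.5, 0.5).asDiagonal();

  // Weighted normal equations: cov = (A V^-1 A^T)^-1, sol = cov * A * V^-1 * z'.
  const Eigen::Matrix2d cov_params = (A * Vy_inv * A.transpose()).inverse();
  const Eigen::Vector2d sol = cov_params * A * Vy_inv * p2D_rot.row(1).transpose();

  // Map (intercept, slope) back to the original s-z frame: (Zip, cot(theta)).
  const double common = 1.0 / (std::sin(theta) - sol(1) * std::cos(theta));
  const double cotan = common * (sol(1) * std::sin(theta) + std::cos(theta));
  const double zip = common * sol(0);

  std::printf("cot(theta) = %f, Zip = %f\n", cotan, zip);
}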
--- .../customizePixelTracksForProfiling.py | 11 +- .../PixelTrackFitting/interface/FitResult.h | 112 +++ .../PixelTrackFitting/interface/RiemannFit.h | 887 ++++++------------ .../PixelTrackFitting/plugins/BuildFile.xml | 3 + .../plugins/PixelTrackProducer.cc | 59 +- .../plugins/PixelTrackProducer.h | 2 - .../PixelTrackFitting/plugins/storeTracks.h | 77 ++ .../python/PixelTracks_cff.py | 7 + .../PixelTrackFitting/test/BuildFile.xml | 36 +- .../test/PixelTrackRiemannFit.cc | 423 ++++++--- .../PixelTrackFitting/test/testEigenGPU.cu | 313 +++--- .../test/testEigenJacobian.cpp | 94 ++ .../PixelTrackFitting/test/testRiemannFit.cpp | 88 ++ .../PixelTrackFitting/test/test_common.h | 11 +- .../PixelTriplets/interface/CircleEq.h | 128 +++ .../PixelTriplets/plugins/BuildFile.xml | 3 +- .../PixelTriplets/plugins/CAConstants.h | 34 + .../PixelTriplets/plugins/GPUCACell.h | 147 +-- .../PixelTriplets/plugins/RiemannFitOnGPU.cc | 35 + .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 195 ++++ .../PixelTriplets/plugins/RiemannFitOnGPU.h | 60 ++ .../PixelTriplets/plugins/gpuFishbone.h | 93 ++ .../PixelTriplets/plugins/gpuPixelDoublets.h | 15 +- .../python/caHitQuadrupletEDProducer_cfi.py | 4 - .../PixelTriplets/test/BuildFile.xml | 1 + .../PixelTriplets/test/CircleEq_t.cpp | 99 ++ 26 files changed, 1836 insertions(+), 1101 deletions(-) create mode 100644 RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp create mode 100644 RecoPixelVertexing/PixelTriplets/interface/CircleEq.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h create mode 100644 RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 99a3a9321062b..15224adb78cc3 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -6,6 +6,7 @@ def customizePixelTracksForProfiling(process): process.out = cms.OutputModule("AsciiOutputModule", outputCommands = cms.untracked.vstring( "keep *_pixelTracks_*_*", + "keep *_pixelVertices_*_*", ), verbosity = cms.untracked.uint32(0), ) @@ -19,17 +20,12 @@ def customizePixelTracksForProfiling(process): def customizePixelTracksForProfilingDisableConversion(process): process = customizePixelTracksForProfiling(process) - # Turn off cluster shape filter so that CA doesn't depend on clusters - process.pixelTracksHitQuadruplets.SeedComparitorPSet = cms.PSet(ComponentName = cms.string("none")) - - # Replace pixel track producer with a dummy one for now - from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromCUDA_cfi import pixelTrackProducerFromCUDA as _pixelTrackProducerFromCUDA - process.pixelTracks = _pixelTrackProducerFromCUDA.clone() - # Disable conversions to legacy process.siPixelClustersPreSplitting.gpuEnableConversion = False 
process.siPixelRecHitsPreSplitting.gpuEnableConversion = False process.pixelTracksHitQuadruplets.gpuEnableConversion = False + process.pixelTracks.gpuEnableConversion = False + process.pixelVertices.gpuEnableConversion = False return process @@ -40,5 +36,6 @@ def customizePixelTracksForProfilingDisableTransfer(process): process.siPixelClustersPreSplitting.gpuEnableTransfer = False process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False process.pixelTracksHitQuadruplets.gpuEnableTransfer = False + process.pixelVertices.gpuEnableTransfer = False return process diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h new file mode 100644 index 0000000000000..ba0f0aa13e1a6 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h @@ -0,0 +1,112 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h +#define RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h + +#include + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +namespace Rfit +{ + +constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) +constexpr unsigned int max_nop = 4; //!< In order to avoid use of dynamic memory + + +using VectorXd = Eigen::VectorXd; +using MatrixXd = Eigen::MatrixXd; +template +using MatrixNd = Eigen::Matrix; +template +using ArrayNd = Eigen::Array; +template +using Matrix2Nd = Eigen::Matrix; +template +using Matrix3Nd = Eigen::Matrix; +template +using Matrix2xNd = Eigen::Matrix; +template +using Array2xNd = Eigen::Array; +template +using Matrix3xNd = Eigen::Matrix; +template +using MatrixNx3d = Eigen::Matrix; +template +using MatrixNx5d = Eigen::Matrix; +template +using VectorNd = Eigen::Matrix; +template +using Vector2Nd = Eigen::Matrix; +template +using Vector3Nd = Eigen::Matrix; +template +using RowVectorNd = Eigen::Matrix; +template +using RowVector2Nd = Eigen::Matrix; + + + +using Vector2d = Eigen::Vector2d; +using Vector3d = Eigen::Vector3d; +using Vector4d = Eigen::Vector4d; +using Matrix2d = Eigen::Matrix2d; +using Matrix3d = Eigen::Matrix3d; +using Matrix4d = Eigen::Matrix4d; +using Matrix5d = Eigen::Matrix; +using Matrix6d = Eigen::Matrix; +using Vector5d = Eigen::Matrix; + +using Matrix3f = Eigen::Matrix3f; +using Vector3f = Eigen::Vector3f; +using Vector4f = Eigen::Vector4f; +using Vector6f = Eigen::Matrix; + +using u_int = unsigned int; + + +struct circle_fit +{ + Vector3d par; //!< parameter: (X0,Y0,R) + Matrix3d cov; + /*!< covariance matrix: \n + |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n + |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n + |cov(X0, R)|cov(Y0, R)|cov( R, R)| + */ + int32_t q; //!< particle charge + float chi2 = 0.0; +}; + +struct line_fit +{ + Vector2d par; //!<(cotan(theta),Zip) + Matrix2d cov; + /*!< + |cov(c_t,c_t)|cov(Zip,c_t)| \n + |cov(c_t,Zip)|cov(Zip,Zip)| + */ + double chi2 = 0.0; +}; + +struct helix_fit +{ + Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) + Matrix5d cov; + /*!< ()->cov() \n + |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n + |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n + |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n + |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n + |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| + */ + float chi2_circle; + float chi2_line; +// Vector4d fast_fit; + int32_t q; //!< particle charge +} __attribute__((aligned(16))); + +} // namespace RFit +#endif diff --git 
a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 33f8334c8b5a5..3e93aab13d00d 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -1,89 +1,16 @@ #ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h #define RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h -#include +#include "FitResult.h" -#include -#include -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" - -#ifndef RFIT_DEBUG -#define RFIT_DEBUG 0 -#endif // RFIT_DEBUG namespace Rfit { -using namespace Eigen; - -constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) -constexpr unsigned int max_nop = 4; //!< In order to avoid use of dynamic memory - -using MatrixNd = Eigen::Matrix; -using ArrayNd = Eigen::Array; -using Matrix2Nd = Eigen::Matrix; -using Matrix3Nd = Eigen::Matrix; -using Matrix2xNd = Eigen::Matrix; -using Array2xNd = Eigen::Array; -using Matrix3xNd = Eigen::Matrix; -using MatrixNx3d = Eigen::Matrix; -using MatrixNx5d = Eigen::Matrix; -using VectorNd = Eigen::Matrix; -using Vector2Nd = Eigen::Matrix; -using Vector3Nd = Eigen::Matrix; -using RowVectorNd = Eigen::Matrix; -using RowVector2Nd = Eigen::Matrix; -using Matrix5d = Eigen::Matrix; -using Matrix6d = Eigen::Matrix; -using Vector5d = Eigen::Matrix; -using u_int = unsigned int; - -struct circle_fit -{ - Vector3d par; //!< parameter: (X0,Y0,R) - Matrix3d cov; - /*!< covariance matrix: \n - |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n - |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n - |cov(X0, R)|cov(Y0, R)|cov( R, R)| - */ - int64_t q; //!< particle charge - double chi2 = 0.0; -}; - -struct line_fit -{ - Vector2d par; //!<(cotan(theta),Zip) - Matrix2d cov; - /*!< - |cov(c_t,c_t)|cov(Zip,c_t)| \n - |cov(c_t,Zip)|cov(Zip,Zip)| - */ - double chi2 = 0.0; -}; - -struct helix_fit -{ - Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) - Matrix5d cov; - /*!< ()->cov() \n - |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n - |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n - |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n - |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n - |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| - */ - double chi2_circle = 0.0; - double chi2_line = 0.0; - Vector4d fast_fit; - int64_t q; //!< particle charge -} __attribute__((aligned(16))); template __host__ __device__ void printIt(C* m, const char* prefix = "") { -#if RFIT_DEBUG +#ifdef RFIT_DEBUG for (u_int r = 0; r < m->rows(); ++r) { for (u_int c = 0; c < m->cols(); ++c) @@ -106,10 +33,8 @@ __host__ __device__ inline T sqr(const T a) /*! \brief Compute cross product of two 2D vector (assuming z component 0), returning z component of the result. - \param a first 2D vector in the product. \param b second 2D vector in the product. - \return z component of the cross product. */ @@ -118,6 +43,72 @@ __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) return a.x() * b.y() - a.y() * b.x(); } +/*! 
+ * load error in CMSSW format to our formalism + * + */ + template + __host__ __device__ void loadCovariance2D(M6x4f const & ge, M2Nd & hits_cov) { + // Index numerology: + // i: index of the hits/point (0,..,3) + // j: index of space component (x,y,z) + // l: index of space components (x,y,z) + // ge is always in sync with the index i and is formatted as: + // ge[] ==> [xx, xy, yy, xz, yz, zz] + // in (j,l) notation, we have: + // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] + // so the index ge_idx corresponds to the matrix elements: + // | 0 1 3 | + // | 1 2 4 | + // | 3 4 5 | + constexpr uint32_t hits_in_fit = 4; // Fixme + for (uint32_t i=0; i< hits_in_fit; ++i) { + auto ge_idx = 0; auto j=0; auto l=0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 2; j=1; l=1; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 1; j=1; l=0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + } + + template + __host__ __device__ void loadCovariance(M6x4f const & ge, Matrix3Nd & hits_cov) { + + // Index numerology: + // i: index of the hits/point (0,..,3) + // j: index of space component (x,y,z) + // l: index of space components (x,y,z) + // ge is always in sync with the index i and is formatted as: + // ge[] ==> [xx, xy, yy, xz, yz, zz] + // in (j,l) notation, we have: + // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] + // so the index ge_idx corresponds to the matrix elements: + // | 0 1 3 | + // | 1 2 4 | + // | 3 4 5 | + constexpr uint32_t hits_in_fit = 4; // Fixme + for (uint32_t i=0; i __host__ __device__ inline -void computeRadLenUniformMaterial(const VectorNd &length_values, - VectorNd & rad_lengths) { +void computeRadLenUniformMaterial(const VNd1 &length_values, + VNd2 & rad_lengths) { // Radiation length of the pixel detector in the uniform assumption, with // 0.06 rad_len at 16 cm - const double XX_0 = 16.f/(0.06); + constexpr double XX_0_inv = 0.06/16.; // const double XX_0 = 1000.*16.f/(0.06); u_int n = length_values.rows(); - rad_lengths(0) = length_values(0)/XX_0; + rad_lengths(0) = length_values(0)*XX_0_inv; for (u_int j = 1; j < n; ++j) { - rad_lengths(j) = std::abs(length_values(j)-length_values(j-1))/XX_0; + rad_lengths(j) = std::abs(length_values(j)-length_values(j-1))*XX_0_inv; } } @@ -158,18 +151,14 @@ void computeRadLenUniformMaterial(const VectorNd &length_values, \brief Compute the covariance matrix along cartesian S-Z of points due to multiple Coulomb scattering to be used in the line_fit, for the barrel and forward cases. - The input covariance matrix is in the variables s-z, original and unrotated. - The multiple scattering component is computed in the usual linear approximation, using the 3D path which is computed as the squared root of the squared sum of the s and z components passed in. - Internally a rotation by theta is performed and the covariance matrix returned is the one in the direction orthogonal to the rotated S3D axis, i.e. along the rotated Z axis. - The choice of the rotation is not arbitrary, but derived from the fact that putting the horizontal axis along the S3D direction allows the usage of the ordinary least squared fitting techiques with the trivial parametrization y @@ -177,44 +166,39 @@ void computeRadLenUniformMaterial(const VectorNd &length_values, correspond to the case at eta = 0. 
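Stepping back to the loadCovariance2D / loadCovariance helpers introduced earlier in this hunk: they expand the packed per-hit error array ge = [xx, xy, yy, xz, yz, zz] into the full hit covariance used by the fits. A standalone sketch of that indexing for one hypothetical hit (all values invented):

#include <Eigen/Dense>
#include <cstdio>

int main() {
  // Packed symmetric error of one hit, in the order [xx, xy, yy, xz, yz, zz]
  // (invented values; in the real code this is one column of the 6xN "ge" matrix).
  Eigen::Matrix<float, 6, 1> ge;
  ge << 1.0e-4f, 2.0e-6f, 9.0e-5f, 0.f, 0.f, 4.0e-4f;

  // Expand into a full symmetric 3x3 matrix following the index table
  // documented in loadCovariance2D / loadCovariance:
  //   | 0 1 3 |
  //   | 1 2 4 |
  //   | 3 4 5 |
  Eigen::Matrix3f cov;
  cov << ge(0), ge(1), ge(3),
         ge(1), ge(2), ge(4),
         ge(3), ge(4), ge(5);

  // The 2D circle fit only needs the upper-left x-y block.
  std::printf("sigma_x^2 = %g, sigma_y^2 = %g, cov_xy = %g\n",
              cov(0, 0), cov(1, 1), cov(0, 1));
}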
*/ -__host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, - const Vector4d& fast_fit, - VectorNd const& s_arcs, - VectorNd const& z_values, - const double theta, - const double B) + template +__host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, + const V4& fast_fit, + VNd1 const& s_arcs, + VNd2 const& z_values, + const double theta, + const double B, + MatrixNd& ret) { -#if RFIT_DEBUG +#ifdef RFIT_DEBUG Rfit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); #endif - u_int n = s_arcs.rows(); - double p_t = fast_fit(2) * B; + constexpr auto n = N; + double p_t = std::min(20.,fast_fit(2) * B); // limit pt to avoid too small error!!! double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); - VectorNd rad_lengths_S(n); + VectorNd rad_lengths_S; // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html // Basically, to perform cwise operations on Matrices and Vectors, you need // to transform them into Array-like objects. - VectorNd S_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); + VectorNd S_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); S_values = S_values.array().sqrt(); computeRadLenUniformMaterial(S_values, rad_lengths_S); - VectorNd sig2_S(n); - sig2_S = .000225 / p_2 * (1.f + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); -#if RFIT_DEBUG - Rfit::printIt(&cov_sz, "Scatter_cov_line - cov_sz: "); + VectorNd sig2_S; + sig2_S = .000225 / p_2 * (1. + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); +#ifdef RFIT_DEBUG + Rfit::printIt(cov_sz, "Scatter_cov_line - cov_sz: "); #endif - Matrix2Nd rot = MatrixXd::Zero(2 * n, 2 * n); - for (u_int i = 0; i < n; ++i) { - rot(i, i) = sin(theta); - rot(n + i, n + i) = sin(theta); - u_int j = (i + n); - rot(i, j) = i < j ? cos(theta) : -cos(theta); - } - -#if RFIT_DEBUG - Rfit::printIt(&rot, "Scatter_cov_line - rot: "); -#endif - - Matrix2Nd tmp = rot*cov_sz*rot.transpose(); + Matrix2Nd tmp = Matrix2Nd::Zero(); + for (u_int k = 0; k < n; ++k) { + tmp(k, k) = cov_sz[k](0, 0); + tmp(k + n, k + n) = cov_sz[k](1, 1); + tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1); + } for (u_int k = 0; k < n; ++k) { for (u_int l = k; l < n; ++l) @@ -222,48 +206,44 @@ __host__ __device__ inline MatrixNd Scatter_cov_line(Matrix2Nd& cov_sz, for (u_int i = 0; i < std::min(k, l); ++i) { tmp(k + n, l + n) += std::abs(S_values(k) - S_values(i)) * std::abs(S_values(l) - S_values(i)) * sig2_S(i); - tmp(l + n, k + n) = tmp(k + n, l + n); } + tmp(l + n, k + n) = tmp(k + n, l + n); } } // We are interested only in the errors orthogonal to the rotated s-axis // which, in our formalism, are in the lower square matrix. -#if RFIT_DEBUG +#ifdef RFIT_DEBUG Rfit::printIt(&tmp, "Scatter_cov_line - tmp: "); #endif - return tmp.block(n, n, n, n); + ret = tmp.block(n, n, n, n); } /*! \brief Compute the covariance matrix (in radial coordinates) of points in the transverse plane due to multiple Coulomb scattering. - \param p2D 2D points in the transverse plane. \param fast_fit fast_fit Vector4d result of the previous pre-fit structured in this form:(X0, Y0, R, Tan(Theta))). \param B magnetic field use to compute p - \return scatter_cov_rad errors due to multiple scattering. - \warning input points must be ordered radially from the detector center (from inner layer to outer ones; points on the same layer must ordered too). - \details Only the tangential component is computed (the radial one is negligible). 
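Both scattering routines share the same material model: the path between two hits is turned into a radiation-length fraction (0.06 X0 per 16 cm, the uniform-material assumption of computeRadLenUniformMaterial) and then into an angular variance through the 0.000225/p^2 * (1 + 0.038 ln(x/X0))^2 * (x/X0) expression, 0.000225 being (15 MeV)^2. A standalone numeric sketch with invented kinematics:

#include <cmath>
#include <cstdio>

int main() {
  // Invented kinematics and path length between two consecutive hits.
  const double p2 = 1.0 * 1.0;   // squared momentum, GeV^2 (assumed)
  const double path = 5.0;       // 3D path length between the hits, cm (assumed)

  // Uniform-material assumption used in computeRadLenUniformMaterial:
  // 16 cm of pixel detector correspond to 0.06 radiation lengths.
  const double xx_0_inv = 0.06 / 16.0;
  const double rad_len = path * xx_0_inv;   // fraction of X0 traversed

  // Multiple-scattering angular variance, as in Scatter_cov_rad / Scatter_cov_line
  // (0.000225 = 0.015^2, i.e. a 15 MeV multiple-scattering constant).
  const double log_term = 1.0 + 0.038 * std::log(rad_len);
  const double sig2 = 0.000225 / p2 * log_term * log_term * rad_len;

  std::printf("x/X0 = %g  ->  sig2(theta_ms) = %g rad^2\n", rad_len, sig2);
}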
- */ -__host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, - const Vector4d& fast_fit, - VectorNd const& rad, - double B) + template + __host__ __device__ inline MatrixNd Scatter_cov_rad(const M2xN& p2D, + const V4& fast_fit, + VectorNd const& rad, + double B) { - u_int n = p2D.cols(); - double p_t = fast_fit(2) * B; + u_int n = N; + double p_t = std::min(20.,fast_fit(2) * B); // limit pt to avoid too small error!!! double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); double theta = atan(fast_fit(3)); theta = theta < 0. ? theta + M_PI : theta; - VectorNd s_values(n); - VectorNd rad_lengths(n); + VectorNd s_values; + VectorNd rad_lengths; const Vector2d o(fast_fit(0), fast_fit(1)); // associated Jacobian, used in weights and errors computation @@ -276,21 +256,21 @@ __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, s_values(i) = std::abs(atan2_ * fast_fit(2)); } computeRadLenUniformMaterial(s_values*sqrt(1. + 1./(fast_fit(3)*fast_fit(3))), rad_lengths); - MatrixNd scatter_cov_rad = MatrixXd::Zero(n, n); - VectorNd sig2(n); - sig2 = .000225 / p_2 * (1.f + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); + MatrixNd scatter_cov_rad = MatrixNd::Zero(); + VectorNd sig2 = (1. + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); + sig2 *= 0.000225 / ( p_2 * sqr(sin(theta)) ); for (u_int k = 0; k < n; ++k) { for (u_int l = k; l < n; ++l) { for (u_int i = 0; i < std::min(k, l); ++i) { - scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i) / (sqr(sin(theta))); - scatter_cov_rad(l, k) = scatter_cov_rad(k, l); + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i); } + scatter_cov_rad(l, k) = scatter_cov_rad(k, l); } } -#if RFIT_DEBUG +#ifdef RFIT_DEBUG Rfit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); #endif return scatter_cov_rad; @@ -299,24 +279,23 @@ __host__ __device__ inline MatrixNd Scatter_cov_rad(const Matrix2xNd& p2D, /*! \brief Transform covariance matrix from radial (only tangential component) to Cartesian coordinates (only transverse plane component). - \param p2D 2D points in the transverse plane. \param cov_rad covariance matrix in radial coordinate. - \return cov_cart covariance matrix in Cartesian coordinates. 
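In practice the transformation fans each tangential covariance entry out over the x-x, y-y and x-y blocks through the direction cosines y/r and x/r of the two points involved. A one-pair sketch with invented numbers (the full implementation follows below):

#include <Eigen/Dense>
#include <cstdio>

int main() {
  // Two made-up points in the transverse plane and their common tangential covariance.
  const Eigen::Vector2d pi(3.0, 4.0), pj(6.0, 8.0);
  const double cov_rad_ij = 2.5e-5;   // tangential covariance entry (assumed)

  const double ri = pi.norm(), rj = pj.norm();

  // Same index gymnastics as cov_radtocart, written out for a single (i, j) pair:
  const double cxx =  cov_rad_ij * (pi.y() / ri) * (pj.y() / rj);
  const double cyy =  cov_rad_ij * (pi.x() / ri) * (pj.x() / rj);
  const double cxy = -cov_rad_ij * (pi.y() / ri) * (pj.x() / rj);
  const double cyx = -cov_rad_ij * (pi.x() / ri) * (pj.y() / rj);

  std::printf("xx = %g  yy = %g  xy = %g  yx = %g\n", cxx, cyy, cxy, cyx);
}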
*/ -__host__ __device__ inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, - const MatrixNd& cov_rad, - const VectorNd& rad) + template + __host__ __device__ inline Matrix2Nd cov_radtocart(const M2xN& p2D, + const MatrixNd& cov_rad, + const VectorNd& rad) { -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("Address of p2D: %p\n", &p2D); #endif printIt(&p2D, "cov_radtocart - p2D:"); u_int n = p2D.cols(); - Matrix2Nd cov_cart = MatrixXd::Zero(2 * n, 2 * n); - VectorNd rad_inv = rad.cwiseInverse(); + Matrix2Nd cov_cart = Matrix2Nd::Zero(); + VectorNd rad_inv = rad.cwiseInverse(); printIt(&rad_inv, "cov_radtocart - rad_inv:"); for (u_int i = 0; i < n; ++i) { @@ -326,7 +305,6 @@ __host__ __device__ inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); - cov_cart(j, i) = cov_cart(i, j); cov_cart(j + n, i + n) = cov_cart(i + n, j + n); cov_cart(j + n, i) = cov_cart(i, j + n); @@ -341,29 +319,27 @@ __host__ __device__ inline Matrix2Nd cov_radtocart(const Matrix2xNd& p2D, transverse plane component) to radial coordinates (both radial and tangential component but only diagonal terms, correlation between different point are not managed). - \param p2D 2D points in transverse plane. \param cov_cart covariance matrix in Cartesian coordinates. - \return cov_rad covariance matrix in raidal coordinate. - \warning correlation between different point are not computed. */ -__host__ __device__ inline MatrixNd cov_carttorad(const Matrix2xNd& p2D, - const Matrix2Nd& cov_cart, - const VectorNd& rad) + template + __host__ __device__ inline VectorNd cov_carttorad(const M2xN& p2D, + const Matrix2Nd& cov_cart, + const VectorNd& rad) { u_int n = p2D.cols(); - MatrixNd cov_rad = MatrixXd::Zero(n, n); - const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); + VectorNd cov_rad; + const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); for (u_int i = 0; i < n; ++i) { //!< in case you have (0,0) to avoid dividing by 0 radius if (rad(i) < 1.e-4) - cov_rad(i, i) = cov_cart(i, i); + cov_rad(i) = cov_cart(i, i); else { - cov_rad(i, i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); + cov_rad(i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); } } return cov_rad; @@ -374,28 +350,25 @@ __host__ __device__ inline MatrixNd cov_carttorad(const Matrix2xNd& p2D, transverse plane component) to coordinates system orthogonal to the pre-fitted circle in each point. Further information in attached documentation. - \param p2D 2D points in transverse plane. \param cov_cart covariance matrix in Cartesian coordinates. \param fast_fit fast_fit Vector4d result of the previous pre-fit structured in this form:(X0, Y0, R, tan(theta))). - \return cov_rad covariance matrix in the pre-fitted circle's orthogonal system. 
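Per point this amounts to building tan_c from the hit position and its lever arm with respect to the pre-fitted centre, then collapsing the Cartesian 2x2 error onto the direction orthogonal to the circle. A single-point sketch with invented numbers (the lever-arm definition is an assumption taken from the full source, since the hunk elides it):

#include <Eigen/Dense>
#include <cstdio>

int main() {
  // One made-up hit, the centre from the fast pre-fit, and its 2x2 Cartesian error.
  const Eigen::Vector2d a(3.0, 4.0);        // hit position (assumed)
  const Eigen::Vector2d centre(1.0, 1.5);   // pre-fitted circle centre (assumed)
  const double cxx = 1.0e-4, cyy = 9.0e-5, cxy = 2.0e-6;   // invented errors

  // b is taken here as the lever arm from the pre-fitted centre to the hit.
  const Eigen::Vector2d b = a - centre;
  const double x2 = a.dot(b);
  const double y2 = a.x() * b.y() - a.y() * b.x();   // 2D cross product
  const double tan_c = -y2 / x2;
  const double tan_c2 = tan_c * tan_c;

  // Variance along the direction orthogonal to the pre-fitted circle at this point.
  const double cov_rad =
      1.0 / (1.0 + tan_c2) * (cxx + cyy * tan_c2 + 2.0 * cxy * tan_c);

  std::printf("tan_c = %g  ->  cov_rad = %g\n", tan_c, cov_rad);
}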
- */ - -__host__ __device__ inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const Matrix2Nd& cov_cart, - const Vector4d& fast_fit, - const VectorNd& rad) +template + __host__ __device__ inline VectorNd cov_carttorad_prefit(const M2xN& p2D, const Matrix2Nd& cov_cart, + V4& fast_fit, + const VectorNd& rad) { u_int n = p2D.cols(); - MatrixNd cov_rad = MatrixXd::Zero(n, n); + VectorNd cov_rad; for (u_int i = 0; i < n; ++i) { //!< in case you have (0,0) to avoid dividing by 0 radius if (rad(i) < 1.e-4) - cov_rad(i, i) = cov_cart(i, i); // TO FIX + cov_rad(i) = cov_cart(i, i); // TO FIX else { Vector2d a = p2D.col(i); @@ -404,7 +377,7 @@ __host__ __device__ inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, const double y2 = cross2D(a, b); const double tan_c = -y2 / x2; const double tan_c2 = sqr(tan_c); - cov_rad(i, i) = 1. / (1. + tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); + cov_rad(i) = 1. / (1. + tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); } } return cov_rad; @@ -414,17 +387,15 @@ __host__ __device__ inline MatrixNd cov_carttorad_prefit(const Matrix2xNd& p2D, \brief Compute the points' weights' vector for the circle fit when multiple scattering is managed. Further information in attached documentation. - \param cov_rad_inv covariance matrix inverse in radial coordinated (or, beter, pre-fitted circle's orthogonal system). - \return weight VectorNd points' weights' vector. - \bug I'm not sure this is the right way to compute the weights for non diagonal cov matrix. Further investigation needed. */ -__host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) + template + __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { return cov_rad_inv.colwise().sum().transpose(); } @@ -433,14 +404,12 @@ __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) \brief Find particle q considering the sign of cross product between particles velocity (estimated by the first 2 hits) and the vector radius between the first hit and the center of the fitted circle. - \param p2D 2D points in transverse plane. \param par_uvr result of the circle fit in this form: (X0,Y0,R). - \return q int 1 or -1. */ - -__host__ __device__ inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& par_uvr) +template + __host__ __device__ inline int32_t Charge(const M2xN& p2D, const Vector3d& par_uvr) { return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > 0)? -1 : 1; } @@ -448,7 +417,6 @@ __host__ __device__ inline int64_t Charge(const Matrix2xNd& p2D, const Vector3d& /*! \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,p_t) and consequently covariance matrix. - \param circle_uvr parameter (X0,Y0,R), covariance matrix to be transformed and particle charge. \param B magnetic field in Gev/cm/c unit. @@ -467,7 +435,9 @@ __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, const double temp2 = sqr(circle.par(0)) * 1. / temp0; const double temp3 = 1. / temp1 * circle.q; Matrix3d J4; - J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, circle.par(1) * temp3, -circle.q, 0., 0., B; + J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. 
/ circle.par(0), 0., + circle.par(0) * temp3, circle.par(1) * temp3, -circle.q, + 0., 0., B; circle.cov = J4 * circle.cov * J4.transpose(); } circle.par = par_pak; @@ -475,14 +445,10 @@ __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, /*! \brief Compute the eigenvector associated to the minimum eigenvalue. - \param A the Matrix you want to know eigenvector and eigenvalue. \param chi2 the double were the chi2-related quantity will be stored. - \return the eigenvector associated to the minimum eigenvalue. - \warning double precision is needed for a correct assessment of chi2. - \details The minimus eigenvalue is related to chi2. We exploit the fact that the matrix is symmetrical and small (2x2 for line fit and 3x3 for circle fit), so the SelfAdjointEigenSolver from Eigen @@ -490,19 +456,18 @@ __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, and 3x3 Matrix) wich computes eigendecomposition of given matrix using a fast closed-form algorithm. For this optimization the matrix type must be known at compiling time. - */ __host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("min_eigen3D - enter\n"); #endif - SelfAdjointEigenSolver solver(3); + Eigen::SelfAdjointEigenSolver solver(3); solver.computeDirect(A); int min_index; chi2 = solver.eigenvalues().minCoeff(&min_index); -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("min_eigen3D - exit\n"); #endif return solver.eigenvectors().col(min_index); @@ -511,12 +476,9 @@ __host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) /*! \brief A faster version of min_eigen3D() where double precision is not needed. - \param A the Matrix you want to know eigenvector and eigenvalue. \param chi2 the double were the chi2-related quantity will be stored - \return the eigenvector associated to the minimum eigenvalue. - \detail The computedDirect() method of SelfAdjointEigenSolver for 3x3 Matrix indeed, use trigonometry function (it solves a third degree equation) which speed up in single precision. @@ -524,7 +486,7 @@ __host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) __host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) { - SelfAdjointEigenSolver solver(3); + Eigen::SelfAdjointEigenSolver solver(3); solver.computeDirect(A.cast()); int min_index; solver.eigenvalues().minCoeff(&min_index); @@ -533,12 +495,9 @@ __host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) /*! \brief 2D version of min_eigen3D(). - \param A the Matrix you want to know eigenvector and eigenvalue. \param chi2 the double were the chi2-related quantity will be stored - \return the eigenvector associated to the minimum eigenvalue. - \detail The computedDirect() method of SelfAdjointEigenSolver for 2x2 Matrix do not use special math function (just sqrt) therefore it doesn't speed up significantly in single precision. @@ -546,7 +505,7 @@ __host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { - SelfAdjointEigenSolver solver(2); + Eigen::SelfAdjointEigenSolver solver(2); solver.computeDirect(A); int min_index; chi2 = solver.eigenvalues().minCoeff(&min_index); @@ -556,23 +515,19 @@ __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) /*! 
\brief A very fast helix fit: it fits a circle by three points (first, middle and last point) and a line by two points (first and last). - \param hits points to be fitted - \return result in this form: (X0,Y0,R,tan(theta)). - \warning points must be passed ordered (from internal layer to external) in order to maximize accuracy and do not mistake tan(theta) sign. - \details This fast fit is used as pre-fit which is needed for: - weights estimation and chi2 computation in line fit (fundamental); - weights estimation and chi2 computation in circle fit (useful); - computation of error due to multiple scattering. */ -__host__ __device__ inline Vector4d Fast_fit(const Matrix3xNd& hits) +template +__host__ __device__ inline void Fast_fit(const M3xN& hits, V4 & result) { - Vector4d result; u_int n = hits.cols(); // get the number of hits printIt(&hits, "Fast_fit - hits: "); @@ -583,35 +538,26 @@ __host__ __device__ inline Vector4d Fast_fit(const Matrix3xNd& hits) printIt(&b, "Fast_fit - b: "); printIt(&c, "Fast_fit - c: "); // Compute their lengths - const double b2 = b.squaredNorm(); - const double c2 = c.squaredNorm(); - double X0; - double Y0; + auto b2 = b.squaredNorm(); + auto c2 = c.squaredNorm(); // The algebra has been verified (MR). The usual approach has been followed: // * use an orthogonal reference frame passing from the first point. // * build the segments (chords) // * build orthogonal lines through mid points // * make a system and solve for X0 and Y0. // * add the initial point - if (abs(b.x()) > abs(b.y())) - { //!< in case b.x is 0 (2 hits with same x) - const double k = c.x() / b.x(); - const double div = 2. * (k * b.y() - c.y()); - // if aligned TO FIX - Y0 = (k * b2 - c2) / div; - X0 = b2 / (2 * b.x()) - b.y() / b.x() * Y0; - } - else - { - const double k = c.y() / b.y(); - const double div = 2. * (k * b.x() - c.x()); - // if aligned TO FIX - X0 = (k * b2 - c2) / div; - Y0 = b2 / (2 * b.y()) - b.x() / b.y() * X0; - } - - result(0) = X0 + hits(0, 0); - result(1) = Y0 + hits(1, 0); + bool flip = abs(b.x()) < abs(b.y()); + auto bx = flip ? b.y() : b.x(); + auto by = flip ? b.x() : b.y(); + auto cx = flip ? c.y() : c.x(); + auto cy = flip ? c.x() : c.y(); + //!< in case b.x is 0 (2 hits with same x) + auto div = 2. * (cx * by - bx*cy); + // if aligned TO FIX + auto Y0 = (cx*b2 - bx*c2) / div; + auto X0 = (0.5*b2 - Y0*by) / bx; + result(0) = hits(0, 0) + ( flip ? Y0 : X0); + result(1) = hits(1, 0) + ( flip ? X0 : Y0); result(2) = sqrt(sqr(X0) + sqr(Y0)); printIt(&result, "Fast_fit - result: "); @@ -621,23 +567,21 @@ __host__ __device__ inline Vector4d Fast_fit(const Matrix3xNd& hits) printIt(&e, "Fast_fit - e: "); printIt(&d, "Fast_fit - d: "); // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) - const double dr = result(2) * atan2(cross2D(d, e), d.dot(e)); + auto dr = result(2) * atan2(cross2D(d, e), d.dot(e)); // Simple difference in Z between last and first hit - const double dz = hits(2, n - 1) - hits(2, 0); + auto dz = hits(2, n - 1) - hits(2, 0); result(3) = (dr / dz); -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); #endif - return result; } /*! \brief Fit a generic number of 2D points with a circle using Riemann-Chernov algorithm. Covariance matrix of fitted parameter is optionally computed. Multiple scattering (currently only in barrel layer) is optionally handled. - \param hits2D 2D points to be fitted. \param hits_cov2D covariance matrix of 2D points. 
\param fast_fit pre-fit result in this form: (X0,Y0,R,tan(theta)). @@ -645,92 +589,80 @@ __host__ __device__ inline Vector4d Fast_fit(const Matrix3xNd& hits) \param B magnetic field \param error flag for error computation. \param scattering flag for multiple scattering - \return circle circle_fit: -par parameter of the fitted circle in this form (X0,Y0,R); \n -cov covariance matrix of the fitted parameter (not initialized if error = false); \n -q charge of the particle; \n -chi2. - \warning hits must be passed ordered from inner to outer layer (double hits on the same layer must be ordered too) so that multiple scattering is treated properly. \warning Multiple scattering for barrel is still not tested. \warning Multiple scattering for endcap hits is not handled (yet). Do not fit endcap hits with scattering = true ! - \bug for small pt (<0.3 Gev/c) chi2 could be slightly underestimated. \bug further investigation needed for error propagation with multiple scattering. */ - -__host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, - const Matrix2Nd& hits_cov2D, - const Vector4d& fast_fit, - const VectorNd& rad, +template +__host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, + const Matrix2Nd& hits_cov2D, + const V4& fast_fit, + const VectorNd& rad, const double B, - const bool error = true) + const bool error) { -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - enter\n"); #endif // INITIALIZATION - Matrix2Nd V = hits_cov2D; + Matrix2Nd V = hits_cov2D; u_int n = hits2D.cols(); printIt(&hits2D, "circle_fit - hits2D:"); printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - WEIGHT COMPUTATION\n"); #endif // WEIGHT COMPUTATION - VectorNd weight; - MatrixNd G; + VectorNd weight; + MatrixNd G; double renorm; { - MatrixNd cov_rad; - cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad); - printIt(&cov_rad, "circle_fit - cov_rad:"); - // cov_rad = cov_carttorad(hits2D, V); - - MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); + MatrixNd cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad).asDiagonal(); + MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); printIt(&hits2D, "circle_fit - hits2D bis:"); -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("Address of hits2D: a) %p\n", &hits2D); #endif V += cov_radtocart(hits2D, scatter_cov_rad, rad); printIt(&V, "circle_fit - V:"); cov_rad += scatter_cov_rad; printIt(&cov_rad, "circle_fit - cov_rad:"); - Matrix4d cov_rad4 = cov_rad; - Matrix4d G4; - G4 = cov_rad4.inverse(); - printIt(&G4, "circle_fit - G4:"); - renorm = G4.sum(); - G4 *= 1. / renorm; - printIt(&G4, "circle_fit - G4:"); - G = G4; + G = cov_rad.inverse(); + renorm = G.sum(); + G *= 1. 
/ renorm; weight = Weight_circle(G); } printIt(&weight, "circle_fit - weight:"); // SPACE TRANSFORMATION -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - SPACE TRANSFORMATION\n"); #endif // center -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("Address of hits2D: b) %p\n", &hits2D); #endif const Vector2d h_ = hits2D.rowwise().mean(); // centroid printIt(&h_, "circle_fit - h_:"); - Matrix3xNd p3D(3, n); + Matrix3xNd p3D; p3D.block(0, 0, 2, n) = hits2D.colwise() - h_; printIt(&p3D, "circle_fit - p3D: a)"); - Vector2Nd mc(2 * n); // centered hits, used in error computation + Vector2Nd mc; // centered hits, used in error computation mc << p3D.row(0).transpose(), p3D.row(1).transpose(); printIt(&mc, "circle_fit - mc(centered hits):"); @@ -743,25 +675,24 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); printIt(&p3D, "circle_fit - p3D: b)"); -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - COST FUNCTION\n"); #endif // COST FUNCTION // compute - Matrix3d A = Matrix3d::Zero(); - const Vector3d r0 = p3D * weight; // center of gravity - const Matrix3xNd X = p3D.colwise() - r0; - A = X * G * X.transpose(); + Vector3d r0; r0.noalias() = p3D * weight; // center of gravity + const Matrix3xNd X = p3D.colwise() - r0; + Matrix3d A = X * G * X.transpose(); printIt(&A, "circle_fit - A:"); -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - MINIMIZE\n"); #endif // minimize double chi2; Vector3d v = min_eigen3D(A, chi2); -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN\n"); #endif printIt(&v, "v BEFORE INVERSION"); @@ -769,21 +700,21 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, printIt(&v, "v AFTER INVERSION"); // This hack to be able to run on GPU where the automatic assignment to a // double from the vector multiplication is not working. -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN 1\n"); #endif - Matrix cm; -#if RFIT_DEBUG + Eigen::Matrix cm; +#ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN 2\n"); #endif cm = -v.transpose() * r0; -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN 3\n"); #endif const double c = cm(0, 0); // const double c = -v.transpose() * r0; -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); #endif // COMPUTE CIRCLE PARAMETER @@ -801,57 +732,60 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, circle.chi2 = abs(chi2) * renorm * 1. 
/ sqr(2 * v(2) * par_uvr_(2) * s); printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); -#if RFIT_DEBUG - printf("circle_fit - CIRCLE CHARGE: %ld\n", circle.q); +#ifdef RFIT_DEBUG + printf("circle_fit - CIRCLE CHARGE: %d\n", circle.q); #endif -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - ERROR PROPAGATION\n"); #endif // ERROR PROPAGATION if (error) { -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); #endif - ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points -#if RFIT_DEBUG + ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points + MatrixNd C[3][3]; // cov matrix of 3D transformed points +#ifdef RFIT_DEBUG printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); #endif { - Matrix cm; - Matrix cm2; + Eigen::Matrix cm; + Eigen::Matrix cm2; cm = mc.transpose() * V * mc; - // cm2 = mc * mc.transpose(); const double c = cm(0, 0); - // const double c2 = cm2(0,0); - const Matrix2Nd Vcs = sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * - (2. * V.squaredNorm() + 4. * c) * // mc.transpose() * V * mc) * - mc * mc.transpose(); + Matrix2Nd Vcs; Vcs. template triangularView() = (sqr(s) * V + + sqr(sqr(s)) * 1. / (4. * q * n) * + (2. * V.squaredNorm() + 4. * c) * // mc.transpose() * V * mc) * + (mc * mc.transpose())); + printIt(&Vcs, "circle_fit - Vcs:"); - Vcs_[0][0] = Vcs.block(0, 0, n, n); + C[0][0] = Vcs.block(0, 0, n, n). template selfadjointView(); Vcs_[0][1] = Vcs.block(0, n, n, n); - Vcs_[1][1] = Vcs.block(n, n, n, n); + C[1][1] = Vcs.block(n, n, n, n). template selfadjointView(); Vcs_[1][0] = Vcs_[0][1].transpose(); printIt(&Vcs, "circle_fit - Vcs:"); } - MatrixNd C[3][3]; // cov matrix of 3D transformed points { - const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); - const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); - const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); - const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); - const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); - const ArrayNd t10 = t01.transpose(); - C[0][0] = Vcs_[0][0]; + const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); + const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); + const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); + const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); + const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); + const ArrayNd t10 = t01.transpose(); + Vcs_[0][0] = C[0][0];; C[0][1] = Vcs_[0][1]; C[0][2] = 2. * (Vcs_[0][0] * t0 + Vcs_[0][1] * t1); - C[1][1] = Vcs_[1][1]; + Vcs_[1][1] = C[1][1]; C[1][2] = 2. * (Vcs_[1][0] * t0 + Vcs_[1][1] * t1); - C[2][2] = 2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + - Vcs_[1][1] * Vcs_[1][1]) + - 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11); + MatrixNd tmp; + tmp. template triangularView() + = ( 2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + + Vcs_[1][1] * Vcs_[1][1]) + + 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11) ).matrix(); + C[2][2] = tmp. 
template selfadjointView(); } printIt(&C[0][0], "circle_fit - C[0][0]:"); @@ -860,7 +794,7 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, { for (u_int j = i; j < 3; ++j) { - Matrix tmp; + Eigen::Matrix tmp; tmp = weight.transpose() * C[i][j] * weight; const double c = tmp(0, 0); C0(i, j) = c; //weight.transpose() * C[i][j] * weight; @@ -869,14 +803,14 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, } printIt(&C0, "circle_fit - C0:"); - const MatrixNd W = weight * weight.transpose(); - const MatrixNd H = MatrixXd::Identity(n, n).rowwise() - weight.transpose(); - const MatrixNx3d s_v = H * p3D.transpose(); + const MatrixNd W = weight * weight.transpose(); + const MatrixNd H = MatrixXd::Identity(n, n).rowwise() - weight.transpose(); + const MatrixNx3d s_v = H * p3D.transpose(); printIt(&W, "circle_fit - W:"); printIt(&H, "circle_fit - H:"); printIt(&s_v, "circle_fit - s_v:"); - MatrixNd D_[3][3]; // cov(s_v) + MatrixNd D_[3][3]; // cov(s_v) { D_[0][0] = (H * C[0][0] * H.transpose()).cwiseProduct(W); D_[0][1] = (H * C[0][1] * H.transpose()).cwiseProduct(W); @@ -893,14 +827,16 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; Matrix6d E; // cov matrix of the 6 independent elements of A + #pragma unroll for (u_int a = 0; a < 6; ++a) { const u_int i = nu[a][0], j = nu[a][1]; + #pragma unroll for (u_int b = a; b < 6; ++b) { const u_int k = nu[b][0], l = nu[b][1]; - VectorNd t0(n); - VectorNd t1(n); + VectorNd t0(n); + VectorNd t1(n); if (l == k) { t0 = 2. * D_[j][l] * s_v.col(l); @@ -920,14 +856,14 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, if (i == j) { - Matrix cm; + Eigen::Matrix cm; cm = s_v.col(i).transpose() * (t0 + t1); const double c = cm(0, 0); E(a, b) = 0. + c; } else { - Matrix cm; + Eigen::Matrix cm; cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); const double c = cm(0, 0); E(a, b) = 0. + c; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); @@ -938,7 +874,8 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, } printIt(&E, "circle_fit - E:"); - Matrix J2; // Jacobian of min_eigen() (numerically computed) + Eigen::Matrix J2; // Jacobian of min_eigen() (numerically computed) + #pragma unroll for (u_int a = 0; a < 6; ++a) { const u_int i = nu[a][0], j = nu[a][1]; @@ -957,9 +894,8 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, Cvc.block(0, 0, 3, 3) = t0; Cvc.block(0, 3, 3, 1) = t1; Cvc.block(3, 0, 1, 3) = t1.transpose(); - Matrix cm1; - // Matrix cm2; - Matrix cm3; + Eigen::Matrix cm1; + Eigen::Matrix cm3; cm1 = (v.transpose() * C0 * v); // cm2 = (C0.cwiseProduct(t0)).sum(); cm3 = (r0.transpose() * t0 * r0); @@ -969,7 +905,7 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, } printIt(&Cvc, "circle_fit - Cvc:"); - Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + Eigen::Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) { const double t = 1. / h; J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, @@ -977,7 +913,7 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, } printIt(&J3, "circle_fit - J3:"); - const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) + const RowVector2Nd Jq = mc.transpose() * s * 1. 
/ n; // var(q) printIt(&Jq, "circle_fit - Jq:"); Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) @@ -987,225 +923,12 @@ __host__ __device__ inline circle_fit Circle_fit(const Matrix2xNd& hits2D, } printIt(&circle.cov, "Circle cov:"); -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("circle_fit - exit\n"); #endif return circle; } -/*! - \brief Fit of helix parameter cotan(theta)) and Zip by projection on the - pre-fitted cylinder and line fit on its surface. - - \param hits hits coordinates. - \param hits_cov covariance matrix of the hits. - \param circle cylinder parameter, their covariance (if computed, otherwise - uninitialized) and particle charge. - \param fast_fit result of the previous fast fit in this form: - (X0,Y0,R,cotan(theta))). - \param error flag for error computation. - - \return line line_fit: - -par parameter of the line in this form: (cotan(theta)), Zip); \n - -cov covariance matrix of the fitted parameter; \n - -chi2. - - \warning correlation between R and z are neglected, this could be relevant - if geometry detector provides sloped modules in the R/z plane. - - \bug chi2 and errors could be slightly underestimated for small eta (<0.2) - when pt is small (<0.3 Gev/c). - - \todo multiple scattering treatment. - - \details Line fit is made by orthogonal distance regression where - correlation between coordinates in the transverse plane (x,y) and z are - neglected (for a barrel + endcap geometry this is a very good - approximation). - Covariance matrix of the fitted parameter is optionally computed. - Multiple scattering is not handled (yet). - A fast pre-fit is performed in order to evaluate weights and to compute - errors. -*/ - -__host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const circle_fit& circle, - const Vector4d& fast_fit, - const double B, - const bool error = true) -{ - u_int n = hits.cols(); - double theta = -circle.q*atan(fast_fit(3)); - theta = theta < 0. ? theta + M_PI : theta; - // PROJECTION ON THE CILINDER - Matrix2xNd p2D = MatrixXd::Zero(2, n); - Eigen::Matrix Jx; - -#if RFIT_DEBUG - printf("Line_fit - B: %g\n", B); - printIt(&hits, "Line_fit points: "); - printIt(&hits_cov, "Line_fit covs: "); -#endif - // x & associated Jacobian - // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf - // Slide 11 - // a ==> -o i.e. the origin of the circle in XY plane, negative - // b ==> p i.e. distances of the points wrt the origin of the circle. - const Vector2d o(circle.par(0), circle.par(1)); - - // associated Jacobian, used in weights and errors computation - Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); - for (u_int i = 0; i < n; ++i) - { // x - Matrix6d Cov = MatrixXd::Zero(6, 6); - Matrix2d Cov_sz_single = MatrixXd::Zero(2, 2); - Vector2d p = hits.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); - // atan2(cross, dot) give back the angle in the transverse plane so tha the - // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) - const double atan2_ = -circle.q * atan2(cross, dot); - p2D(0, i) = atan2_ * circle.par(2); - - // associated Jacobian, used in weights and errors- computation - const double temp0 = -circle.q * circle.par(2) * 1. 
/ (sqr(dot) + sqr(cross)); - double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta - if (error) - { - d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); - d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); - d_R = atan2_; - } - const double d_x = temp0 * (o(1) * dot + o(0) * cross); - const double d_y = temp0 * (-o(0) * dot + o(1) * cross); - Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; -// Jx << d_X0, d_Y0, d_R, p(1)/p.norm(), -p(0)/p.norm(), 0, 0, 0, 0, 0, 0, 1.; - Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = hits_cov(i, i); - Cov(4, 4) = hits_cov(i + n, i + n); - Cov(5, 5) = hits_cov(i + 2*n, i + 2*n); - Cov(3, 4) = Cov(4, 3) = hits_cov(i, i + n); - Cov(3, 5) = Cov(5, 3) = hits_cov(i, i + 2*n); - Cov(4, 5) = Cov(5, 4) = hits_cov(i + n, i + 2*n); - Cov_sz_single = Jx * Cov * Jx.transpose(); - cov_sz(i, i) = Cov_sz_single(0, 0); - cov_sz(i + n, i + n) = Cov_sz_single(1, 1); - cov_sz(i, i + n) = cov_sz(i + n, i) = Cov_sz_single(0, 1); - } - // Math of d_{X0,Y0,R,x,y} all verified by hand - - // y - p2D.row(1) = hits.row(2); - - // WEIGHT COMPUTATION -#if RFIT_DEBUG - printIt(&cov_sz, "line_fit - cov_sz:"); -#endif - MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B); -#if RFIT_DEBUG - printIt(&cov_with_ms, "line_fit - cov_with_ms: "); -#endif - Matrix4d G; - G = cov_with_ms.inverse(); -#if RFIT_DEBUG - printIt(&G, "line_fit - cov_with_ms.inverse():"); -#endif - double renorm = G.sum(); - G *= 1. / renorm; -#if RFIT_DEBUG - printIt(&G, "line_fit - G4:"); -#endif - - const VectorNd weight = Weight_circle(G); - - // COST FUNCTION - - // compute - // r0 represents the weighted mean of "x" and "y". - const Vector2d r0 = p2D * weight; - // This is the X vector that will be used to build the - // scatter matrix S = X^T * X - const Matrix2xNd X = p2D.colwise() - r0; - Matrix2d A = Matrix2d::Zero(); - A = X * G * X.transpose(); - -#if RFIT_DEBUG - printIt(&A, "Line_fit - A: "); -#endif - - // minimize. v is normalized!! - double chi2; - Vector2d v = min_eigen2D(A, chi2); -#if RFIT_DEBUG - printIt(&v, "Line_fit - v: "); - printf("Line_fit chi2: %e\n", chi2); -#endif - - // This hack to be able to run on GPU where the automatic assignment to a - // double from the vector multiplication is not working. - Matrix cm; - cm = -v.transpose() * r0; - const double c = cm(0, 0); - - // COMPUTE LINE PARAMETER - line_fit line; - line.par << -v(0) / v(1), // cotan(theta)) - -c / v(1); // Zip - line.chi2 = abs(chi2*renorm); -#if RFIT_DEBUG - printIt(&(line.par), "Line_fit - line.par: "); - printf("Line_fit - v norm: %e\n", sqrt(v(0)*v(0) + v(1)*v(1))); -#endif - - // ERROR PROPAGATION - if (error) - { - const double v0_2 = sqr(v(0)); - const double v1_2 = sqr(v(1)); - - Matrix3d C; // cov(v,c) - { - // The norm is taken from Chernov, properly adapted to the weights case. 
- double norm = v.transpose() * A * v; -// double norm_empirical = cov_with_ms.diagonal().mean(); -#if RFIT_DEBUG - printf("Chi_2: %g\n", chi2); - printf("Norm: %g\n", norm); - printf("weight.sum(): %g\n", weight.sum()); - printf("Line_fit - norm: %e\n", norm); -#endif - - const double sig2 = norm/(A(0,0) + A(1,1)); - C(0, 0) = sig2 * v1_2; - C(1, 1) = sig2 * v0_2; - C(1, 0) = C(0, 1) = -sig2 * v(0) * v(1); - C(2, 2) = sig2 * (v(0)*r0(1)-v(1)*r0(0))*(v(0)*r0(1)-v(1)*r0(0)) + (sig2/n)*(A(0,0)+A(1,1)); - C(0, 2) = C(2, 0) = sig2*(v(0)*r0(1)-v(1)*r0(0))*v(1); - C(1, 2) = C(2, 1) = - sig2*(v(0)*r0(1)-v(1)*r0(0))*v(0); - } -#if RFIT_DEBUG - printIt(&C, "line_fit - C:"); -#endif - - Matrix J; // Jacobian of (v,c) -> (cotan(theta)),Zip) - { - const double t0 = 1. / v(1); - const double t1 = sqr(t0); - J << -t0, v(0) * t1, 0., 0., c * t1, -t0; - } - Matrix JT = J.transpose().eval(); -#if RFIT_DEBUG - printIt(&J, "line_fit - J:"); -#endif - line.cov = J * C * JT; - } - -#if RFIT_DEBUG - printIt(&line.cov, "Line cov:"); -#endif - return line; -} /*! \brief Perform an ordinary least square fit in the s-z plane to compute * the parameters cotTheta and Zip. @@ -1223,16 +946,25 @@ __host__ __device__ inline line_fit Line_fit_odr(const Matrix3xNd& hits, * what is done in the same fit in the Broken Line approach. */ -__host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const circle_fit& circle, - const Vector4d& fast_fit, - const double B, - const bool error = true) { + template +__host__ __device__ +inline line_fit Line_fit(const M3xN& hits, + const M6xN & hits_ge, + const circle_fit& circle, + const V4& fast_fit, + const double B, + const bool error) { + + constexpr uint32_t N = M3xN::ColsAtCompileTime; auto n = hits.cols(); double theta = -circle.q*atan(fast_fit(3)); theta = theta < 0. ? 
theta + M_PI : theta; + + // Prepare the Rotation Matrix to rotate the points + Eigen::Matrix rot; + rot << sin(theta), cos(theta), -cos(theta), sin(theta); + // PROJECTION ON THE CILINDER // // p2D will be: @@ -1241,16 +973,14 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, // s values will be ordinary x-values // z values will be ordinary y-values - Matrix2xNd p2D(2, n); + Matrix2xNd p2D = Matrix2xNd::Zero(); Eigen::Matrix Jx; - p2D << MatrixXd::Zero(2, n); - Jx << MatrixXd::Zero(2, 6); - -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("Line_fit - B: %g\n", B); printIt(&hits, "Line_fit points: "); - printIt(&hits_cov, "Line_fit covs: "); + printIt(&hits_ge, "Line_fit covs: "); + printIt(&rot, "Line_fit rot: "); #endif // x & associated Jacobian // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf @@ -1260,9 +990,8 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, const Vector2d o(circle.par(0), circle.par(1)); // associated Jacobian, used in weights and errors computation - Matrix2Nd cov_sz = MatrixXd::Zero(2 * n, 2 * n); - Matrix6d Cov(6,6); - Matrix2d Cov_sz_single(2, 2); + Matrix6d Cov = Matrix6d::Zero(); + Matrix2d cov_sz[4]; // FIXME: should be "N" for (u_int i = 0; i < n; ++i) { Vector2d p = hits.block(0, i, 2, 1) - o; @@ -1287,39 +1016,34 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, const double d_y = temp0 * (-o(0) * dot + o(1) * cross); Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; - Cov << MatrixXd::Zero(6, 6); - Cov_sz_single << MatrixXd::Zero(2, 2); + + Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = hits_cov(i, i); // x errors - Cov(4, 4) = hits_cov(i + n, i + n); // y errors - Cov(5, 5) = hits_cov(i + 2*n, i + 2*n); // z errors - Cov(3, 4) = Cov(4, 3) = hits_cov(i, i + n); // cov_xy - Cov(3, 5) = Cov(5, 3) = hits_cov(i, i + 2*n); // cov_xz - Cov(4, 5) = Cov(5, 4) = hits_cov(i + n, i + 2*n); // cov_yz - Cov_sz_single = Jx * Cov * Jx.transpose(); - cov_sz(i, i) = Cov_sz_single(0, 0); - cov_sz(i + n, i + n) = Cov_sz_single(1, 1); - cov_sz(i, i + n) = cov_sz(i + n, i) = Cov_sz_single(0, 1); + Cov(3, 3) = hits_ge.col(i)[0]; // x errors + Cov(4, 4) = hits_ge.col(i)[2]; // y errors + Cov(5, 5) = hits_ge.col(i)[5]; // z errors + Cov(3, 4) = Cov(4, 3) = hits_ge.col(i)[1]; // cov_xy + Cov(3, 5) = Cov(5, 3) = hits_ge.col(i)[3]; // cov_xz + Cov(4, 5) = Cov(5, 4) = hits_ge.col(i)[4]; // cov_yz + Matrix2d tmp = Jx * Cov * Jx.transpose(); + cov_sz[i].noalias() = rot*tmp*rot.transpose(); } // Math of d_{X0,Y0,R,x,y} all verified by hand p2D.row(1) = hits.row(2); // The following matrix will contain errors orthogonal to the rotated S // component only, with the Multiple Scattering properly treated!! 
- MatrixNd cov_with_ms = Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B); -#if RFIT_DEBUG - printIt(&cov_sz, "line_fit - cov_sz:"); + MatrixNd cov_with_ms; + Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B,cov_with_ms); +#ifdef RFIT_DEBUG + printIt(cov_sz, "line_fit - cov_sz:"); printIt(&cov_with_ms, "line_fit - cov_with_ms: "); #endif - // Prepare the Rotation Matrix to rotate the points - Eigen::Matrix rot = Eigen::Matrix::Zero(); - rot << sin(theta), cos(theta), -cos(theta), sin(theta); - // Rotate Points with the shape [2, n] - Matrix2xNd p2D_rot = rot*p2D; + Matrix2xNd p2D_rot = rot*p2D; -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("Fast fit Tan(theta): %g\n", fast_fit(3)); printf("Rotation angle: %g\n", theta); printIt(&rot, "Rotation Matrix:"); @@ -1329,15 +1053,15 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, #endif // Build the A Matrix - Matrix2xNd A(2,n); + Matrix2xNd A; A << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printIt(&A, "A Matrix:"); #endif // Build A^T V-1 A, where V-1 is the covariance of only the Y components. - MatrixNd Vy_inv = cov_with_ms.inverse(); + MatrixNd Vy_inv = cov_with_ms.inverse(); Eigen::Matrix Inv_Cov = A*Vy_inv*A.transpose(); // Compute the Covariance Matrix of the fit parameters @@ -1349,20 +1073,20 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, Eigen::Matrix sol = Cov_params*A*Vy_inv*p2D_rot.row(1).transpose(); -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printIt(&sol, "Rotated solutions:"); #endif // We need now to transfer back the results in the original s-z plane auto common_factor = 1./(sin(theta)-sol(1,0)*cos(theta)); - Matrix J = Matrix::Zero(); + Eigen::Matrix J; J << 0., common_factor*common_factor, common_factor, sol(0,0)*cos(theta)*common_factor*common_factor; double m = common_factor*(sol(1,0)*sin(theta)+cos(theta)); double q = common_factor*sol(0,0); auto cov_mq = J * Cov_params * J.transpose(); - VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; + VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; double chi2 = res.transpose()*Vy_inv*res; chi2 = chi2 / float(n); @@ -1371,7 +1095,7 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, line.cov << cov_mq; line.chi2 = chi2; -#if RFIT_DEBUG +#ifdef RFIT_DEBUG printf("Common_factor: %g\n", common_factor); printIt(&J, "Jacobian:"); printIt(&sol, "Rotated solutions:"); @@ -1393,14 +1117,11 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, -line fit of hits projected on cylinder surface by orthogonal distance regression (see Line_fit for further info). \n Points must be passed ordered (from inner to outer layer). 
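   Note on this patch: the per-hit uncertainties now enter Helix_fit as a packed
   6xN matrix, hits_ge, carrying for each hit the six independent elements of its
   symmetric 3x3 spatial covariance; they are expanded internally with
   Rfit::loadCovariance2D before the circle fit, while Line_fit consumes hits_ge
   directly. The 3Nx3N layout documented below describes the previous hits_cov
   interface. See fillHitsAndHitsCov in the updated tests for an example of
   filling hits_ge.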
- \param hits Matrix3xNd hits coordinates in this form: \n |x0|x1|x2|...|xn| \n |y0|y1|y2|...|yn| \n |z0|z1|z2|...|zn| - \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n - |(x0,x0)|(x1,x0)|(x2,x0)|.|(y0,x0)|(y1,x0)|(y2,x0)|.|(z0,x0)|(z1,x0)|(z2,x0)| \n |(x0,x1)|(x1,x1)|(x2,x1)|.|(y0,x1)|(y1,x1)|(y2,x1)|.|(z0,x1)|(z1,x1)|(z2,x1)| \n |(x0,x2)|(x1,x2)|(x2,x2)|.|(y0,x2)|(y1,x2)|(y2,x2)|.|(z0,x2)|(z1,x2)|(z2,x2)| \n @@ -1412,31 +1133,31 @@ __host__ __device__ inline line_fit Line_fit(const Matrix3xNd& hits, |(x0,z0)|(x1,z0)|(x2,z0)|.|(y0,z0)|(y1,z0)|(y2,z0)|.|(z0,z0)|(z1,z0)|(z2,z0)| \n |(x0,z1)|(x1,z1)|(x2,z1)|.|(y0,z1)|(y1,z1)|(y2,z1)|.|(z0,z1)|(z1,z1)|(z2,z1)| \n |(x0,z2)|(x1,z2)|(x2,z2)|.|(y0,z2)|(y1,z2)|(y2,z2)|.|(z0,z2)|(z1,z2)|(z2,z2)| - \param B magnetic field in the center of the detector in Gev/cm/c unit, in order to perform pt calculation. \param error flag for error computation. \param scattering flag for multiple scattering treatment. (see Circle_fit() documentation for further info). - \warning see Circle_fit(), Line_fit() and Fast_fit() warnings. - \bug see Circle_fit(), Line_fit() and Fast_fit() bugs. */ -inline helix_fit Helix_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, const double B, - const bool error = true) +template +inline helix_fit Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B, + const bool error) { u_int n = hits.cols(); - VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + VectorNd<4> rad = (hits.block(0, 0, 2, n).colwise().norm()); // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. - const Vector4d fast_fit = Fast_fit(hits); - + Vector4d fast_fit; + Fast_fit(hits,fast_fit); + Rfit::Matrix2Nd<4> hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::loadCovariance2D(hits_ge,hits_cov); circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), - hits_cov.block(0, 0, 2 * n, 2 * n), + hits_cov, fast_fit, rad, B, error); - line_fit line = Line_fit(hits, hits_cov, circle, fast_fit, B, error); + line_fit line = Line_fit(hits, hits_ge, circle, fast_fit, B, error); par_uvrtopak(circle, B, error); @@ -1457,4 +1178,4 @@ inline helix_fit Helix_fit(const Matrix3xNd& hits, const Matrix3Nd& hits_cov, co } // namespace Rfit -#endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h +#endif diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml index c549e05d69f55..d8177a0e9447c 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml @@ -1,4 +1,7 @@ + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc index eadfda8cb6a26..7f13c7218eafa 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc @@ -1,3 +1,4 @@ +#include "storeTracks.h" #include "PixelTrackProducer.h" #include "FWCore/Framework/interface/Event.h" @@ -59,62 +60,6 @@ void PixelTrackProducer::produce(edm::Event& ev, const edm::EventSetup& es) es.get().get(httopo); // store tracks - store(ev, tracks, *httopo); + storeTracks(ev, tracks, *httopo); } -void PixelTrackProducer::store(edm::Event& ev, const TracksWithTTRHs& tracksWithHits, const TrackerTopology& ttopo) -{ - auto tracks = std::make_unique(); - auto recHits = std::make_unique(); - auto trackExtras = 
std::make_unique(); - - int cc = 0, nTracks = tracksWithHits.size(); - - for (int i = 0; i < nTracks; i++) - { - reco::Track* track = tracksWithHits.at(i).first; - const SeedingHitSet& hits = tracksWithHits.at(i).second; - - for (unsigned int k = 0; k < hits.size(); k++) - { - TrackingRecHit *hit = hits[k]->hit()->clone(); - - track->appendHitPattern(*hit, ttopo); - recHits->push_back(hit); - } - tracks->push_back(*track); - delete track; - - } - - LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event" << "\n"; - edm::OrphanHandle ohRH = ev.put(std::move(recHits)); - - edm::RefProd hitCollProd(ohRH); - for (int k = 0; k < nTracks; k++) - { - reco::TrackExtra theTrackExtra{}; - - //fill the TrackExtra with TrackingRecHitRef - unsigned int nHits = tracks->at(k).numberOfValidHits(); - theTrackExtra.setHits(hitCollProd, cc, nHits); - cc +=nHits; - AlgebraicVector5 v = AlgebraicVector5(0,0,0,0,0); - reco::TrackExtra::TrajParams trajParams(nHits,LocalTrajectoryParameters(v,1.)); - reco::TrackExtra::Chi2sFive chi2s(nHits,0); - theTrackExtra.setTrajParams(std::move(trajParams),std::move(chi2s)); - trackExtras->push_back(theTrackExtra); - } - - LogDebug("TrackProducer") << "put the collection of TrackExtra in the event" << "\n"; - edm::OrphanHandle ohTE = ev.put(std::move(trackExtras)); - - for (int k = 0; k < nTracks; k++) - { - const reco::TrackExtraRef theTrackExtraRef(ohTE,k); - (tracks->at(k)).setExtra(theTrackExtraRef); - } - - ev.put(std::move(tracks)); - -} diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h index 6bc6d2815c8e7..7e0d5d73b03fc 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h @@ -2,7 +2,6 @@ #define PixelTrackProducer_h #include "FWCore/Framework/interface/stream/EDProducer.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackReconstruction.h" #include "PixelTrackReconstructionGPU.h" @@ -22,7 +21,6 @@ class PixelTrackProducer : public edm::stream::EDProducer<> { void produce(edm::Event& ev, const edm::EventSetup& es) override; private: - void store(edm::Event& ev, const pixeltrackfitting::TracksWithTTRHs& selectedTracks, const TrackerTopology& ttopo); bool runOnGPU_; PixelTrackReconstruction theReconstruction; PixelTrackReconstructionGPU theGPUReconstruction; diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h new file mode 100644 index 0000000000000..48abab5237587 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h @@ -0,0 +1,77 @@ +#ifndef RecoPixelVertexingPixelTrackFittingStoreTracks_H +#define RecoPixelVertexingPixelTrackFittingStoreTracks_H + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" + +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h" + +#include 
"DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" + +template +void storeTracks(Ev & ev, const pixeltrackfitting::TracksWithTTRHs& tracksWithHits, const TrackerTopology& ttopo) +{ + auto tracks = std::make_unique(); + auto recHits = std::make_unique(); + auto trackExtras = std::make_unique(); + + int cc = 0, nTracks = tracksWithHits.size(); + + for (int i = 0; i < nTracks; i++) + { + reco::Track* track = tracksWithHits.at(i).first; + const SeedingHitSet& hits = tracksWithHits.at(i).second; + + for (unsigned int k = 0; k < hits.size(); k++) + { + TrackingRecHit *hit = hits[k]->hit()->clone(); + + track->appendHitPattern(*hit, ttopo); + recHits->push_back(hit); + } + tracks->push_back(*track); + delete track; + + } + + LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event" << "\n"; + edm::OrphanHandle ohRH = ev.put(std::move(recHits)); + + edm::RefProd hitCollProd(ohRH); + for (int k = 0; k < nTracks; k++) + { + reco::TrackExtra theTrackExtra{}; + + //fill the TrackExtra with TrackingRecHitRef + unsigned int nHits = tracks->at(k).numberOfValidHits(); + theTrackExtra.setHits(hitCollProd, cc, nHits); + cc +=nHits; + AlgebraicVector5 v = AlgebraicVector5(0,0,0,0,0); + reco::TrackExtra::TrajParams trajParams(nHits,LocalTrajectoryParameters(v,1.)); + reco::TrackExtra::Chi2sFive chi2s(nHits,0); + theTrackExtra.setTrajParams(std::move(trajParams),std::move(chi2s)); + trackExtras->push_back(theTrackExtra); + } + + LogDebug("TrackProducer") << "put the collection of TrackExtra in the event" << "\n"; + edm::OrphanHandle ohTE = ev.put(std::move(trackExtras)); + + for (int k = 0; k < nTracks; k++) + { + const reco::TrackExtraRef theTrackExtraRef(ohTE,k); + (tracks->at(k)).setExtra(theTrackExtraRef); + } + + ev.put(std::move(tracks)); + +} + +#endif diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index 728b3fec47f39..e868ff1921965 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -50,7 +50,10 @@ doublets = "pixelTracksHitDoublets", SeedComparitorPSet = dict(clusterShapeCacheSrc = 'siPixelClusterShapeCachePreSplitting') ) + from Configuration.ProcessModifiers.gpu_cff import gpu +from RecoPixelVertexing.PixelTriplets.caHitQuadrupletHeterogeneousEDProducer_cfi import caHitQuadrupletHeterogeneousEDProducer as _caHitQuadrupletHeterogeneousEDProducer +gpu.toReplaceWith(pixelTracksHitQuadruplets, _caHitQuadrupletHeterogeneousEDProducer) gpu.toModify(pixelTracksHitQuadruplets, trackingRegions = "pixelTracksTrackingRegions") # for trackingLowPU @@ -67,6 +70,10 @@ ) trackingLowPU.toModify(pixelTracks, SeedingHitSets = "pixelTracksHitTriplets") +from Configuration.ProcessModifiers.gpu_cff import gpu +from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromCUDA_cfi import pixelTrackProducerFromCUDA as _pixelTrackProducerFromCUDA +gpu.toReplaceWith(pixelTracks, _pixelTrackProducerFromCUDA) + pixelTracksTask = cms.Task( pixelTracksTrackingRegions, pixelFitterByHelixProjections, diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index d6beb57b862b8..b4b5e3a335bcb 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -1,3 +1,4 @@ + @@ -11,19 +12,48 @@ + + + + + + + + + + + + + + + 
+ + + - - - + + + + + + + + + + + + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc index 77b5d1bebe6b6..adcabd7dde508 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc @@ -5,8 +5,13 @@ #include #include #include // unique_ptr +#include + +#include +#include #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +//#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" using namespace std; using namespace Eigen; @@ -20,9 +25,10 @@ using Vector6d = Eigen::Matrix; using Vector8d = Eigen::Matrix; }; // namespace Rfit +// quadruplets... struct hits_gen { - Matrix3xNd hits; - Matrix3Nd hits_cov; + Matrix3xNd<4> hits; + Eigen::Matrix hits_ge; Vector5d true_par; }; @@ -66,30 +72,31 @@ void smearing(const Vector5d& err, const bool& isbarrel, double& x, double& y, d } } -void Hits_cov(Matrix3Nd& V, const unsigned int& i, const unsigned int& n, const Matrix3xNd& hits, +template +void Hits_cov(Eigen::Matrix & V, const unsigned int& i, const unsigned int& n, const Matrix3xNd& hits, const Vector5d& err, bool isbarrel) { if (isbarrel) { double R2 = Rfit::sqr(hits(0, i)) + Rfit::sqr(hits(1, i)); - V(i, i) = + V.col(i)[0] = (Rfit::sqr(err[1]) * Rfit::sqr(hits(1, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(0, i))) / R2; - V(i + n, i + n) = + V.col(i)[2] = (Rfit::sqr(err[1]) * Rfit::sqr(hits(0, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(1, i))) / R2; - V(i, i + n) = V(i + n, i) = + V.col(i)[1] = (Rfit::sqr(err[0]) - Rfit::sqr(err[1])) * hits(1, i) * hits(0, i) / R2; - V(i + 2 * n, i + 2 * n) = Rfit::sqr(err[2]); + V.col(i)[5] = Rfit::sqr(err[2]); } else { - V(i, i) = Rfit::sqr(err[3]); - V(i + n, i + n) = Rfit::sqr(err[3]); - V(i + 2 * n, i + 2 * n) = Rfit::sqr(err[4]); + V.col(i)[0] = Rfit::sqr(err[3]); + V.col(i)[2] = Rfit::sqr(err[3]); + V.col(i)[5] = Rfit::sqr(err[4]); } } hits_gen Hits_gen(const unsigned int& n, const Matrix& gen_par) { hits_gen gen; gen.hits = MatrixXd::Zero(3, n); - gen.hits_cov = MatrixXd::Zero(3 * n, 3 * n); + gen.hits_ge = Eigen::Matrix::Zero(); // err /= 10000.; constexpr double rad[8] = {2.95, 6.8, 10.9, 16., 3.1, 7., 11., 16.2}; // constexpr double R_err[8] = {5./10000, 5./10000, 5./10000, 5./10000, 5./10000, @@ -123,7 +130,7 @@ hits_gen Hits_gen(const unsigned int& n, const Matrix& gen_par) { Vector5d err; err << R_err[i], Rp_err[i], z_err[i], 0, 0; smearing(err, true, gen.hits(0, i), gen.hits(1, i), gen.hits(2, i)); - Hits_cov(gen.hits_cov, i, n, gen.hits, err, true); + Hits_cov(gen.hits_ge, i, n, gen.hits, err, true); } return gen; @@ -164,161 +171,265 @@ Matrix New_par(const Matrix& gen_par, const int& cha return new_par; } -void test_helix_fit() { +template +void computePull(std::array & fit, const char * label, + int n_, int iteration, const Vector5d & true_par) { + Eigen::Matrix score(41, iteration); + + std::string histo_name("Phi Pull"); + histo_name += label; + TH1F phi_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "dxy Pull "; + histo_name += label; + TH1F dxy_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "dz Pull "; + histo_name += label; + TH1F dz_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "Theta Pull "; + histo_name += label; + TH1F theta_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "Pt Pull "; + histo_name += label; + 
TH1F pt_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "Phi Error "; + histo_name += label; + TH1F phi_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "dxy error "; + histo_name += label; + TH1F dxy_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "dz error "; + histo_name += label; + TH1F dz_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "Theta error "; + histo_name += label; + TH1F theta_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "Pt error "; + histo_name += label; + TH1F pt_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + for (int x = 0; x < iteration; x++) { + // Compute PULLS information + score(0, x) = (fit[x].par(0) - true_par(0)) / sqrt(fit[x].cov(0, 0)); + score(1, x) = (fit[x].par(1) - true_par(1)) / sqrt(fit[x].cov(1, 1)); + score(2, x) = (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(2, 2)); + score(3, x) = (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(3, 3)); + score(4, x) = (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(4, 4)); + phi_pull.Fill(score(0, x)); + dxy_pull.Fill(score(1, x)); + pt_pull.Fill(score(2, x)); + theta_pull.Fill(score(3, x)); + dz_pull.Fill(score(4, x)); + phi_error.Fill(sqrt(fit[x].cov(0, 0))); + dxy_error.Fill(sqrt(fit[x].cov(1, 1))); + pt_error.Fill(sqrt(fit[x].cov(2, 2))); + theta_error.Fill(sqrt(fit[x].cov(3, 3))); + dz_error.Fill(sqrt(fit[x].cov(4, 4))); + score(5, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / (fit[x].cov(0, 1)); + score(6, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(0, 2)); + score(7, x) = + (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(1, 2)); + score(8, x) = + (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / (fit[x].cov(3, 4)); + score(9, x) = fit[x].chi2_circle; + score(25, x) = fit[x].chi2_line; + score(10, x) = sqrt(fit[x].cov(0, 0)) / fit[x].par(0) * 100; + score(13, x) = sqrt(fit[x].cov(3, 3)) / fit[x].par(3) * 100; + score(14, x) = sqrt(fit[x].cov(4, 4)) / fit[x].par(4) * 100; + score(15, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(3) - true_par(3)) / + sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(3, 3)); + score(16, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(3) - true_par(3)) / + sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(3, 3)); + score(17, x) = (fit[x].par(2) - true_par(2)) * (fit[x].par(3) - true_par(3)) / + sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(3, 3)); + score(18, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(4) - true_par(4)) / + sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(4, 4)); + score(19, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(4) - true_par(4)) / + sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(4, 4)); + score(20, x) = (fit[x].par(2) - true_par(2)) * (fit[x].par(4) - true_par(4)) / + sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(4, 4)); + score(21, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / + sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(1, 1)); + score(22, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / + sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(2, 2)); + score(23, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / + sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(2, 2)); + score(24, x) = (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / + sqrt(fit[x].cov(3, 3)) / sqrt(fit[x].cov(4, 4)); + score(30, x) = fit[x].par(0); + score(31, x) = fit[x].par(1); + score(32, x) = fit[x].par(2); + 
score(33, x) = fit[x].par(3); + score(34, x) = fit[x].par(4); + score(35, x) = sqrt(fit[x].cov(0,0)); + score(36, x) = sqrt(fit[x].cov(1,1)); + score(37, x) = sqrt(fit[x].cov(2,2)); + score(38, x) = sqrt(fit[x].cov(3,3)); + score(39, x) = sqrt(fit[x].cov(4,4)); + + } + + double phi_ = score.row(0).mean(); + double a_ = score.row(1).mean(); + double pt_ = score.row(2).mean(); + double coT_ = score.row(3).mean(); + double Zip_ = score.row(4).mean(); + std::cout << std::setprecision(5) << std::scientific << label << " AVERAGE FITTED VALUES: \n" + << "phi: " << score.row(30).mean() << " +/- " << score.row(35).mean() << " [+/-] " << sqrt(score.row(35).array().abs2().mean() - score.row(35).mean()*score.row(35).mean()) << std::endl + << "d0: " << score.row(31).mean() << " +/- " << score.row(36).mean() << " [+/-] " << sqrt(score.row(36).array().abs2().mean() - score.row(36).mean()*score.row(36).mean()) << std::endl + << "pt: " << score.row(32).mean() << " +/- " << score.row(37).mean() << " [+/-] " << sqrt(score.row(37).array().abs2().mean() - score.row(37).mean()*score.row(37).mean()) << std::endl + << "coT: " << score.row(33).mean() << " +/- " << score.row(38).mean() << " [+/-] " << sqrt(score.row(38).array().abs2().mean() - score.row(38).mean()*score.row(38).mean()) << std::endl + << "Zip: " << score.row(34).mean() << " +/- " << score.row(39).mean() << " [+/-] " << sqrt(score.row(39).array().abs2().mean() - score.row(39).mean()*score.row(39).mean()) << std::endl; + + Matrix5d correlation; + correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), + score.row(20).mean(), score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), + score.row(19).mean(), score.row(22).mean(), score.row(23).mean(), 1., score.row(17).mean(), + score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), score.row(17).mean(), 1., + score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), + score.row(24).mean(), 1.; + + cout << "\n" << label << " PULLS (mean, sigma, relative_error):\n" + << "phi: " << phi_ << " " + << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " + << abs(score.row(10).mean()) << "%\n" + << "a0 : " << a_ << " " + << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " + << abs(score.row(11).mean()) << "%\n" + << "pt : " << pt_ << " " + << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " + << abs(score.row(12).mean()) << "%\n" + << "coT: " << coT_ << " " + << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " + << abs(score.row(13).mean()) << "%\n" + << "Zip: " << Zip_ << " " + << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << " " + << abs(score.row(14).mean()) << "%\n\n" + << "cov(phi,a0)_: " << score.row(5).mean() << "\n" + << "cov(phi,pt)_: " << score.row(6).mean() << "\n" + << "cov(a0,pt)_: " << score.row(7).mean() << "\n" + << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" + << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" + << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" + << "correlation matrix:\n" + << correlation << "\n\n" + << endl; + + phi_pull.Fit("gaus", "Q"); + dxy_pull.Fit("gaus", "Q"); + dz_pull.Fit("gaus", "Q"); + theta_pull.Fit("gaus", "Q"); + pt_pull.Fit("gaus", "Q"); + phi_pull.Write(); + dxy_pull.Write(); + dz_pull.Write(); + theta_pull.Write(); + pt_pull.Write(); + phi_error.Write(); + dxy_error.Write(); + dz_error.Write(); + 
theta_error.Write(); + pt_error.Write(); +} + + +void test_helix_fit(bool getcin) { int n_; - int iteration; - int debug2 = 0; bool return_err; const double B_field = 3.8 * c_speed / pow(10, 9) / 100; Matrix gen_par; Vector5d true_par; Vector5d err; -// while (1) { - generator.seed(1); - int debug = 0; - debug2 = 0; - std::cout << std::setprecision(6); - cout << "_________________________________________________________________________\n"; - cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration return_err debug" << endl; -// cin >> n_ >> gen_par(0) >> gen_par(1) >> gen_par(2) >> gen_par(3) >> gen_par(4) >> gen_par(5) >> -// iteration >> return_err >> debug2; - n_ = 4; - gen_par(0) = -0.1; // x - gen_par(1) = 0.1; // y - gen_par(2) = -1.; // z - gen_par(3) = 45.; // phi - gen_par(4) = 10.; // R (p_t) - gen_par(5) = 1.; // eta - iteration = 1; - return_err = true; - debug2 = 1; - - iteration *= 10; - gen_par = New_par(gen_par, 1, B_field); - true_par = True_par(gen_par, 1, B_field); - Matrix3xNd hits; - Matrix3Nd hits_cov; - unique_ptr helix(new helix_fit[iteration]); -// helix_fit* helix = new helix_fit[iteration]; - Matrix score(41, iteration); - - for (int i = 0; i < iteration; i++) { - if (debug2 == 1 && i == (iteration - 1)) { - debug = 1; - } - hits_gen gen; - gen = Hits_gen(n_, gen_par); -// gen.hits = MatrixXd::Zero(3, 4); -// gen.hits_cov = MatrixXd::Zero(3 * 4, 3 * 4); -// gen.hits.col(0) << 1.82917642593, 2.0411875248, 7.18495464325; -// gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; -// gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; -// gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; - helix[i] = Rfit::Helix_fit(gen.hits, gen.hits_cov, B_field, return_err); + generator.seed(1); + std::cout << std::setprecision(6); + cout << "_________________________________________________________________________\n"; + cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration return_err debug" << endl; + if (getcin) { + cout << "hits: "; + cin >> n_; + cout << "x: "; + cin >> gen_par(0); + cout << "y: "; + cin >> gen_par(1); + cout << "z: "; + cin >> gen_par(2); + cout << "phi: "; + cin >> gen_par(3); + cout << "p_t: "; + cin >> gen_par(4); + cout << "eta: "; + cin >> gen_par(5); + } else { + n_ = 4; + gen_par(0) = -0.1; // x + gen_par(1) = 0.1; // y + gen_par(2) = -1.; // z + gen_par(3) = 45.; // phi + gen_par(4) = 10.; // R (p_t) + gen_par(5) = 1.; // eta + } + return_err = true; - if (debug) - cout << std::setprecision(10) - << "phi: " << helix[i].par(0) << " +/- " << sqrt(helix[i].cov(0, 0)) << " vs " - << true_par(0) << endl - << "Tip: " << helix[i].par(1) << " +/- " << sqrt(helix[i].cov(1, 1)) << " vs " - << true_par(1) << endl - << "p_t: " << helix[i].par(2) << " +/- " << sqrt(helix[i].cov(2, 2)) << " vs " - << true_par(2) << endl - << "theta:" << helix[i].par(3) << " +/- " << sqrt(helix[i].cov(3, 3)) << " vs " - << true_par(3) << endl - << "Zip: " << helix[i].par(4) << " +/- " << sqrt(helix[i].cov(4, 4)) << " vs " - << true_par(4) << endl - << "charge:" << helix[i].q << " vs 1" << endl - << "covariance matrix:" << endl - << helix[i].cov << endl - << "Initial hits:\n" << gen.hits << endl - << "Initial Covariance:\n" << gen.hits_cov << endl; - } + const int iteration = 5000; + gen_par = New_par(gen_par, 1, B_field); + true_par = True_par(gen_par, 1, B_field); + // Matrix3xNd<4> hits; + std::array helixRiemann_fit; +// std::array helixBrokenLine_fit; - for (int x = 0; x < iteration; x++) { - // Compute PULLS information - 
score(0, x) = (helix[x].par(0) - true_par(0)) / sqrt(helix[x].cov(0, 0)); - score(1, x) = (helix[x].par(1) - true_par(1)) / sqrt(helix[x].cov(1, 1)); - score(2, x) = (helix[x].par(2) - true_par(2)) / sqrt(helix[x].cov(2, 2)); - score(3, x) = (helix[x].par(3) - true_par(3)) / sqrt(helix[x].cov(3, 3)); - score(4, x) = (helix[x].par(4) - true_par(4)) / sqrt(helix[x].cov(4, 4)); - score(5, x) = - (helix[x].par(0) - true_par(0)) * (helix[x].par(1) - true_par(1)) / (helix[x].cov(0, 1)); - score(6, x) = - (helix[x].par(0) - true_par(0)) * (helix[x].par(2) - true_par(2)) / (helix[x].cov(0, 2)); - score(7, x) = - (helix[x].par(1) - true_par(1)) * (helix[x].par(2) - true_par(2)) / (helix[x].cov(1, 2)); - score(8, x) = - (helix[x].par(3) - true_par(3)) * (helix[x].par(4) - true_par(4)) / (helix[x].cov(3, 4)); - score(9, x) = helix[x].chi2_circle; - score(25, x) = helix[x].chi2_line; - score(10, x) = sqrt(helix[x].cov(0, 0)) / helix[x].par(0) * 100; - score(13, x) = sqrt(helix[x].cov(3, 3)) / helix[x].par(3) * 100; - score(14, x) = sqrt(helix[x].cov(4, 4)) / helix[x].par(4) * 100; - score(15, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(3) - true_par(3)) / - sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(3, 3)); - score(16, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(3) - true_par(3)) / - sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(3, 3)); - score(17, x) = (helix[x].par(2) - true_par(2)) * (helix[x].par(3) - true_par(3)) / - sqrt(helix[x].cov(2, 2)) / sqrt(helix[x].cov(3, 3)); - score(18, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(4) - true_par(4)) / - sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(4, 4)); - score(19, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(4) - true_par(4)) / - sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(4, 4)); - score(20, x) = (helix[x].par(2) - true_par(2)) * (helix[x].par(4) - true_par(4)) / - sqrt(helix[x].cov(2, 2)) / sqrt(helix[x].cov(4, 4)); - score(21, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(1) - true_par(1)) / - sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(1, 1)); - score(22, x) = (helix[x].par(0) - true_par(0)) * (helix[x].par(2) - true_par(2)) / - sqrt(helix[x].cov(0, 0)) / sqrt(helix[x].cov(2, 2)); - score(23, x) = (helix[x].par(1) - true_par(1)) * (helix[x].par(2) - true_par(2)) / - sqrt(helix[x].cov(1, 1)) / sqrt(helix[x].cov(2, 2)); - score(24, x) = (helix[x].par(3) - true_par(3)) * (helix[x].par(4) - true_par(4)) / - sqrt(helix[x].cov(3, 3)) / sqrt(helix[x].cov(4, 4)); - } + std::cout << "\nTrue parameters: " + << "phi: " << true_par(0) << " " + << "dxy: " << true_par(1) << " " + << "pt: " << true_par(2) << " " + << "CotT: " << true_par(3) << " " + << "Zip: " << true_par(4) << " " + << std::endl; + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start-start; + for (int i = 0; i < 100*iteration; i++) { + hits_gen gen; + gen = Hits_gen(n_, gen_par); + // gen.hits = MatrixXd::Zero(3, 4); + // gen.hits_cov = MatrixXd::Zero(3 * 4, 3 * 4); + // gen.hits.col(0) << 1.82917642593, 2.0411875248, 7.18495464325; + // gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; + // gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; + // gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; + delta -= std::chrono::high_resolution_clock::now()-start; + helixRiemann_fit[i%iteration] = Rfit::Helix_fit(gen.hits, gen.hits_ge, B_field, return_err); + delta += std::chrono::high_resolution_clock::now()-start; - double phi_ = score.row(0).mean(); - double a_ = score.row(1).mean(); - double 
pt_ = score.row(2).mean(); - double coT_ = score.row(3).mean(); - double Zip_ = score.row(4).mean(); - Matrix5d correlation; - correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), - score.row(20).mean(), score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), - score.row(19).mean(), score.row(22).mean(), score.row(23).mean(), 1., score.row(17).mean(), - score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), score.row(17).mean(), 1., - score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), - score.row(24).mean(), 1.; +// helixBrokenLine_fit[i] = BrokenLine::Helix_fit(gen.hits, gen.hits_cov, B_field); - cout << "\nPULLS:\n" - << "phi: " << phi_ << " " - << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " - << abs(score.row(10).mean()) << "%\n" - << "a0 : " << a_ << " " - << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " - << abs(score.row(11).mean()) << "%\n" - << "pt : " << pt_ << " " - << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " - << abs(score.row(12).mean()) << "%\n" - << "coT: " << coT_ << " " - << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " - << abs(score.row(13).mean()) << "%\n" - << "Zip: " << Zip_ << " " - << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << " " - << abs(score.row(14).mean()) << "%\n\n" - << "cov(phi,a0)_: " << score.row(5).mean() << "\n" - << "cov(phi,pt)_: " << score.row(6).mean() << "\n" - << "cov(a0,pt)_: " << score.row(7).mean() << "\n" - << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" - << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" - << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" - << "correlation matrix:\n" - << correlation << "\n\n" - << endl; -// } + if (helixRiemann_fit[i%iteration].par(0)>10.) 
std::cout << "error" << std::endl; + if (0==i) + cout << std::setprecision(6) + << "phi: " << helixRiemann_fit[i].par(0) << " +/- " << sqrt(helixRiemann_fit[i].cov(0, 0)) << " vs " + << true_par(0) << endl + << "Tip: " << helixRiemann_fit[i].par(1) << " +/- " << sqrt(helixRiemann_fit[i].cov(1, 1)) << " vs " + << true_par(1) << endl + << "p_t: " << helixRiemann_fit[i].par(2) << " +/- " << sqrt(helixRiemann_fit[i].cov(2, 2)) << " vs " + << true_par(2) << endl + << "theta:" << helixRiemann_fit[i].par(3) << " +/- " << sqrt(helixRiemann_fit[i].cov(3, 3)) << " vs " + << true_par(3) << endl + << "Zip: " << helixRiemann_fit[i].par(4) << " +/- " << sqrt(helixRiemann_fit[i].cov(4, 4)) << " vs " + << true_par(4) << endl + << "charge:" << helixRiemann_fit[i].q << " vs 1" << endl + << "covariance matrix:" << endl + << helixRiemann_fit[i].cov << endl + << "Initial hits:\n" << gen.hits << endl + << "Initial Covariance:\n" << gen.hits_ge << endl; + + } + std::cout << "elapsted time " << double(std::chrono::duration_cast(delta).count())/1.e6 << std::endl; + computePull(helixRiemann_fit, "Riemann", n_, iteration, true_par); +// computePull(helixBrokenLine_fit, "BrokenLine", n_, iteration, true_par); } -int main() { - test_helix_fit(); +int main(int nargs, char**) { + TFile f("TestFitResults.root", "RECREATE"); + test_helix_fit(nargs>1); + f.Close(); return 0; } + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index 7b1125eebc312..a60eeda935d79 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -3,76 +3,61 @@ #include #include + #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "test_common.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" using namespace Eigen; -__global__ -void kernelFullFit(Rfit::Matrix3xNd * hits, - Rfit::Matrix3Nd * hits_cov, - double B, - bool errors, - Rfit::circle_fit * circle_fit_resultsGPU, - Rfit::line_fit * line_fit_resultsGPU) { - - printf("hits size: %d,%d\n", hits->rows(), hits->cols()); - Rfit::printIt(hits, "KernelFulFit - input hits: "); - Vector4d fast_fit = Rfit::Fast_fit(*hits); - - u_int n = hits->cols(); - Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); - - Rfit::Matrix2xNd hits2D_local = (hits->block(0,0,2,n)).eval(); - Rfit::Matrix2Nd hits_cov2D_local = (hits_cov->block(0, 0, 2 * n, 2 * n)).eval(); - Rfit::printIt(&hits2D_local, "kernelFullFit - hits2D_local: "); - Rfit::printIt(&hits_cov2D_local, "kernelFullFit - hits_cov2D_local: "); - /* - printf("kernelFullFit - hits address: %p\n", hits); - printf("kernelFullFit - hits_cov address: %p\n", hits_cov); - printf("kernelFullFit - hits_cov2D address: %p\n", &hits2D_local); - printf("kernelFullFit - hits_cov2D_local address: %p\n", &hits_cov2D_local); - */ - /* At some point I gave up and locally construct block on the stack, so that - the next invocation to Rfit::Circle_fit works properly. Failing to do so - implied basically an empty collection of hits and covariances. That could - have been partially fixed if values of the passed in matrices would have - been printed on screen since that, maybe, triggered internally the real - creations of the blocks. To be understood and compared against the myriad - of compilation warnings we have. 
- */ - (*circle_fit_resultsGPU) = - Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), - fast_fit, rad, B, errors); - /* - (*circle_fit_resultsGPU) = - Rfit::Circle_fit(hits2D_local, hits_cov2D_local, - fast_fit, rad, B, errors, scattering); - */ - (*line_fit_resultsGPU) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit_resultsGPU, fast_fit, errors); - - return; +namespace Rfit { + constexpr uint32_t maxNumberOfTracks() { return 5*1024; } + constexpr uint32_t stride() { return maxNumberOfTracks();} + using Matrix3x4d = Eigen::Matrix; + using Map3x4d = Eigen::Map >; + using Matrix6x4f = Eigen::Matrix; + using Map6x4f = Eigen::Map >; + using Map4d = Eigen::Map >; + } __global__ -void kernelFastFit(Rfit::Matrix3xNd * hits, Vector4d * results) { - (*results) = Rfit::Fast_fit(*hits); +void kernelFastFit(double * __restrict__ phits, double * __restrict__ presults) { + auto i = blockIdx.x*blockDim.x + threadIdx.x; + Rfit::Map3x4d hits(phits+i,3,4); + Rfit::Map4d result(presults+i,4); + Rfit::Fast_fit(hits, result); } __global__ -void kernelCircleFit(Rfit::Matrix3xNd * hits, - Rfit::Matrix3Nd * hits_cov, Vector4d * fast_fit_input, double B, +void kernelCircleFit(double * __restrict__ phits, + float * __restrict__ phits_ge, + double * __restrict__ pfast_fit_input, + double B, Rfit::circle_fit * circle_fit_resultsGPU) { - u_int n = hits->cols(); - Rfit::VectorNd rad = (hits->block(0, 0, 2, n).colwise().norm()); - -#if TEST_DEBUG - printf("fast_fit_input(0): %f\n", (*fast_fit_input)(0)); - printf("fast_fit_input(1): %f\n", (*fast_fit_input)(1)); - printf("fast_fit_input(2): %f\n", (*fast_fit_input)(2)); - printf("fast_fit_input(3): %f\n", (*fast_fit_input)(3)); + +auto i = blockIdx.x*blockDim.x + threadIdx.x; + Rfit::Map3x4d hits(phits+i,3,4); + Rfit::Map4d fast_fit_input(pfast_fit_input+i,4); + Rfit::Map6x4f hits_ge(phits_ge+i,6,4); + + constexpr uint32_t N = Rfit::Map3x4d::ColsAtCompileTime; + constexpr auto n = N; + + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + + Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::loadCovariance2D(hits_ge,hits_cov); + +#ifdef TEST_DEBUG +if (0==i) { + printf("hits %f, %f\n", hits.block(0,0,2,n)(0,0), hits.block(0,0,2,n)(0,1)); + printf("hits %f, %f\n", hits.block(0,0,2,n)(1,0), hits.block(0,0,2,n)(1,1)); + printf("fast_fit_input(0): %f\n", fast_fit_input(0)); + printf("fast_fit_input(1): %f\n", fast_fit_input(1)); + printf("fast_fit_input(2): %f\n", fast_fit_input(2)); + printf("fast_fit_input(3): %f\n", fast_fit_input(3)); printf("rad(0,0): %f\n", rad(0,0)); printf("rad(1,1): %f\n", rad(1,1)); printf("rad(2,2): %f\n", rad(2,2)); @@ -81,91 +66,126 @@ void kernelCircleFit(Rfit::Matrix3xNd * hits, printf("hits_cov(2,2): %f\n", (*hits_cov)(2,2)); printf("hits_cov(11,11): %f\n", (*hits_cov)(11,11)); printf("B: %f\n", B); +} +#endif + circle_fit_resultsGPU[i] = + Rfit::Circle_fit(hits.block(0,0,2,n), hits_cov, + fast_fit_input, rad, B, true); +#ifdef TEST_DEBUG +if (0==i) { + printf("Circle param %f,%f,%f\n",circle_fit_resultsGPU[i].par(0),circle_fit_resultsGPU[i].par(1),circle_fit_resultsGPU[i].par(2)); +} #endif - (*circle_fit_resultsGPU) = - Rfit::Circle_fit(hits->block(0,0,2,n), hits_cov->block(0, 0, 2 * n, 2 * n), - *fast_fit_input, rad, B, false); } __global__ -void kernelLineFit(Rfit::Matrix3xNd * hits, - Rfit::Matrix3Nd * hits_cov, +void kernelLineFit(double * __restrict__ phits, + float * __restrict__ phits_ge, + double B, Rfit::circle_fit * circle_fit, - Vector4d * fast_fit, + double * __restrict__ 
pfast_fit, Rfit::line_fit * line_fit) { - (*line_fit) = Rfit::Line_fit(*hits, *hits_cov, *circle_fit, *fast_fit, true); + auto i = blockIdx.x*blockDim.x + threadIdx.x; + Rfit::Map3x4d hits(phits+i,3,4); + Rfit::Map4d fast_fit(pfast_fit+i,4); + Rfit::Map6x4f hits_ge(phits_ge+i,6,4); + line_fit[i] = Rfit::Line_fit(hits, hits_ge, circle_fit[i], fast_fit, B, true); } -void fillHitsAndHitsCov(Rfit::Matrix3xNd & hits, Rfit::Matrix3Nd & hits_cov) { +template +__device__ __host__ +void fillHitsAndHitsCov(M3x4 & hits, M6x4 & hits_ge) { hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 17.793; - hits_cov(0,0) = 7.14652e-06; - hits_cov(1,1) = 2.15789e-06; - hits_cov(2,2) = 1.63328e-06; - hits_cov(3,3) = 6.27919e-06; - hits_cov(4,4) = 6.10348e-06; - hits_cov(5,5) = 2.08211e-06; - hits_cov(6,6) = 1.61672e-06; - hits_cov(7,7) = 6.28081e-06; - hits_cov(8,8) = 5.184e-05; - hits_cov(9,9) = 1.444e-05; - hits_cov(10,10) = 6.25e-06; - hits_cov(11,11) = 3.136e-05; - hits_cov(0,4) = hits_cov(4,0) = -5.60077e-06; - hits_cov(1,5) = hits_cov(5,1) = -1.11936e-06; - hits_cov(2,6) = hits_cov(6,2) = -6.24945e-07; - hits_cov(3,7) = hits_cov(7,3) = -5.28e-06; + hits_ge.col(0)[0] = 7.14652e-06; + hits_ge.col(1)[0] = 2.15789e-06; + hits_ge.col(2)[0] = 1.63328e-06; + hits_ge.col(3)[0] = 6.27919e-06; + hits_ge.col(0)[2] = 6.10348e-06; + hits_ge.col(1)[2] = 2.08211e-06; + hits_ge.col(2)[2] = 1.61672e-06; + hits_ge.col(3)[2] = 6.28081e-06; + hits_ge.col(0)[5] = 5.184e-05; + hits_ge.col(1)[5] = 1.444e-05; + hits_ge.col(2)[5] = 6.25e-06; + hits_ge.col(3)[5] = 3.136e-05; + hits_ge.col(0)[1] = -5.60077e-06; + hits_ge.col(1)[1] = -1.11936e-06; + hits_ge.col(2)[1] = -6.24945e-07; + hits_ge.col(3)[1] = -5.28e-06; +} + +__global__ +void kernelFillHitsAndHitsCov(double * __restrict__ phits, + float * phits_ge) { + auto i = blockIdx.x*blockDim.x + threadIdx.x; + Rfit::Map3x4d hits(phits+i,3,4); + Rfit::Map6x4f hits_ge(phits_ge+i,6,4); + hits_ge = MatrixXf::Zero(6,4); + fillHitsAndHitsCov(hits,hits_ge); } void testFit() { constexpr double B = 0.0113921; - Rfit::Matrix3xNd hits(3,4); - Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); - Rfit::Matrix3xNd * hitsGPU = new Rfit::Matrix3xNd(3,4); - Rfit::Matrix3Nd * hits_covGPU = nullptr; - Vector4d * fast_fit_resultsGPU = new Vector4d(); - Vector4d * fast_fit_resultsGPUret = new Vector4d(); - Rfit::circle_fit * circle_fit_resultsGPU = new Rfit::circle_fit(); + Rfit::Matrix3xNd<4> hits; + Rfit::Matrix6x4f hits_ge = MatrixXf::Zero(6,4); + double * hitsGPU = nullptr;; + float * hits_geGPU = nullptr; + double * fast_fit_resultsGPU = nullptr; + double * fast_fit_resultsGPUret = new double[Rfit::maxNumberOfTracks()*sizeof(Vector4d)]; + Rfit::circle_fit * circle_fit_resultsGPU = nullptr; Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); + Rfit::line_fit * line_fit_resultsGPU = nullptr; - fillHitsAndHitsCov(hits, hits_cov); + fillHitsAndHitsCov(hits, hits_ge); - // FAST_FIT_CPU - Vector4d fast_fit_results = Rfit::Fast_fit(hits); -#if TEST_DEBUG + std::cout << "sizes " << sizeof(hits) << ' ' << sizeof(hits_ge) + << ' ' << sizeof(Vector4d)<< std::endl; + std::cout << "Generated hits:\n" << hits << std::endl; -#endif + std::cout << "Generated cov:\n" << hits_ge << std::endl; + + // FAST_FIT_CPU + Vector4d fast_fit_results; Rfit::Fast_fit(hits, fast_fit_results); std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; - // FAST_FIT GPU - cudaMalloc((void**)&hitsGPU, 
sizeof(Rfit::Matrix3xNd(3,4))); - cudaMalloc((void**)&fast_fit_resultsGPU, sizeof(Vector4d)); - cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice); + // for timing purposes we fit 4096 tracks + constexpr uint32_t Ntracks = 4096; + cudaCheck(cudaMalloc(&hitsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::Matrix3xNd<4>))); + cudaCheck(cudaMalloc(&hits_geGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::Matrix6x4f))); + cudaCheck(cudaMalloc(&fast_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Vector4d))); + cudaCheck(cudaMalloc((void **)&line_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::line_fit))); + cudaCheck(cudaMalloc((void **)&circle_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::circle_fit))); + + + kernelFillHitsAndHitsCov<<>>(hitsGPU,hits_geGPU); - kernelFastFit<<<1, 1>>>(hitsGPU, fast_fit_resultsGPU); + // FAST_FIT GPU + kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); cudaDeviceSynchronize(); - cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, sizeof(Vector4d), cudaMemcpyDeviceToHost); - std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << *fast_fit_resultsGPUret << std::endl; - assert(isEqualFuzzy(fast_fit_results, (*fast_fit_resultsGPUret))); + cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Vector4d), cudaMemcpyDeviceToHost); + Rfit::Map4d fast_fit(fast_fit_resultsGPUret+10,4); + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl; + assert(isEqualFuzzy(fast_fit_results, fast_fit)); // CIRCLE_FIT CPU - u_int n = hits.cols(); - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + constexpr uint32_t N = Rfit::Map3x4d::ColsAtCompileTime; + constexpr auto n = N; + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::loadCovariance2D(hits_ge,hits_cov); Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), - hits_cov.block(0, 0, 2 * n, 2 * n), - fast_fit_results, rad, B, false); + hits_cov, + fast_fit_results, rad, B, true); std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; // CIRCLE_FIT GPU - cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12))); - cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit)); - cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice); - kernelCircleFit<<<1,1>>>(hitsGPU, hits_covGPU, + kernelCircleFit<<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU); cudaDeviceSynchronize(); @@ -175,90 +195,29 @@ void testFit() { assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); // LINE_FIT CPU - Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, fast_fit_results, true); + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; // LINE_FIT GPU - Rfit::line_fit * line_fit_resultsGPU = nullptr; Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); - cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit)); - - kernelLineFit<<<1,1>>>(hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); + kernelLineFit<<>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); cudaDeviceSynchronize(); cudaMemcpy(line_fit_resultsGPUret, 
line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); -} - -void testFitOneGo(bool errors, double epsilon=1e-6) { - constexpr double B = 0.0113921; - Rfit::Matrix3xNd hits(3,4); - Rfit::Matrix3Nd hits_cov = MatrixXd::Zero(12,12); - fillHitsAndHitsCov(hits, hits_cov); - - // FAST_FIT_CPU - Vector4d fast_fit_results = Rfit::Fast_fit(hits); - // CIRCLE_FIT CPU - u_int n = hits.cols(); - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; + std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; + std::cout << "Fitted cov (CircleFit) GPU:\n" << circle_fit_resultsGPUret->cov << std::endl; + std::cout << "Fitted cov (LineFit): GPU\n" << line_fit_resultsGPUret->cov << std::endl; - Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), - hits_cov.block(0, 0, 2 * n, 2 * n), - fast_fit_results, rad, B, errors); - // LINE_FIT CPU - Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_cov, circle_fit_results, - fast_fit_results, errors); - - // FIT GPU - std::cout << "GPU FIT" << std::endl; - Rfit::Matrix3xNd * hitsGPU = nullptr; // new Rfit::Matrix3xNd(3,4); - Rfit::Matrix3Nd * hits_covGPU = nullptr; - Rfit::line_fit * line_fit_resultsGPU = nullptr; - Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); - Rfit::circle_fit * circle_fit_resultsGPU = nullptr; // new Rfit::circle_fit(); - Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); - - cudaCheck(cudaMalloc((void **)&hitsGPU, sizeof(Rfit::Matrix3xNd(3,4)))); - cudaCheck(cudaMalloc((void **)&hits_covGPU, sizeof(Rfit::Matrix3Nd(12,12)))); - cudaCheck(cudaMalloc((void **)&line_fit_resultsGPU, sizeof(Rfit::line_fit))); - cudaCheck(cudaMalloc((void **)&circle_fit_resultsGPU, sizeof(Rfit::circle_fit))); - cudaCheck(cudaMemcpy(hitsGPU, &hits, sizeof(Rfit::Matrix3xNd(3,4)), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(hits_covGPU, &hits_cov, sizeof(Rfit::Matrix3Nd(12,12)), cudaMemcpyHostToDevice)); - - kernelFullFit<<<1, 1>>>(hitsGPU, hits_covGPU, B, errors, - circle_fit_resultsGPU, line_fit_resultsGPU); - cudaCheck(cudaDeviceSynchronize()); - - cudaCheck(cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost)); - - std::cout << "Fitted values (CircleFit) CPU:\n" << circle_fit_results.par << std::endl; - std::cout << "Fitted values (LineFit): CPU\n" << line_fit_results.par << std::endl; - std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; - std::cout << "Fitted values (LineFit): GPU\n" << line_fit_resultsGPUret->par << std::endl; - assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par, epsilon)); - assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, epsilon)); - - cudaCheck(cudaFree(hitsGPU)); - cudaCheck(cudaFree(hits_covGPU)); - cudaCheck(cudaFree(line_fit_resultsGPU)); - cudaCheck(cudaFree(circle_fit_resultsGPU)); - delete line_fit_resultsGPUret; - delete circle_fit_resultsGPUret; - - cudaDeviceReset(); } int main (int argc, char * argv[]) { -// testFit(); + testFit(); std::cout << "TEST FIT, NO ERRORS" << std::endl; - testFitOneGo(false); - 
- std::cout << "TEST FIT, ERRORS AND SCATTER" << std::endl; - testFitOneGo(true, 1e-5); return 0; } diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp new file mode 100644 index 0000000000000..e01aa30efc656 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp @@ -0,0 +1,94 @@ +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" +#include + +using Rfit::Vector5d; +using Rfit::Matrix5d; + + +Vector5d transf(Vector5d p) { + auto sinTheta = 1/std::sqrt(1+p(3)*p(3)); + p(2) = sinTheta/p(2); + return p; +} + +Matrix5d transfFast(Matrix5d cov, Vector5d const & p) { + auto sqr = [](auto x) { return x*x;}; + auto sinTheta = 1/std::sqrt(1+p(3)*p(3)); + auto cosTheta = p(3)*sinTheta; + cov(2,2) = sqr(sinTheta) * ( + cov(2,2)*sqr(1./(p(2)*p(2))) + + cov(3,3)*sqr(cosTheta*sinTheta/p(2)) + ); + cov(3,2) = cov(2,3) = cov(3,3) * cosTheta * sqr(sinTheta) / p(2); + // for (int i=0; i<5; ++i) cov(i,2) *= -sinTheta/(p(2)*p(2)); + // for (int i=0; i<5; ++i) cov(2,i) *= -sinTheta/(p(2)*p(2)); + return cov; + + +} + +Matrix5d Jacobian(Vector5d const & p) { + + Matrix5d J = Matrix5d::Identity(); + + auto sinTheta2 = 1/(1+p(3)*p(3)); + auto sinTheta = std::sqrt(sinTheta2); + J(2,2) = -sinTheta/(p(2)*p(2)); + J(2,3) = -sinTheta2*sinTheta*p(3)/p(2); + return J; +} + +Matrix5d transf(Matrix5d const & cov, Matrix5d const& J) { + + return J*cov*J.transpose(); + +} + +Matrix5d loadCov(Vector5d const & e) { + + Matrix5d cov = Matrix5d::Zero(); + for (int i=0; i<5; ++i) cov(i,i) = e(i)*e(i); + return cov; +} + + +#include +int main() { + + //!<(phi,Tip,pt,cotan(theta)),Zip) + Vector5d par0; par0 << 0.2,0.1,3.5,0.8,0.1; + Vector5d del0; del0 << 0.01,0.01,0.035,-0.03,-0.01; + + Matrix5d J = Jacobian(par0); + + + Vector5d par1 = transf(par0); + Vector5d par2 = transf(par0+del0); + Vector5d del1 = par2-par1; + + Matrix5d cov0 = loadCov(del0); + Matrix5d cov1 = transf(cov0,J); + Matrix5d cov2 = transfFast(cov0,par0); + + // don't ask: guess + std::cout << "par0 " << par0.transpose() << std::endl; + std::cout << "del0 " << del0.transpose() << std::endl; + + + std::cout << "par1 " << par1.transpose() << std::endl; + std::cout << "del1 " << del1.transpose() << std::endl; + std::cout << "del2 " << (J*del0).transpose() << std::endl; + + std::cout << "del1^2 " << (del1.array()*del1.array()).transpose() << std::endl; + std::cout << std::endl; + std::cout << "J\n" << J << std::endl; + + std::cout << "cov0\n" << cov0 << std::endl; + std::cout << "cov1\n" << cov1 << std::endl; + std::cout << "cov2\n" << cov2 << std::endl; + + + return 0; + + +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp new file mode 100644 index 0000000000000..af4a3e52f46fa --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp @@ -0,0 +1,88 @@ +#include + +#include +#include + +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" + +#include "test_common.h" + +using namespace Eigen; + +namespace Rfit { + constexpr uint32_t maxNumberOfTracks() { return 5*1024; } + constexpr uint32_t stride() { return maxNumberOfTracks();} + using Matrix3x4d = Eigen::Matrix; + using Map3x4d = Eigen::Map >; + using Matrix6x4f = Eigen::Matrix; + using Map6x4f = Eigen::Map >; + using Map4d = Eigen::Map >; + +} + +template +void fillHitsAndHitsCov(M3x4 & hits, M6x4 & hits_ge) { + hits << 1.98645, 4.72598, 7.65632, 
11.3151, + 2.18002, 4.88864, 7.75845, 11.3134, + 2.46338, 6.99838, 11.808, 17.793; + hits_ge.col(0)[0] = 7.14652e-06; + hits_ge.col(1)[0] = 2.15789e-06; + hits_ge.col(2)[0] = 1.63328e-06; + hits_ge.col(3)[0] = 6.27919e-06; + hits_ge.col(0)[2] = 6.10348e-06; + hits_ge.col(1)[2] = 2.08211e-06; + hits_ge.col(2)[2] = 1.61672e-06; + hits_ge.col(3)[2] = 6.28081e-06; + hits_ge.col(0)[5] = 5.184e-05; + hits_ge.col(1)[5] = 1.444e-05; + hits_ge.col(2)[5] = 6.25e-06; + hits_ge.col(3)[5] = 3.136e-05; + hits_ge.col(0)[1] = -5.60077e-06; + hits_ge.col(1)[1] = -1.11936e-06; + hits_ge.col(2)[1] = -6.24945e-07; + hits_ge.col(3)[1] = -5.28e-06; +} + +void testFit() { + constexpr double B = 0.0113921; + Rfit::Matrix3xNd<4> hits; + Rfit::Matrix6x4f hits_ge = MatrixXf::Zero(6,4); + + fillHitsAndHitsCov(hits, hits_ge); + + std::cout << "sizes " << sizeof(hits) << ' ' << sizeof(hits_ge) + << ' ' << sizeof(Vector4d)<< std::endl; + + std::cout << "Generated hits:\n" << hits << std::endl; + std::cout << "Generated cov:\n" << hits_ge << std::endl; + + // FAST_FIT_CPU + Vector4d fast_fit_results; Rfit::Fast_fit(hits, fast_fit_results); + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; + + + // CIRCLE_FIT CPU + constexpr uint32_t N = Rfit::Map3x4d::ColsAtCompileTime; + constexpr auto n = N; + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + + Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::loadCovariance2D(hits_ge,hits_cov); + Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), + hits_cov, + fast_fit_results, rad, B, true); + std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; + + // LINE_FIT CPU + Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; + + std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; + std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; +} + +int main (int argc, char * argv[]) { + testFit(); + return 0; +} + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h index e22fb5cfbf59b..79bb128eeec8a 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h +++ b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h @@ -5,14 +5,10 @@ #include #include -#ifndef TEST_DEBUG -#define TEST_DEBUG 0 -#endif - template __host__ __device__ void printIt(C * m) { -#if TEST_DEBUG +#ifdef TEST_DEBUG printf("\nMatrix %dx%d\n", (int)m->rows(), (int)m->cols()); for (u_int r = 0; r < m->rows(); ++r) { for (u_int c = 0; c < m->cols(); ++c) { @@ -22,8 +18,8 @@ void printIt(C * m) { #endif } -template -bool isEqualFuzzy(C a, C b, double epsilon = 1e-6) { +template +bool isEqualFuzzy(C1 a, C2 b, double epsilon = 1e-6) { for (unsigned int i = 0; i < a.rows(); ++i) { for (unsigned int j = 0; j < a.cols(); ++j) { assert(std::abs(a(i,j)-b(i,j)) @@ -37,6 +33,7 @@ bool isEqualFuzzy(double a, double b, double epsilon=1e-6) { return std::abs(a-b) < std::min(std::abs(a), std::abs(b))*epsilon; } + template void fillMatrix(T & t) { std::random_device rd; diff --git a/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h new file mode 100644 index 0000000000000..fa538256ed010 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h @@ 
-0,0 +1,128 @@ +#ifndef RecoPixelVertexingPixelTripletsCircleEq_H +#define RecoPixelVertexingPixelTripletsCircleEq_H +/** +| 1) circle is parameterized as: | +| C*[(X-Xp)**2+(Y-Yp)**2] - 2*alpha*(X-Xp) - 2*beta*(Y-Yp) = 0 | +| Xp,Yp is a point on the track; | +| C = 1/r0 is the curvature ( sign of C is charge of particle ); | +| alpha & beta are the direction cosines of the radial vector at Xp,Yp | +| i.e. alpha = C*(X0-Xp), | +| beta = C*(Y0-Yp), | +| where center of circle is at X0,Y0. | +| | +| Slope dy/dx of tangent at Xp,Yp is -alpha/beta. | +| 2) the z dimension of the helix is parameterized by gamma = dZ/dSperp | +| this is also the tangent of the pitch angle of the helix. | +| with this parameterization, (alpha,beta,gamma) rotate like a vector. | +| 3) For tracks going inward at (Xp,Yp), C, alpha, beta, and gamma change sign| +| +*/ + +#include + +template +class CircleEq { + +public: + + CircleEq(){} + + constexpr CircleEq(T x1, T y1, + T x2, T y2, + T x3, T y3) { + compute(x1,y1,x2,y2,x3,y3); + } + + constexpr void compute(T x1, T y1, + T x2, T y2, + T x3, T y3); + + // dca to origin divided by curvature + constexpr T dca0() const { + auto x = m_c*m_xp + m_alpha; + auto y = m_c*m_yp + m_beta; + return std::sqrt(x*x+y*y) - T(1); + } + + // dca to given point (divided by curvature) + constexpr T dca(T x, T y) const { + x = m_c*(m_xp-x) + m_alpha; + y = m_c*(m_yp-y) + m_beta; + return std::sqrt(x*x+y*y) - T(1); + + } + + // curvature + constexpr auto curvature() const { return m_c;} + + + // alpha and beta + constexpr std::pair cosdir() const { + return std::make_pair(m_alpha, m_beta); + } + + + // alpha and beta af given point + constexpr std::pair cosdir(T x, T y) const { + return std::make_pair(m_alpha - m_c*(x-m_xp), m_beta - m_c*(y-m_yp)); + } + + // center + constexpr std::pair center() const { + return std::make_pair(m_xp + m_alpha/m_c, m_yp + m_beta/m_c); + } + + constexpr auto radius() const { return T(1)/m_c;} + + T m_xp=0; + T m_yp=0; + T m_c=0; + T m_alpha=0; + T m_beta=0; + +}; + + +template +constexpr void CircleEq::compute(T x1, T y1, + T x2, T y2, + T x3, T y3) { + bool noflip = std::abs(x3-x1) < std::abs(y3-y1); + + auto x1p = noflip ? x1-x2 : y1-y2; + auto y1p = noflip ? y1-y2 : x1-x2; + auto d12 = x1p*x1p + y1p*y1p; + auto x3p = noflip ? x3-x2 : y3-y2; + auto y3p = noflip ? y3-y2 : x3-x2; + auto d32 = x3p*x3p + y3p*y3p; + + auto num = x1p*y3p-y1p*x3p; // num also gives correct sign for CT + auto det = d12*y3p-d32*y1p; + + /* + auto ct = num/det; + auto sn = det>0 ? T(1.) : T(-1.); + auto st2 = (d12*x3p-d32*x1p)/det; + auto seq = T(1.) +st2*st2; + auto al2 = sn/std::sqrt(seq); + auto be2 = -st2*al2; + ct *= T(2.)*al2; + */ + + auto st2 = (d12*x3p-d32*x1p); + auto seq = det*det +st2*st2; + auto al2 = T(1.)/std::sqrt(seq); + auto be2 = -st2*al2; + auto ct = T(2.)*num*al2; + al2 *=det; + + m_xp = x2; + m_yp = y2; + m_c = noflip ? ct : -ct; + m_alpha = noflip ? al2 : -be2; + m_beta = noflip ? 
be2 : -al2; + +} + +#endif + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 77cb6c4da68a4..3c8397cf572f6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -11,7 +11,8 @@ - + + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h new file mode 100644 index 0000000000000..942404a9313e3 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -0,0 +1,34 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h" + + +namespace CAConstants { + + // constants + constexpr uint32_t maxNumberOfQuadruplets() { return 10000; } + constexpr uint32_t maxCellsPerHit() { return 128; } + constexpr uint32_t maxNumberOfLayerPairs() { return 13; } + constexpr uint32_t maxNumberOfLayers() { return 10; } + constexpr uint32_t maxNumberOfDoublets() { return 262144; } + constexpr uint32_t maxTuples() { return 10000;} + + // types + using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct + using tindex_type = uint16_t; // for tuples + using OuterHitOfCell = GPU::VecArray< uint32_t, maxCellsPerHit()>; + using TuplesContainer = OneToManyAssoc; + using HitToTuple = OneToManyAssoc; // 3.5 should be enough + +} + + + +#endif + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 43fcd88fa30de..dbd4eecbaab3c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -10,19 +10,25 @@ #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -struct Quadruplet { - using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; - hindex_type hitId[4]; -}; +#include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" class GPUCACell { public: + static constexpr int maxCellsPerHit = 128; // was 256 + using OuterHitOfCell = GPU::VecArray< unsigned int, maxCellsPerHit>; + + using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + using TmpTuple = GPU::VecArray; + + using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; + GPUCACell() = default; #ifdef __CUDACC__ @@ -39,6 +45,7 @@ class GPUCACell { theInnerZ = __ldg(hh.zg_d+innerHitId); theInnerR = __ldg(hh.rg_d+innerHitId); theOuterNeighbors.reset(); + theTracks.reset(); } __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); } @@ -50,6 +57,9 @@ class GPUCACell { __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; } __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); } + __device__ __forceinline__ 
float get_inner_detId(Hits const & hh) const { return __ldg(hh.detInd_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_detId(Hits const & hh) const { return __ldg(hh.detInd_d+theOuterHitId); } + constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } @@ -69,10 +79,9 @@ class GPUCACell { __device__ bool check_alignment(Hits const & hh, - GPUCACell const & otherCell, const float ptmin, - const float region_origin_x, const float region_origin_y, - const float region_origin_radius, const float thetaCut, - const float phiCut, const float hardPtCut) const + GPUCACell const & otherCell, + const float ptmin, + const float hardCurvCut) const { auto ri = get_inner_r(hh); auto zi = get_inner_z(hh); @@ -82,11 +91,9 @@ class GPUCACell { auto r1 = otherCell.get_inner_r(hh); auto z1 = otherCell.get_inner_z(hh); - bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, thetaCut); - return (aligned && - haveSimilarCurvature(hh, otherCell, ptmin, region_origin_x, - region_origin_y, region_origin_radius, phiCut, - hardPtCut)); + bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, 0.003f); // 2.f*thetaCut); // FIXME tune cuts + return (aligned && dcaCut(hh, otherCell, otherCell.get_inner_detId(hh)<96 ? 0.15f : 0.25f, hardCurvCut)); // FIXME tune cuts + // region_origin_radius_plus_tolerance, hardCurvCut)); } __device__ __forceinline__ @@ -106,13 +113,12 @@ class GPUCACell { return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - __device__ + + __device__ bool - haveSimilarCurvature(Hits const & hh, GPUCACell const & otherCell, - const float ptmin, const float region_origin_x, - const float region_origin_y, - const float region_origin_radius, const float phiCut, - const float hardPtCut) const { + dcaCut(Hits const & hh, GPUCACell const & otherCell, + const float region_origin_radius_plus_tolerance, + const float maxCurv) const { auto x1 = otherCell.get_inner_x(hh); auto y1 = otherCell.get_inner_y(hh); @@ -123,69 +129,12 @@ class GPUCACell { auto x3 = get_outer_x(hh); auto y3 = get_outer_y(hh); - float distance_13_squared = (x1 - x3) * (x1 - x3) + (y1 - y3) * (y1 - y3); - float tan_12_13_half_mul_distance_13_squared = - fabs(y1 * (x2 - x3) + y2 * (x3 - x1) + y3 * (x1 - x2)); - // high pt : just straight - if (tan_12_13_half_mul_distance_13_squared * ptmin <= - 1.0e-4f * distance_13_squared) { - - float distance_3_beamspot_squared = - (x3 - region_origin_x) * (x3 - region_origin_x) + - (y3 - region_origin_y) * (y3 - region_origin_y); - - float dot_bs3_13 = ((x1 - x3) * (region_origin_x - x3) + - (y1 - y3) * (region_origin_y - y3)); - float proj_bs3_on_13_squared = - dot_bs3_13 * dot_bs3_13 / distance_13_squared; - - float distance_13_beamspot_squared = - distance_3_beamspot_squared - proj_bs3_on_13_squared; - - return distance_13_beamspot_squared < - (region_origin_radius + phiCut) * (region_origin_radius + phiCut); - } - - // 87 cm/GeV = 1/(3.8T * 0.3) + CircleEq eq(x1,y1,x2,y2,x3,y3); - // take less than radius given by the hardPtCut and reject everything below - float minRadius = hardPtCut * 87.f; // FIXME move out and use real MagField + if (eq.curvature() > maxCurv) return false; - auto det = (x1 - x2) * (y2 - y3) - (x2 - x3) * (y1 - y2); + return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance*std::abs(eq.curvature()); - auto offset = x2 * x2 + y2 * y2; - - auto bc = (x1 * x1 + y1 * y1 - offset) * 0.5f; - - auto cd = (offset - x3 * x3 - y3 * y3) * 0.5f; - - auto idet = 1.f / det; - - auto x_center = (bc 
* (y2 - y3) - cd * (y1 - y2)) * idet; - auto y_center = (cd * (x1 - x2) - bc * (x2 - x3)) * idet; - - auto radius = std::sqrt((x2 - x_center) * (x2 - x_center) + - (y2 - y_center) * (y2 - y_center)); - - if (radius < minRadius) - return false; // hard cut on pt - - auto centers_distance_squared = - (x_center - region_origin_x) * (x_center - region_origin_x) + - (y_center - region_origin_y) * (y_center - region_origin_y); - auto region_origin_radius_plus_tolerance = region_origin_radius + phiCut; - auto minimumOfIntersectionRange = - (radius - region_origin_radius_plus_tolerance) * - (radius - region_origin_radius_plus_tolerance); - - if (centers_distance_squared >= minimumOfIntersectionRange) { - auto maximumOfIntersectionRange = - (radius + region_origin_radius_plus_tolerance) * - (radius + region_origin_radius_plus_tolerance); - return centers_distance_squared <= maximumOfIntersectionRange; - } - - return false; } // trying to free the track building process from hardcoded layers, leaving @@ -195,9 +144,10 @@ class GPUCACell { __device__ inline void find_ntuplets( - GPUCACell const * __restrict__ cells, - GPU::SimpleVector *foundNtuplets, - GPU::VecArray &tmpNtuplet, + GPUCACell * __restrict__ cells, + TuplesOnGPU::Container & foundNtuplets, + AtomicPairCounter & apc, + TmpTuple & tmpNtuplet, const unsigned int minHitsPerNtuplet) const { // the building process for a track ends if: @@ -206,34 +156,35 @@ class GPUCACell { // the ntuplets is then saved if the number of hits it contains is greater // than a threshold - tmpNtuplet.push_back_unsafe(theInnerHitId); - assert(tmpNtuplet.size()<=3); + tmpNtuplet.push_back_unsafe(theDoubletId); + assert(tmpNtuplet.size()<=4); - if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet-1) { - Quadruplet tmpQuadruplet; - for (unsigned int i = 0; i < minHitsPerNtuplet-1; ++i) { - tmpQuadruplet.hitId[i] = tmpNtuplet[i]; - } - tmpQuadruplet.hitId[minHitsPerNtuplet-1] = theOuterHitId; - foundNtuplets->push_back(tmpQuadruplet); - } - else { + if(theOuterNeighbors.size()>0) { // continue for (int j = 0; j < theOuterNeighbors.size(); ++j) { auto otherCell = theOuterNeighbors[j]; - cells[otherCell].find_ntuplets(cells, foundNtuplets, tmpNtuplet, + cells[otherCell].find_ntuplets(cells, foundNtuplets, apc, tmpNtuplet, minHitsPerNtuplet); } + } else { // if long enough save... 
+ if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet-1) { + hindex_type hits[6]; auto nh=0U; + for (auto c : tmpNtuplet) hits[nh++] = cells[c].theInnerHitId; + hits[nh] = theOuterHitId; + uint16_t it = foundNtuplets.bulkFill(apc,hits,tmpNtuplet.size()+1); + for (auto c : tmpNtuplet) cells[c].theTracks.push_back(it); + } } tmpNtuplet.pop_back(); - assert(tmpNtuplet.size() < 3); + assert(tmpNtuplet.size() < 4); } #endif // __CUDACC__ - GPU::VecArray< unsigned int, 40> theOuterNeighbors; + GPU::VecArray< uint32_t, 36> theOuterNeighbors; + GPU::VecArray< uint16_t, 42> theTracks; - int theDoubletId; - int theLayerPairId; + int32_t theDoubletId; + int32_t theLayerPairId; private: float theInnerZ; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc new file mode 100644 index 0000000000000..fe95e10a48b5a --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -0,0 +1,35 @@ +#include "RiemannFitOnGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +void RiemannFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const * tuples, Rfit::helix_fit * helix_fit_results) { + + tuples_d = tuples; + helix_fit_results_d = helix_fit_results; + + assert(tuples_d); assert(helix_fit_results_d); + + cudaCheck(cudaMalloc(&hitsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); + cudaCheck(cudaMemset(hitsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); + + cudaCheck(cudaMalloc(&hits_geGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f))); + cudaCheck(cudaMemset(hits_geGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f))); + + cudaCheck(cudaMalloc(&fast_fit_resultsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d))); + cudaCheck(cudaMemset(fast_fit_resultsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d))); + + cudaCheck(cudaMalloc(&circle_fit_resultsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit))); + cudaCheck(cudaMemset(circle_fit_resultsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit))); + +} + +void RiemannFitOnGPU::deallocateOnGPU() { + + cudaFree(hitsGPU_); + cudaFree(hits_geGPU_); + cudaFree(fast_fit_resultsGPU_); + cudaFree(circle_fit_resultsGPU_); + +} + + + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu new file mode 100644 index 0000000000000..1bcfb847d2ae8 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -0,0 +1,195 @@ +// +// Author: Felice Pantaleo, CERN +// + +#include "RiemannFitOnGPU.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" + + +using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + +using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; +using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; + +using namespace Eigen; + +__global__ +void kernelFastFitAllHits(TuplesOnGPU::Container const * __restrict__ foundNtuplets, + HitsOnGPU const * __restrict__ hhp, + int hits_in_fit, + double * __restrict__ phits, + float * __restrict__ phits_ge, + double * __restrict__ 
pfast_fit, + uint32_t offset) +{ + + assert(hits_in_fit==4); // FixMe later template + + assert(pfast_fit); assert(foundNtuplets); + + auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); + auto helix_start = local_start + offset; + + if (helix_start>=foundNtuplets->nbins()) return; + if (foundNtuplets->size(helix_start)begin(helix_start); + for (unsigned int i = 0; i < hits_in_fit; ++i) { + auto hit = hitId[i]; + // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); + float ge[6]; + hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge); + // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + + hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit]; + hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; + } + Rfit::Fast_fit(hits,fast_fit); + + // no NaN here.... + assert(fast_fit(0)==fast_fit(0)); + assert(fast_fit(1)==fast_fit(1)); + assert(fast_fit(2)==fast_fit(2)); + assert(fast_fit(3)==fast_fit(3)); + +} + +__global__ +void kernelCircleFitAllHits(TuplesOnGPU::Container const * __restrict__ foundNtuplets, + int hits_in_fit, + double B, + double * __restrict__ phits, + float * __restrict__ phits_ge, + double * __restrict__ pfast_fit_input, + Rfit::circle_fit *circle_fit, + uint32_t offset) +{ + assert(circle_fit); + + auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); + auto helix_start = local_start + offset; + + if (helix_start>=foundNtuplets->nbins()) return; + if (foundNtuplets->size(helix_start) rad = (hits.block(0, 0, 2, n).colwise().norm()); + + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd<4>::Zero(); + Rfit::loadCovariance2D(hits_ge,hits_cov); + + circle_fit[local_start] = + Rfit::Circle_fit(hits.block(0, 0, 2, n), + hits_cov, + fast_fit, rad, B, true); + +#ifdef GPU_DEBUG +// printf("kernelCircleFitAllHits circle.par(0,1,2): %d %f,%f,%f\n", helix_start, +// circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); +#endif +} + +__global__ +void kernelLineFitAllHits(TuplesOnGPU::Container const * __restrict__ foundNtuplets, + int hits_in_fit, + double B, + Rfit::helix_fit *results, + double * __restrict__ phits, + float * __restrict__ phits_ge, + double * __restrict__ pfast_fit, + Rfit::circle_fit * __restrict__ circle_fit, + uint32_t offset) +{ + + assert(results); assert(circle_fit); + + auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); + auto helix_start = local_start + offset; + + if (helix_start>=foundNtuplets->nbins()) return; + if (foundNtuplets->size(helix_start)>>( + tuples_d, hh.gpu_d, 4, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFitAllHits<<>>( + tuples_d, 4, bField_, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + cudaCheck(cudaGetLastError()); + + + kernelLineFitAllHits<<>>( + tuples_d, 4, bField_, helix_fit_results_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h new file mode 100644 index 0000000000000..fac88ac2c2bd4 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -0,0 +1,60 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_RiemannFitOnGPU_h +#define RecoPixelVertexing_PixelTrackFitting_plugins_RiemannFitOnGPU_h + +#include 
"RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" +#include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" + +namespace siPixelRecHitsHeterogeneousProduct { + struct HitsOnCPU; +} + +namespace Rfit { + constexpr uint32_t maxNumberOfConcurrentFits() { return 2*1024;} + constexpr uint32_t stride() { return maxNumberOfConcurrentFits();} + using Matrix3x4d = Eigen::Matrix; + using Map3x4d = Eigen::Map >; + using Matrix6x4f = Eigen::Matrix; + using Map6x4f = Eigen::Map >; + using Map4d = Eigen::Map >; + +} + + +class RiemannFitOnGPU { +public: + + using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + + using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; + + RiemannFitOnGPU() = default; + ~RiemannFitOnGPU() { deallocateOnGPU();} + + void setBField(double bField) { bField_ = bField;} + void launchKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + + void allocateOnGPU(TuplesOnGPU::Container const * tuples, Rfit::helix_fit * helix_fit_results); + void deallocateOnGPU(); + + +private: + + static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); + + // fowarded + TuplesOnGPU::Container const * tuples_d = nullptr; + double bField_; + Rfit::helix_fit * helix_fit_results_d = nullptr; + + + + // Riemann Fit internals + double *hitsGPU_ = nullptr; + float *hits_geGPU_ = nullptr; + double *fast_fit_resultsGPU_ = nullptr; + Rfit::circle_fit *circle_fit_resultsGPU_ = nullptr; + +}; + +#endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h new file mode 100644 index 0000000000000..717cbf777fcdb --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -0,0 +1,93 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h +#define RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h + +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "DataFormats/Math/interface/approx_atan2.h" +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" + +#include "GPUCACell.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" + +namespace gpuPixelDoublets { + +// __device__ +// __forceinline__ + __global__ + void fishbone( + GPUCACell::Hits const * __restrict__ hhp, + GPUCACell * cells, uint32_t const * __restrict__ nCells, + GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell, + uint32_t nHits, + uint32_t stride, bool checkTrack) { + + constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; + + + auto const & hh = *hhp; + uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d; + auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);}; + + auto ldx = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = ldx/stride; + auto first = ldx - idx*stride; + assert(first=nHits) return; + auto const & vc = isOuterHitOfCell[idx]; + auto s = vc.size(); + if (s<2) return; + // if alligned kill one of the two. 
+ auto const & c0 = cells[vc[0]]; + auto xo = c0.get_outer_x(hh); + auto yo = c0.get_outer_y(hh); + auto zo = c0.get_outer_z(hh); + float x[maxCellsPerHit], y[maxCellsPerHit],z[maxCellsPerHit], n[maxCellsPerHit]; + uint16_t d[maxCellsPerHit]; // uint8_t l[maxCellsPerHit]; + uint32_t cc[maxCellsPerHit]; + auto sg=0; + for (uint32_t ic=0; ic= 0.99999f*n[ic]*n[jc]) { + // alligned: kill farthest (prefer consecutive layers) + if (n[ic]>n[jc]) { + ci.theDoubletId=-1; + break; + } else { + cj.theDoubletId=-1; + } + } + } //cj + } // ci + } + +} + +#endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index d4b44f64573c6..02a175fcc2903 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -13,10 +13,11 @@ #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "GPUCACell.h" +#include "CAConstants.h" namespace gpuPixelDoublets { - constexpr uint32_t MaxNumOfDoublets = 1024*1024*256; + constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant template __device__ @@ -29,7 +30,7 @@ namespace gpuPixelDoublets { Hist const & __restrict__ hist, uint32_t const * __restrict__ offsets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, - GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, + GPUCACell::OuterHitOfCell * isOuterHitOfCell, int16_t const * __restrict__ phicuts, float const * __restrict__ minz, float const * __restrict__ maxz, @@ -122,7 +123,8 @@ namespace gpuPixelDoublets { if (std::min(std::abs(int16_t(iphi[oi]-mep)), std::abs(int16_t(mep-iphi[oi]))) > iphicut) continue; if (z0cutoff(oi) || ptcut(oi)) continue; - auto ind = atomicInc(nCells, MaxNumOfDoublets); + auto ind = atomicAdd(nCells, 1); + if (ind>=MaxNumOfDoublets) {atomicSub(nCells, 1); break; } // move to SimpleVector?? // int layerPairId, int doubletId, int innerHitId, int outerHitId) cells[ind].init(hh, pairLayerId, ind, i, oi); isOuterHitOfCell[oi].push_back(ind); @@ -130,9 +132,10 @@ namespace gpuPixelDoublets { ++tot; } } +#ifdef GPU_DEBUG if (tooMany > 0) - printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot); - + printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); +#endif } // loop in block... 
} @@ -144,7 +147,7 @@ namespace gpuPixelDoublets { void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, - GPU::VecArray * isOuterHitOfCell) + GPUCACell::OuterHitOfCell * isOuterHitOfCell) { constexpr int nPairs = 13; constexpr const uint8_t layerPairs[2*nPairs] = { diff --git a/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py b/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py index 8497eba9f759f..c72c07ae5a721 100644 --- a/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py +++ b/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py @@ -2,7 +2,3 @@ from RecoPixelVertexing.PixelTriplets.caHitQuadrupletDefaultEDProducer_cfi import caHitQuadrupletDefaultEDProducer as _caHitQuadrupletDefaultEDProducer caHitQuadrupletEDProducer = _caHitQuadrupletDefaultEDProducer.clone() - -from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoPixelVertexing.PixelTriplets.caHitQuadrupletHeterogeneousEDProducer_cfi import caHitQuadrupletHeterogeneousEDProducer as _caHitQuadrupletHeterogeneousEDProducer -gpu.toReplaceWith(caHitQuadrupletEDProducer, _caHitQuadrupletHeterogeneousEDProducer) diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 1de3629887ec9..9f5d10ad020e9 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -21,3 +21,4 @@ + diff --git a/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp new file mode 100644 index 0000000000000..cbbcea96d1ee8 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp @@ -0,0 +1,99 @@ +#include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" +#include + + +struct OriCircle { + + using T = float; + + float radius=0; + float x_center=0; + float y_center=0; + + + constexpr OriCircle(T x1, T y1, + T x2, T y2, + T x3, T y3) { + compute(x1,y1,x2,y2,x3,y3); + } + + // dca to origin + constexpr T dca0() const { + return std::sqrt(x_center*x_center + y_center*y_center) - radius; + } + + // dca to given point + constexpr T dca(T x, T y) const { + x-=x_center; + y-=y_center; + return std::sqrt(x*x+y*y)-radius; + } + + + constexpr void compute(T x1, T y1, + T x2, T y2, + T x3, T y3) { + + auto det = (x1 - x2) * (y2 - y3) - (x2 - x3) * (y1 - y2); + + auto offset = x2 * x2 + y2 * y2; + + auto bc = (x1 * x1 + y1 * y1 - offset) * 0.5f; + + auto cd = (offset - x3 * x3 - y3 * y3) * 0.5f; + + auto idet = 1.f / det; + + x_center = (bc * (y2 - y3) - cd * (y1 - y2)) * idet; + y_center = (cd * (x1 - x2) - bc * (x2 - x3)) * idet; + + radius = std::sqrt((x2 - x_center) * (x2 - x_center) + + (y2 - y_center) * (y2 - y_center)); + + } +}; + + +#include + +template +bool equal(T a, T b) { + // return float(a-b)==0; + return std::abs(float(a-b)) < std::abs(0.01f*a); +} + + + +int main() { + + float r1=4, r2=8, r3=15; + for(float phi=-3; phi<3.1; phi+=0.5) { + float x1=r1*cos(phi); + float x2=r2*cos(phi); + float y1=r1*sin(phi); + float y2=r2*sin(phi); + for(float phi3=phi-0.31; phi3 eq(x1,y1,x2,y2,x3,y3); + // std::cout << "r " << ori.radius <<' '<< eq.radius() << std::endl; + assert( equal(ori.radius, std::abs(eq.radius())) ); + auto c = eq.center(); + auto dir = eq.cosdir(); + assert (equal(1.f,dir.first*dir.first+dir.second*dir.second)); + assert( equal(ori.x_center,c.first) ); + assert( 
equal(ori.y_center,c.second) ); + // std::cout << "dca " << ori.dca0() <<' '<< eq.radius()*eq.dca0() << std::endl; + assert( equal( std::abs(ori.dca0()), std::abs(eq.radius()*eq.dca0())) ); + // std::cout << "dca " << ori.dca(1.,1.) <<' '<< eq.radius()*eq.dca(1.,1.) << std::endl; + assert( equal( std::abs(ori.dca(1.,1.)), std::abs(eq.radius()*eq.dca(1.,1.))) ); + + } + } + + + + return 0; +} From 3deb206cf6f3a2457a57952c931478efe7f5871c Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 17 Jan 2019 08:31:51 +0100 Subject: [PATCH 036/102] Remove unnecessary pragmas (cms-patatrack#249) `#pragma unroll` is not supported by GCC, leading to compilation warnings in host code. GCC 8 supports `#pragma GCC unroll N` which could be used instead. However, benchmarking on a V100 with and without the `#pragma unroll` there is no observable difference, so it is simpler to remove them. --- RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 3e93aab13d00d..35cf3d3256b6b 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -827,11 +827,9 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; Matrix6d E; // cov matrix of the 6 independent elements of A - #pragma unroll for (u_int a = 0; a < 6; ++a) { const u_int i = nu[a][0], j = nu[a][1]; - #pragma unroll for (u_int b = a; b < 6; ++b) { const u_int k = nu[b][0], l = nu[b][1]; @@ -875,7 +873,6 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, printIt(&E, "circle_fit - E:"); Eigen::Matrix J2; // Jacobian of min_eigen() (numerically computed) - #pragma unroll for (u_int a = 0; a < 6; ++a) { const u_int i = nu[a][0], j = nu[a][1]; From 0f2c2e011175caf755f866392c84d86ee184e777 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 17 Jan 2019 15:50:11 +0100 Subject: [PATCH 037/102] Skip CUDA-related tests if no GPU is present (cms-patatrack#252) Make unit tests that require a CUDA device skip the test and exit succesfully if the CUDA runtime is not available, or no CUDA devices are available. 
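A minimal sketch of the kind of guard this relies on, assuming only the public CUDA runtime API; the real exitSansCUDADevices() helper in HeterogeneousCore/CUDAUtilities may differ in detail:

#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

// Skip (exit successfully) rather than fail when the CUDA runtime reports
// no usable device; illustrative stand-in for the exitSansCUDADevices() helper.
void exitSansCUDADevices() {
  int devices = 0;
  cudaError_t status = cudaGetDeviceCount(&devices);
  if (status != cudaSuccess || devices == 0) {
    std::cerr << "No CUDA devices available, the test will be skipped." << std::endl;
    exit(EXIT_SUCCESS);
  }
}

Each GPU test then calls this helper as the first statement of main(), as the diffs below show.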
--- .../PixelTrackFitting/test/BuildFile.xml | 59 ++++++++++--------- .../PixelTrackFitting/test/testEigenGPU.cu | 7 ++- .../test/testEigenGPUNoFit.cu | 3 + 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index b4b5e3a335bcb..e6c67be031342 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -1,21 +1,20 @@ - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + - @@ -32,28 +31,34 @@ + + + + - - - - + + + + + - - - - + + + + + - - - + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index a60eeda935d79..3917de89a8185 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -3,11 +3,10 @@ #include #include - -#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" - #include "test_common.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" using namespace Eigen; @@ -216,6 +215,8 @@ void testFit() { } int main (int argc, char * argv[]) { + exitSansCUDADevices(); + testFit(); std::cout << "TEST FIT, NO ERRORS" << std::endl; diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index ead2e3cc00504..17413f4ef3e2a 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -5,6 +5,8 @@ #include "test_common.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + using namespace Eigen; __host__ __device__ void eigenValues(Matrix3d * m, Eigen::SelfAdjointEigenSolver::RealVectorType * ret) { @@ -183,6 +185,7 @@ std::cout << "*************************\n\n" << std::endl; int main (int argc, char * argv[]) { + exitSansCUDADevices(); testEigenvalues(); testInverse3x3(); From 64b28b4076f5f7534c612b712ddc0dbfe1f2a2a6 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 24 Jan 2019 14:21:32 +0100 Subject: [PATCH 038/102] Speed up the doublet finder (cms-patatrack#260) Introduce the inner loop parallelization in the doublet finder using the stride pattern already used in the "fishbone", and make use of a 2D grid instead of a hand-made stride. 
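For reference, a standalone sketch of the access pattern (illustrative names and data, not the actual doublet-finder kernel): the y dimension of the 2D grid indexes the outer hits, while the threads along x cooperate on each hit's inner loop with a fixed stride.

#include <cuda_runtime.h>

// Illustrative kernel: blockIdx.y/threadIdx.y pick one outer hit,
// threadIdx.x strides over the candidates associated to that hit.
__global__ void strideOverCandidates(const int* counts, int nHits) {
  int first = threadIdx.x;                           // inner-loop offset
  int stride = blockDim.x;                           // inner-loop stride
  int idy = blockIdx.y * blockDim.y + threadIdx.y;   // outer index, one hit per y
  if (idy >= nHits) return;
  for (int ic = first; ic < counts[idy]; ic += stride) {
    // work on candidate ic of hit idy (omitted)
  }
}

int main() {
  const int nHits = 8;
  int counts[nHits] = {3, 5, 2, 7, 1, 4, 6, 2};
  int* counts_d = nullptr;
  cudaMalloc(&counts_d, sizeof(counts));
  cudaMemcpy(counts_d, counts, sizeof(counts), cudaMemcpyHostToDevice);
  dim3 threads(16, 4);                               // 16-way inner stride, 4 hits per block
  dim3 blocks(1, (nHits + threads.y - 1) / threads.y);
  strideOverCandidates<<<blocks, threads>>>(counts_d, nHits);
  cudaDeviceSynchronize();
  cudaFree(counts_d);
  return 0;
}

This keeps all threads of a block busy even when a single hit has many candidate cells, which is what the changes below exploit in both the fishbone and the doublet finder.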
--- .../PixelTriplets/plugins/gpuFishbone.h | 19 +++++++++---------- .../PixelTriplets/plugins/gpuPixelDoublets.h | 12 ++++++++---- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 717cbf777fcdb..796241eaf50ff 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -26,7 +26,7 @@ namespace gpuPixelDoublets { GPUCACell * cells, uint32_t const * __restrict__ nCells, GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell, uint32_t nHits, - uint32_t stride, bool checkTrack) { + bool checkTrack) { constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; @@ -35,13 +35,12 @@ namespace gpuPixelDoublets { uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d; auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);}; - auto ldx = threadIdx.x + blockIdx.x * blockDim.x; - auto idx = ldx/stride; - auto first = ldx - idx*stride; - assert(first=nHits) return; - auto const & vc = isOuterHitOfCell[idx]; + if (idy>=nHits) return; + auto const & vc = isOuterHitOfCell[idy]; auto s = vc.size(); if (s<2) return; // if alligned kill one of the two. @@ -66,8 +65,8 @@ namespace gpuPixelDoublets { ++sg; } if (sg<2) return; - // here we parallelize - for (uint32_t ic=first; ic= innerLayerCumulativeSize[pairLayerId++]); @@ -115,7 +118,8 @@ namespace gpuPixelDoublets { nmin += hist.size(kk+hoff); auto const * __restrict__ p = hist.begin(kk+hoff); auto const * __restrict__ e = hist.end(kk+hoff); - for (;p < e; ++p) { + p+=first; + for (;p < e; p+=stride) { auto oi=__ldg(p); assert(oi>=offsets[outer]); assert(oi Date: Mon, 28 Jan 2019 11:37:35 +0100 Subject: [PATCH 039/102] Synchronise with CMSSW_10_5_0_pre1 --- Validation/RecoTrack/python/TrackValidation_cff.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index bc84c87cf191f..b15a3489bb918 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -836,7 +836,6 @@ def _uniqueFirstLayers(layerList): ## customization for timing from Configuration.Eras.Modifier_phase2_timing_layer_cff import phase2_timing_layer phase2_timing_layer.toModify( generalTracksFromPV, - vertexTag = cms.InputTag('offlinePrimaryVertices4D'), timesTag = cms.InputTag('trackTimeValueMapProducer:generalTracksConfigurableFlatResolutionModel'), timeResosTag = cms.InputTag('trackTimeValueMapProducer:generalTracksConfigurableFlatResolutionModelResolution'), nSigmaDtVertex = cms.double(3) ) @@ -850,3 +849,9 @@ def _uniqueFirstLayers(layerList): label_vertex = cms.untracked.InputTag('offlinePrimaryVertices4D') ) phase2_timing_layer.toModify( trackValidatorGsfTracks, label_vertex = cms.untracked.InputTag('offlinePrimaryVertices4D') ) + +from Configuration.Eras.Modifier_phase2_timing_layer_tile_cff import phase2_timing_layer_tile +from Configuration.Eras.Modifier_phase2_timing_layer_bar_cff import phase2_timing_layer_bar +(phase2_timing_layer_tile | phase2_timing_layer_bar).toModify( generalTracksFromPV, + timesTag = cms.InputTag('tofPID:t0'), + timeResosTag = cms.InputTag('tofPID:sigmat0') ) From 96b2f736c6992a9c85fef02067bd9922b48c8d69 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 13 Mar 2019 10:04:32 -0500 Subject: [PATCH 040/102] Next 
prototype of the framework integration (cms-patatrack#100) Provide a mechanism for a chain of modules to share a resource, that can be e.g. CUDA device memory or a CUDA stream. Minimize data movements between the CPU and the device, and support multiple devices. Allow the same job configuration to be used on all hardware combinations. See HeterogeneousCore/CUDACore/README.md for a more detailed description and examples. --- .../Configuration/python/customizePixelTracksForProfiling.py | 2 -- RecoPixelVertexing/PixelTriplets/test/BuildFile.xml | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 15224adb78cc3..58935e9a6991c 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -21,7 +21,6 @@ def customizePixelTracksForProfilingDisableConversion(process): process = customizePixelTracksForProfiling(process) # Disable conversions to legacy - process.siPixelClustersPreSplitting.gpuEnableConversion = False process.siPixelRecHitsPreSplitting.gpuEnableConversion = False process.pixelTracksHitQuadruplets.gpuEnableConversion = False process.pixelTracks.gpuEnableConversion = False @@ -33,7 +32,6 @@ def customizePixelTracksForProfilingDisableTransfer(process): process = customizePixelTracksForProfilingDisableConversion(process) # Disable "unnecessary" transfers to CPU - process.siPixelClustersPreSplitting.gpuEnableTransfer = False process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False process.pixelTracksHitQuadruplets.gpuEnableTransfer = False process.pixelVertices.gpuEnableTransfer = False diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 9f5d10ad020e9..767d140a5d5ed 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -18,6 +18,7 @@ + From 4ed9088812ecc334025d77f10f24aae65fb3b1a7 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 15 Mar 2019 09:26:12 -0500 Subject: [PATCH 041/102] Various updates to pixel track/vertex DQM and MTV (cms-patatrack#285) * Add DQM for pixel vertices * Add pT>0.9GeV pixel track collections to MTV * Add dzPV0p1, Pt0to1, Pt1 variants of pixel track DQM --- .../pixelTrackingEffFromHitPattern_cff.py | 5 +- .../python/pixelTracksMonitoring_cff.py | 93 +++++++++++++++---- .../RecoTrack/python/TrackValidation_cff.py | 14 ++- Validation/RecoTrack/python/plotting/html.py | 44 ++++++--- .../python/plotting/trackingPlots.py | 9 +- 5 files changed, 127 insertions(+), 38 deletions(-) diff --git a/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py b/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py index 15ceaf93ed20a..cff85e56d94f7 100644 --- a/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py +++ b/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py @@ -21,7 +21,10 @@ def _layers(suffix, quant, histoPostfix): ] pixelTrackingEffFromHitPattern = DQMEDHarvester("DQMGenericClient", - subDirs = cms.untracked.vstring("Tracking/PixelTrackParameters/HitEffFromHitPattern*"), + subDirs = cms.untracked.vstring("Tracking/PixelTrackParameters/pixelTracks/HitEffFromHitPattern*", + "Tracking/PixelTrackParameters/dzPV0p1/HitEffFromHitPattern*", + 
"Tracking/PixelTrackParameters/pt_0to1/HitEffFromHitPattern*", + "Tracking/PixelTrackParameters/pt_1/HitEffFromHitPattern*"), efficiency = cms.vstring( _layers("PU", "GoodNumVertices", "") + _layers("BX", "BX", "VsBX") + diff --git a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py index c91dc2b2730de..d5deba78b46c8 100644 --- a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py +++ b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py @@ -1,22 +1,77 @@ import FWCore.ParameterSet.Config as cms import DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi -pixelTracksMonitoring = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone() -pixelTracksMonitoring.FolderName = 'Tracking/PixelTrackParameters' -pixelTracksMonitoring.TrackProducer = 'pixelTracks' -pixelTracksMonitoring.allTrackProducer = 'pixelTracks' -pixelTracksMonitoring.beamSpot = 'offlineBeamSpot' -pixelTracksMonitoring.primaryVertex = 'pixelVertices' -pixelTracksMonitoring.pvNDOF = 1 -pixelTracksMonitoring.doAllPlots = True -pixelTracksMonitoring.doLumiAnalysis = True -pixelTracksMonitoring.doProfilesVsLS = True -pixelTracksMonitoring.doDCAPlots = True -pixelTracksMonitoring.doProfilesVsLS = True -pixelTracksMonitoring.doPlotsVsGoodPVtx = True -pixelTracksMonitoring.doEffFromHitPatternVsPU = False -pixelTracksMonitoring.doEffFromHitPatternVsBX = False -pixelTracksMonitoring.doEffFromHitPatternVsLUMI = False -pixelTracksMonitoring.doPlotsVsGoodPVtx = True -pixelTracksMonitoring.doPlotsVsLUMI = True -pixelTracksMonitoring.doPlotsVsBX = True +pixelTracksMonitor = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone() +pixelTracksMonitor.FolderName = 'Tracking/PixelTrackParameters/pixelTracks' +pixelTracksMonitor.TrackProducer = 'pixelTracks' +pixelTracksMonitor.allTrackProducer = 'pixelTracks' +pixelTracksMonitor.beamSpot = 'offlineBeamSpot' +pixelTracksMonitor.primaryVertex = 'pixelVertices' +pixelTracksMonitor.pvNDOF = 1 +pixelTracksMonitor.doAllPlots = True +pixelTracksMonitor.doLumiAnalysis = True +pixelTracksMonitor.doProfilesVsLS = True +pixelTracksMonitor.doDCAPlots = True +pixelTracksMonitor.doProfilesVsLS = True +pixelTracksMonitor.doPlotsVsGoodPVtx = True +pixelTracksMonitor.doEffFromHitPatternVsPU = False +pixelTracksMonitor.doEffFromHitPatternVsBX = False +pixelTracksMonitor.doEffFromHitPatternVsLUMI = False +pixelTracksMonitor.doPlotsVsGoodPVtx = True +pixelTracksMonitor.doPlotsVsLUMI = True +pixelTracksMonitor.doPlotsVsBX = True + +_trackSelector = cms.EDFilter('TrackSelector', + src = cms.InputTag('pixelTracks'), + cut = cms.string("") +) + +pixelTracksPt0to1 = _trackSelector.clone(cut = "pt >= 0 & pt < 1 ") +pixelTracksPt1 = _trackSelector.clone(cut = "pt >= 1 ") +from DQM.TrackingMonitorSource.TrackCollections2monitor_cff import highPurityPV0p1 as _highPurityPV0p1 +pixelTracksPV0p1 = _highPurityPV0p1.clone( + src = "pixelTracks", + quality = "", + vertexTag = "goodPixelVertices" +) + +pixelTracksMonitorPt0to1 = pixelTracksMonitor.clone( + TrackProducer = "pixelTracksPt0to1", + FolderName = "Tracking/PixelTrackParameters/pt_0to1" +) +pixelTracksMonitorPt1 = pixelTracksMonitor.clone( + TrackProducer = "pixelTracksPt1", + FolderName = "Tracking/PixelTrackParameters/pt_1" +) +pixelTracksMonitorPV0p1 = pixelTracksMonitor.clone( + TrackProducer = "pixelTracksPV0p1", + FolderName = "Tracking/PixelTrackParameters/dzPV0p1" +) + + +from 
CommonTools.ParticleFlow.goodOfflinePrimaryVertices_cfi import goodOfflinePrimaryVertices as _goodOfflinePrimaryVertices +goodPixelVertices = _goodOfflinePrimaryVertices.clone( + src = "pixelVertices", +) + +from DQM.TrackingMonitor.primaryVertexResolution_cfi import primaryVertexResolution as _primaryVertexResolution +pixelVertexResolution = _primaryVertexResolution.clone( + vertexSrc = "goodPixelVertices", + rootFolder = "OfflinePixelPV/Resolution", +) + +pixelTracksMonitoringTask = cms.Task( + goodPixelVertices, + pixelTracksPt0to1, + pixelTracksPt1, + pixelTracksPV0p1, +) + +pixelTracksMonitoring = cms.Sequence( + pixelTracksMonitor + + pixelTracksMonitorPt0to1 + + pixelTracksMonitorPt1 + + pixelTracksMonitorPV0p1 + + pixelVertexResolution, + pixelTracksMonitoringTask +) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index b15a3489bb918..fb5b1cdba04b2 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -756,15 +756,17 @@ def _uniqueFirstLayers(layerList): trackAssociation = "trackingParticlePixelTrackAsssociation" ) -pixelTracksFromPV = generalTracksFromPV.clone( +_pixelTracksCustom = dict( src = "pixelTracks", vertexTag = "pixelVertices", - quality = "undefQuality", ) +pixelTracksPt09 = generalTracksPt09.clone(quality = ["undefQuality"], **_pixelTracksCustom) +pixelTracksFromPV = generalTracksFromPV.clone(quality = "undefQuality", **_pixelTracksCustom) +pixelTracksFromPVPt09 = pixelTracksPt09.clone(src = "pixelTracksFromPV") trackValidatorPixelTrackingOnly = trackValidator.clone( dirName = "Tracking/PixelTrack/", - label = ["pixelTracks"], + label = ["pixelTracks", "pixelTracksPt09"], doResolutionPlotsForLabels = [], trackCollectionForDrCalculation = "pixelTracks", associators = ["trackingParticlePixelTrackAsssociation"], @@ -774,7 +776,7 @@ def _uniqueFirstLayers(layerList): ) trackValidatorFromPVPixelTrackingOnly = trackValidatorPixelTrackingOnly.clone( dirName = "Tracking/PixelTrackFromPV/", - label = ["pixelTracksFromPV"], + label = ["pixelTracksFromPV", "pixelTracksFromPVPt09"], label_tp_effic = "trackingParticlesSignal", label_tp_fake = "trackingParticlesSignal", label_tp_effic_refvector = True, @@ -811,7 +813,9 @@ def _uniqueFirstLayers(layerList): tracksPreValidationPixelTrackingOnly = cms.Task( tracksValidationTruthPixelTrackingOnly, trackingParticlesSignal, - pixelTracksFromPV, + pixelTracksPt09, + pixelTracksFromPV, + pixelTracksFromPVPt09, ) tracksValidationPixelTrackingOnly = cms.Sequence( trackValidatorPixelTrackingOnly + diff --git a/Validation/RecoTrack/python/plotting/html.py b/Validation/RecoTrack/python/plotting/html.py index 1cac97b736941..ac80f1f936448 100644 --- a/Validation/RecoTrack/python/plotting/html.py +++ b/Validation/RecoTrack/python/plotting/html.py @@ -61,6 +61,8 @@ def _allToHP(s): return s.replace("All", "High purity") def _allToBTV(s): return s.replace("All", "BTV-like") +def _allPtCut(s): + return s.replace("All tracks", "Tracks pT > 0.9 GeV") def _ptCut(s): return s.replace("Tracks", "Tracks pT > 0.9 GeV").replace("tracks", "tracks pT > 0.9 GeV") def _allToPixel(s): @@ -77,8 +79,8 @@ def _toPixel(s): ("building_", "Built tracks"), ("", _allName), ("highPurity", _allToHP(_allName)), - ("Pt09", "Tracks pT > 0.9 GeV"), - ("highPurityPt09", "High purity tracks pT > 0.9 GeV"), + ("Pt09", _allPtCut(_allName)), + ("highPurityPt09", _ptCut(_allToHP(_allName))), ("ByOriginalAlgo", _toOriAlgo(_allName)), 
("highPurityByOriginalAlgo", _toOriAlgo(_toHP(_allName))), ("ByAlgoMask", _toAlgoMask(_allName)), @@ -114,6 +116,15 @@ def _toPixel(s): ("bhadron_ByAlgoMask", _toAlgoMask(_bhadronName)), ("bhadron_highPurityByAlgoMask", _toAlgoMask(_allToHP(_bhadronName))), ("bhadron_btvLike", _allToBTV(_bhadronName)), + # Pixel tracks + ("pixel_", _allToPixel(_allName)), + ("pixel_Pt09", _ptCut(_allToPixel(_allName))), + ("pixelFromPV_", _toPixel(_fromPVName)), + ("pixelFromPV_Pt09", _ptCut(_toPixel(_fromPVName))), + ("pixelFromPVAllTP_", _toPixel(_fromPVAllTPName)), + ("pixelFromPVAllTP_Pt09", _ptCut(_toPixel(_fromPVAllTPName))), + ("pixelbhadron_", _allToPixel(_bhadronName)), + ("pixelbhadron_Pt09", _ptCut(_allToPixel(_bhadronName))), ]) _trackAlgoName = { @@ -128,6 +139,7 @@ def _toPixel(s): "iter7" : "Iterative Step 7", "iter9" : "Iterative Step 9", "iter10": "Iterative Step 10", + "pixel": "Pixel tracks", } _trackAlgoOrder = [ @@ -162,6 +174,7 @@ def _toPixel(s): 'iter7', 'iter9', 'iter10', + "pixel", ] _pageNameMap = { @@ -171,17 +184,16 @@ def _toPixel(s): "miniaod": "MiniAOD", "timing": "Timing", "hlt": "HLT", - "pixel": "Pixel tracks", } _sectionNameMapOrder = collections.OrderedDict([ # These are for the summary page ("seeding_seeds", "Seeds"), ("building", "Built tracks"), - ("", "All tracks"), - ("Pt09", "All tracks (pT>0.9 GeV)"), - ("highPurity", "High purity tracks"), - ("highPurityPt09", "High purity tracks (pT>0.9 GeV)"), + ("", _allName), + ("Pt09", _allPtCut(_allName)), + ("highPurity", _allToHP(_allName)), + ("highPurityPt09", _ptCut(_allToHP(_allName))), ("tpPtLess09", _tpPtLess09Name), ("tpPtLess09_highPurity", _allToHP(_tpPtLess09Name)), ("btvLike", "BTV-like"), @@ -197,10 +209,14 @@ def _toPixel(s): ("bhadron", _bhadronName), ("bhadron_highPurity", _allToHP(_bhadronName)), # Pixel tracks - ("pixel", "Pixel tracks"), + ("pixel", _allToPixel(_allName)), + ("pixelPt09", _ptCut(_allToPixel(_allName))), ("pixelFromPV", _toPixel(_fromPVName)), + ("pixelFromPVPt09", _ptCut(_toPixel(_fromPVName))), ("pixelFromPVAllTP", _toPixel(_fromPVAllTPName)), + ("pixelFromPVAllTPPt09", _ptCut(_toPixel(_fromPVAllTPName))), ("pixelbhadron", _allToPixel(_bhadronName)), + ("pixelbhadronPt09", _ptCut(_allToPixel(_bhadronName))), # These are for vertices ("genvertex", "Gen vertices"), ("pixelVertices", "Pixel vertices"), @@ -224,6 +240,7 @@ def _toPixel(s): _fromPVAllTP2Legend = "Tracks from reco PV (another method), fake rate numerator contains all TrackingParticles (separates fake tracks from pileup tracks)" _fromPVAllTPPt2Legend = "Tracks (pT > 0.9 GeV) from reco PV (another method), fake rate numerator contains all TrackingParticles (separates fake tracks from pileup tracks)" _bhadronLegend = "All tracks, efficiency denominator contains only TrackingParticles from B-hadron decays" +_bhadronPtLegend = "Tracks (pT > 0.9 GeV), efficiency denominator contains only TrackingParticles from B-hadron decays" def _sectionNameLegend(): return { @@ -249,9 +266,12 @@ def _sectionNameLegend(): "bhadron_": _bhadronLegend, "bhadron_highPurity": _allToHP(_bhadronLegend), "bhadron_btvLike": _bhadronLegend.replace("All tracks", _btvLegend), - "pixelFromPV": _fromPVLegend, - "pixelFromPVAllTP": _fromPVAllTPLegend, - "pixelbhadron": _bhadronLegend, + "pixelFromPV_": _fromPVLegend, + "pixelFromPV_Pt09": _fromPVPtLegend, + "pixelFromPVAllTP_": _fromPVAllTPLegend, + "pixelFromPVAllTP_Pt09": _fromPVAllTPPtLegend, + "pixelbhadron_": _bhadronLegend, + "pixelbhadron_Pt09": _bhadronPtLegend, } class Table: @@ -687,7 +707,7 @@ def 
__init__(self, sample, title, fastVsFull, pileupComparison): self._miniaodPage = PageSet(*params) self._timingPage = PageSet(*params) self._hltPages = PageSet(*params, dqmSubFolderTranslatedToSectionName=lambda algoQuality: algoQuality[0]) - self._pixelPages = PageSet(*params, dqmSubFolderTranslatedToSectionName=lambda algoQuality: algoQuality[0]) + self._pixelPages = TrackingPageSet(*params) self._otherPages = PageSet(*params) self._purposePageMap = { diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index a14dbe41dfc9b..584d10c47eb74 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -614,6 +614,8 @@ def _mapCollectionToAlgoQuality(collName): prefixes = ["cutsreco", "cutsrecofrompv", "cutsrecofrompv2", "cutsrecofrompvalltp"] if collNameLow in ["general", "generalfrompv"]+prefixes: algo = "ootb" + elif collNameLow in ["pixel", "pixelfrompv", "pixelfrompvalltp"]: + algo = "pixel" else: def testColl(coll): for pfx in prefixes: @@ -938,6 +940,7 @@ class HighPurityPt09: pass class BTVLike: pass class AK4PFJets: pass class Pixel: pass + class PixelPt09: pass def __init__(self, section, collection=GeneralTracks): self._collection = collection @@ -980,6 +983,8 @@ def _getN(hname): return _getAlgoQuality(data, "ak4PFJets", "") elif self._collection == TrackingSummaryTable.Pixel: return _getAlgoQuality(data, "pixel", "") + elif self._collection == TrackingSummaryTable.PixelPt09: + return _getAlgoQuality(data, "pixel", "Pt09") else: raise Exception("Collection not recognized, %s" % str(self._collection)) def _formatOrNone(num, func): @@ -1349,14 +1354,16 @@ def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, only _appendTrackingPlots("TrackBHadron", "bhadron", _simBasedPlots+_recoBasedPlots, onlyForBHadron=True) # Pixel tracks def _appendPixelTrackingPlots(lastDirName, name): - _common = dict(section=name, purpose=PlotPurpose.Pixel, page="pixel") + _common = dict(purpose=PlotPurpose.Pixel, page="pixel") _folders = _trackingFolders(lastDirName) plotter.append(name, _folders, TrackingPlotFolder(*(_simBasedPlots+_recoBasedPlots), **_common)) plotterExt.append(name, _folders, TrackingPlotFolder(*_extendedPlots, **_common)) plotter.append(name+"_summary", _folders, PlotFolder(_summaryRaw, _summaryRawN, loopSubFolders=False, purpose=PlotPurpose.TrackingSummary, page="summary", section=name)) + plotter.append(name+"_summary", _folders, PlotFolder(_summaryRaw, _summaryRawN, loopSubFolders=False, purpose=PlotPurpose.TrackingSummary, page="summary", section=name+"Pt09")) plotter.appendTable(name+"_summary", _folders, TrackingSummaryTable(section=name, collection=TrackingSummaryTable.Pixel)) + plotter.appendTable(name+"_summary", _folders, TrackingSummaryTable(section=name+"Pt09", collection=TrackingSummaryTable.PixelPt09)) _appendPixelTrackingPlots("PixelTrack", "pixel") _appendPixelTrackingPlots("PixelTrackFromPV", "pixelFromPV") _appendPixelTrackingPlots("PixelTrackFromPVAllTP", "pixelFromPVAllTP") From 3e828dd30091cf82dd114b698555a1f2241940a7 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 26 Mar 2019 14:45:42 +0100 Subject: [PATCH 042/102] Synchronise with CMSSW_10_6_0_pre2 --- .../python/PostProcessorTracker_cfi.py | 4 ++ .../RecoTrack/python/TrackValidation_cff.py | 66 +++++++++++++++---- .../python/plotting/trackingPlots.py | 51 ++++++++------ 3 files changed, 88 insertions(+), 33 deletions(-) diff --git 
a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py index 6b5a19f799035..9cd28e6512bf0 100644 --- a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py +++ b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py @@ -35,6 +35,7 @@ "duplicatesRate_vertpos 'Duplicates Rate vs vertpos' num_duplicate_vertpos num_reco_vertpos", "duplicatesRate_zpos 'Duplicates Rate vs zpos' num_duplicate_zpos num_reco_zpos", "duplicatesRate_dr 'Duplicates Rate vs dr' num_duplicate_dr num_reco_dr", + "duplicatesRate_drj 'Duplicates Rate vs dr (track,jet)' num_duplicate_drj num_reco_drj", "duplicatesRate_chi2 'Duplicates Rate vs normalized #chi^{2}' num_duplicate_chi2 num_reco_chi2", "duplicatesRate_seedingLayerSet 'Duplicates rate vs. seedingLayerSet' num_duplicate_seedingLayerSet num_reco_seedingLayerSet", "chargeMisIdRate 'Charge MisID Rate vs #eta' num_chargemisid_eta num_reco_eta", @@ -55,6 +56,7 @@ "effic_vs_vertpos 'Efficiency vs vertpos' num_assoc(simToReco)_vertpos num_simul_vertpos", "effic_vs_zpos 'Efficiency vs zpos' num_assoc(simToReco)_zpos num_simul_zpos", "effic_vs_dr 'Efficiency vs dr' num_assoc(simToReco)_dr num_simul_dr", + "effic_vs_drj 'Efficiency vs dr (track,jet)' num_assoc(simToReco)_drj num_simul_drj", "effic_vertcount_barrel 'efficiency in barrel vs N of pileup vertices' num_assoc(simToReco)_vertcount_barrel num_simul_vertcount_barrel", "effic_vertcount_fwdpos 'efficiency in endcap(+) vs N of pileup vertices' num_assoc(simToReco)_vertcount_fwdpos num_simul_vertcount_fwdpos", "effic_vertcount_fwdneg 'efficiency in endcap(-) vs N of pileup vertices' num_assoc(simToReco)_vertcount_fwdneg num_simul_vertcount_fwdneg", @@ -78,6 +80,7 @@ "pileuprate_vertpos 'Pileup rate vs vertpos' num_pileup_vertpos num_reco_vertpos", "pileuprate_zpos 'Pileup rate vs zpos' num_pileup_zpos num_reco_zpos", "pileuprate_dr 'Pileup rate vs dr' num_pileup_dr num_reco_dr", + "pileuprate_drj 'Pileup rate vs dr (track,jet)' num_pileup_drj num_reco_drj", "pileuprate_chi2 'Pileup rate vs normalized #chi^{2}' num_pileup_chi2 num_reco_chi2", "pileuprate_seedingLayerSet 'Pileup rate vs. seedingLayerSet' num_pileup_seedingLayerSet num_reco_seedingLayerSet", "fakerate 'Fake rate vs #eta' num_assoc(recoToSim)_eta num_reco_eta fake", @@ -97,6 +100,7 @@ "fakerate_vs_vertpos 'Fake rate vs vertpos' num_assoc(recoToSim)_vertpos num_reco_vertpos fake", "fakerate_vs_zpos 'Fake rate vs vertpos' num_assoc(recoToSim)_zpos num_reco_zpos fake", "fakerate_vs_dr 'Fake rate vs dr' num_assoc(recoToSim)_dr num_reco_dr fake", + "fakerate_vs_drj 'Fake rate vs dr (track,jet)' num_assoc(recoToSim)_drj num_reco_drj fake", "fakerate_vs_chi2 'Fake rate vs normalized #chi^{2}' num_assoc(recoToSim)_chi2 num_reco_chi2 fake", "fakerate_vs_seedingLayerSet 'Fake rate vs. 
seedingLayerSet' num_assoc(recoToSim)_seedingLayerSet num_reco_seedingLayerSet fake", "fakerate_vertcount_barrel 'fake rate in barrel vs N of pileup vertices' num_assoc(recoToSim)_vertcount_barrel num_reco_vertcount_barrel fake", diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index fb5b1cdba04b2..52bb93d4ee858 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -1,6 +1,7 @@ +from __future__ import absolute_import import FWCore.ParameterSet.Config as cms -import SimTracker.TrackAssociatorProducers.trackAssociatorByChi2_cfi +import SimTracker.TrackAssociatorProducers.trackAssociatorByChi2_cfi from SimTracker.TrackAssociatorProducers.quickTrackAssociatorByHits_cfi import * from SimTracker.TrackAssociation.trackingParticleRecoTrackAsssociation_cfi import * import Validation.RecoTrack.MultiTrackValidator_cfi @@ -8,7 +9,7 @@ from SimTracker.TrackAssociation.LhcParametersDefinerForTP_cfi import * from SimTracker.TrackAssociation.CosmicParametersDefinerForTP_cfi import * from Validation.RecoTrack.PostProcessorTracker_cfi import * -import cutsRecoTracks_cfi +from . import cutsRecoTracks_cfi from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import * from SimTracker.VertexAssociation.VertexAssociatorByPositionAndTracks_cfi import * @@ -348,6 +349,10 @@ def _getMVASelectors(postfix): ptMin = 0, ) +# Select jets for JetCore tracking +highPtJets = cms.EDFilter("CandPtrSelector", src = cms.InputTag("ak4CaloJets"), cut = cms.string("pt()>1000")) +highPtJetsForTrk = highPtJetsForTrk = highPtJets.clone(src = "ak4CaloJetsForTrk") + # Select B-hadron TPs trackingParticlesBHadron = _trackingParticleBHadronRefSelector.clone() @@ -360,7 +365,7 @@ def _getMVASelectors(postfix): #,maxpT = cms.double(3) #,nintpT = cms.int32(40) ) -fastSim.toModify(trackValidator, +fastSim.toModify(trackValidator, dodEdxPlots = False) for _eraName, _postfix, _era in _relevantEras: @@ -572,6 +577,7 @@ def _uniqueFirstLayers(layerList): fastSim.toModify(tracksValidationTruth, lambda x: x.remove(tpClusterProducer)) tracksPreValidation = cms.Task( + highPtJetsForTrk, tracksValidationSelectors, tracksValidationSelectorsPt09, tracksValidationSelectorsFromPV, @@ -630,31 +636,52 @@ def _uniqueFirstLayers(layerList): names = "_selectorsFromPVPt09Standalone", task = "_tracksValidationSelectorsFromPVPt09Standalone") # MTV instances -trackValidatorStandalone = trackValidator.clone() -trackValidatorTPPtLess09Standalone = trackValidatorTPPtLess09.clone() +trackValidatorStandalone = trackValidator.clone( + cores = "highPtJets" +) +trackValidatorTPPtLess09Standalone = trackValidatorTPPtLess09.clone( + cores = "highPtJets" +) for _eraName, _postfix, _era in _relevantEras: _setForEra(trackValidatorStandalone, _eraName, _era, label = trackValidator.label + locals()["_selectorsByAlgoMask"+_postfix] + locals()["_selectorsPt09Standalone"+_postfix]) _setForEra(trackValidatorTPPtLess09Standalone, _eraName, _era, label = trackValidatorTPPtLess09.label + locals()["_selectorsByAlgoMask"+_postfix] + locals()["_selectorsPt09Standalone"+_postfix]) -trackValidatorFromPVStandalone = trackValidatorFromPV.clone() +trackValidatorFromPVStandalone = trackValidatorFromPV.clone( + cores = "highPtJets" +) for _eraName, _postfix, _era in _relevantEras: _setForEra(trackValidatorFromPVStandalone, _eraName, _era, label = trackValidatorFromPV.label + locals()["_selectorsFromPVStandalone"+_postfix] + 
locals()["_selectorsFromPVPt09Standalone"+_postfix]) # do resolutions as in the standard version trackValidatorFromPVAllTPStandalone = trackValidatorFromPVAllTP.clone( - label = trackValidatorFromPVStandalone.label.value() + label = trackValidatorFromPVStandalone.label.value(), + cores = "highPtJets" + ) trackValidatorAllTPEfficStandalone = trackValidatorAllTPEffic.clone( - label = [ x for x in trackValidator.label.value() if x not in ["cutsRecoTracksBtvLike", "cutsRecoTracksAK4PFJets"] and "Pt09" not in x] + label = [ x for x in trackValidator.label.value() if x not in ["cutsRecoTracksBtvLike", "cutsRecoTracksAK4PFJets"] and "Pt09" not in x], + cores = "highPtJets" + ) -trackValidatorConversionStandalone = trackValidatorConversion.clone( label = [x for x in trackValidatorConversion.label if x != "convStepTracks"]) +trackValidatorConversionStandalone = trackValidatorConversion.clone( + label = [x for x in trackValidatorConversion.label if x != "convStepTracks"], + cores = "highPtJets" +) -trackValidatorBHadronStandalone = trackValidatorBHadron.clone(label = [x for x in trackValidatorStandalone.label if "Pt09" not in x]) +trackValidatorBHadronStandalone = trackValidatorBHadron.clone( + label = [x for x in trackValidatorStandalone.label if "Pt09" not in x], + cores = "highPtJets" +) + +trackValidatorGsfTracksStandalone = trackValidatorGsfTracks.clone( + cores = "highPtJets" +) # sequences tracksPreValidationStandalone = tracksPreValidation.copy() tracksPreValidationStandalone.add(trackingParticlesBHadron) +tracksPreValidationStandalone.replace(highPtJetsForTrk,highPtJets) fastSim.toReplaceWith(tracksPreValidationStandalone, tracksPreValidation) tracksValidationSelectorsStandalone = cms.Task( @@ -673,7 +700,7 @@ def _uniqueFirstLayers(layerList): trackValidatorFromPVAllTPStandalone + trackValidatorAllTPEfficStandalone + trackValidatorConversionStandalone + - trackValidatorGsfTracks + + trackValidatorGsfTracksStandalone + trackValidatorBHadronStandalone ) trackValidatorsStandalone = _trackValidatorsBase.copy() @@ -695,7 +722,10 @@ def _uniqueFirstLayers(layerList): tracksValidationSeedSelectorsTrackingOnly.add(tracksValidationSeedSelectorsPreSplittingTrackingOnly) # MTV instances -trackValidatorTrackingOnly = trackValidatorStandalone.clone(label = [ x for x in trackValidatorStandalone.label if x != "cutsRecoTracksAK4PFJets"] ) +trackValidatorTrackingOnly = trackValidatorStandalone.clone( + label = [ x for x in trackValidatorStandalone.label if x != "cutsRecoTracksAK4PFJets"], + cores = "highPtJetsForTrk" + ) trackValidatorSeedingTrackingOnly = _trackValidatorSeedingBuilding.clone( dirName = "Tracking/TrackSeeding/", @@ -718,18 +748,27 @@ def _uniqueFirstLayers(layerList): trackValidatorBHadronTrackingOnly = trackValidatorBHadron.clone(label = [x for x in trackValidatorTrackingOnly.label if "Pt09" not in x]) +trackValidatorTPPtLess09TrackingOnly = trackValidatorTPPtLess09Standalone.clone(cores = "highPtJetsForTrk") +trackValidatorFromPVTrackingOnly = trackValidatorFromPVStandalone.clone(cores = "highPtJetsForTrk") +trackValidatorFromPVAllTPTrackingOnly = trackValidatorFromPVAllTPStandalone.clone(cores = "highPtJetsForTrk") +trackValidatorAllTPEfficTrackingOnly = trackValidatorAllTPEfficStandalone.clone(cores = "highPtJetsForTrk") # sequences tracksPreValidationTrackingOnly = tracksPreValidationStandalone.copy() tracksPreValidationTrackingOnly.replace(tracksValidationSelectors, tracksValidationSelectorsTrackingOnly) +tracksPreValidationTrackingOnly.replace(highPtJets,highPtJetsForTrk) 
trackValidatorsTrackingOnly = _trackValidatorsBase.copy() trackValidatorsTrackingOnly.replace(trackValidatorStandalone, trackValidatorTrackingOnly) +trackValidatorsTrackingOnly.replace(trackValidatorTPPtLess09Standalone,trackValidatorTPPtLess09TrackingOnly) +trackValidatorsTrackingOnly.replace(trackValidatorFromPVStandalone,trackValidatorFromPVTrackingOnly) +trackValidatorsTrackingOnly.replace(trackValidatorFromPVAllTPStandalone,trackValidatorFromPVAllTPTrackingOnly) +trackValidatorsTrackingOnly.replace(trackValidatorAllTPEfficStandalone,trackValidatorAllTPEfficTrackingOnly) trackValidatorsTrackingOnly += trackValidatorSeedingTrackingOnly trackValidatorsTrackingOnly += trackValidatorSeedingPreSplittingTrackingOnly trackValidatorsTrackingOnly += trackValidatorBuilding trackValidatorsTrackingOnly += trackValidatorBuildingPreSplitting trackValidatorsTrackingOnly.replace(trackValidatorConversionStandalone, trackValidatorConversionTrackingOnly) -trackValidatorsTrackingOnly.remove(trackValidatorGsfTracks) +trackValidatorsTrackingOnly.remove(trackValidatorGsfTracksStandalone) trackValidatorsTrackingOnly.replace(trackValidatorBHadronStandalone, trackValidatorBHadronTrackingOnly) fastSim.toReplaceWith(trackValidatorsTrackingOnly, trackValidatorsTrackingOnly.copyAndExclude([ trackValidatorBuildingPreSplitting, @@ -773,6 +812,7 @@ def _uniqueFirstLayers(layerList): label_vertex = "pixelVertices", vertexAssociator = "PixelVertexAssociatorByPositionAndTracks", dodEdxPlots = False, + cores = cms.InputTag(""), ) trackValidatorFromPVPixelTrackingOnly = trackValidatorPixelTrackingOnly.clone( dirName = "Tracking/PixelTrackFromPV/", diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index 584d10c47eb74..dee8c75ac5172 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import os import copy import collections @@ -7,11 +8,11 @@ ROOT.gROOT.SetBatch(True) ROOT.PyConfig.IgnoreCommandLineOptions = True -from plotting import Subtract, FakeDuplicate, CutEfficiency, Transform, AggregateBins, ROC, Plot, PlotEmpty, PlotGroup, PlotOnSideGroup, PlotFolder, Plotter -from html import PlotPurpose -import plotting -import validation -import html +from .plotting import Subtract, FakeDuplicate, CutEfficiency, Transform, AggregateBins, ROC, Plot, PlotEmpty, PlotGroup, PlotOnSideGroup, PlotFolder, Plotter +from .html import PlotPurpose +from . import plotting +from . import validation +from . 
import html ######################################## # @@ -214,8 +215,9 @@ def _makeMVAPlots(num, hp=False): ) _effandfakeDeltaRPU = PlotGroup("effandfakeDeltaRPU", _makeEffFakeDupPlots("dr" , "#DeltaR", effopts=dict(xtitle="TP min #DeltaR"), fakeopts=dict(xtitle="track min #DeltaR"), common=dict(xlog=True)) + + _makeEffFakeDupPlots("drj" , "#DeltaR(track, jet)", effopts=dict(xtitle="#DeltaR(TP, jet)", ytitle="efficiency vs #DeltaR(TP, jet"), fakeopts=dict(xtitle="#DeltaR(track, jet)"), common=dict(xlog=True))+ _makeEffFakeDupPlots("pu" , "PU" , common=dict(xtitle="Pileup", xmin=_minPU, xmax=_maxPU)), - legendDy=_legendDy_2rows + legendDy=_legendDy_4rows ) @@ -259,8 +261,9 @@ def _makeMVAPlots(num, hp=False): ) _dupandfakeDeltaRPU = PlotGroup("dupandfakeDeltaRPU", _makeFakeDupPileupPlots("dr" , "#DeltaR", xquantity="min #DeltaR", common=dict(xlog=True)) + + _makeFakeDupPileupPlots("drj" , "#DeltaR(track, jet)", xtitle="#DeltaR(track, jet)", common=dict(xlog=True)) + _makeFakeDupPileupPlots("pu" , "PU" , xtitle="Pileup", common=dict(xmin=_minPU, xmax=_maxPU)), - ncols=3, legendDy=_legendDy_2rows_3cols + ncols=3 ) _seedingLayerSet_common = dict(removeEmptyBins=True, xbinlabelsize=8, xbinlabeloption="d", adjustMarginRight=0.1) _dupandfakeSeedingPlots = _makeFakeDupPileupPlots("seedingLayerSet", "seeding layers", xtitle="", common=_seedingLayerSet_common) @@ -420,12 +423,16 @@ def _makeMVAPlots(num, hp=False): _makeDistPlots("3Dlayer" , "3D layers" , common=dict(xmin=_min3DLayers, xmax=_max3DLayers)), ncols=4, legendDy=_legendDy_4rows, ) -_extDistPosDeltaR = PlotGroup("distPosDeltaR", +_extDistPos = PlotGroup("distPos", _makeDistPlots("vertpos", "ref. point r (cm)", common=dict(xlog=True)) + _makeDistPlots("zpos" , "ref. point z (cm)") + - _makeDistPlots("simpvz" , "Sim. PV z (cm)", common=dict(xmin=_minZ, xmax=_maxZ)) + - _makeDistPlots("dr" , "min #DeltaR", common=dict(xlog=True)), - ncols=4, legendDy=_legendDy_4rows, + _makeDistPlots("simpvz" , "Sim. PV z (cm)", common=dict(xmin=_minZ, xmax=_maxZ)), + ncols=3, +) +_extDistDeltaR = PlotGroup("distDeltaR", + _makeDistPlots("dr" , "min #DeltaR", common=dict(xlog=True)) + + _makeDistPlots("drj" , "#DeltaR(track, jet)", common=dict(xlog=True)), + ncols=2, legendDy=_legendDy_2rows, ) _extDistSeedingPlots = _makeDistPlots("seedingLayerSet", "seeding layers", common=dict(xtitle="", **_seedingLayerSet_common)) _extDistChi2Seeding = PlotGroup("distChi2Seeding", @@ -487,12 +494,16 @@ def _makeMVAPlots(num, hp=False): _makeDistSimPlots("3Dlayer" , "3D layers" , common=dict(xmin=_min3DLayers, xmax=_max3DLayers)), ncols=2, legendDy=_legendDy_4rows, ) -_extDistSimPosDeltaR = PlotGroup("distsimPosDeltaR", +_extDistSimPos = PlotGroup("distsimPos", _makeDistSimPlots("vertpos", "vert r (cm)", common=dict(xlog=True)) + _makeDistSimPlots("zpos" , "vert z (cm)") + - _makeDistSimPlots("simpvz" , "Sim. PV z (cm)", common=dict(xmin=_minZ, xmax=_maxZ)) + - _makeDistSimPlots("dr" , "min #DeltaR", common=dict(xlog=True)), - ncols=2, legendDy=_legendDy_4rows, + _makeDistSimPlots("simpvz" , "Sim. 
PV z (cm)", common=dict(xmin=_minZ, xmax=_maxZ)), + ncols=3, +) +_extDistSimDeltaR = PlotGroup("distsimDeltaR", + _makeDistSimPlots("dr" , "min #DeltaR", common=dict(xlog=True)) + + _makeDistSimPlots("drj" , "#DeltaR(TP, jet)", common=dict(xlog=True)), + ncols=2, legendDy=_legendDy_2rows, ) ######################################## @@ -1241,7 +1252,8 @@ def _trackingFolders(lastDirName="Track"): _extDistDxyDzBS, _extDistDxyDzPV, _extDistHitsLayers, - _extDistPosDeltaR, + _extDistPos, + _extDistDeltaR, _extDistChi2Seeding, _extDistSeedingTable, _extResidualEta, @@ -1252,7 +1264,8 @@ def _trackingFolders(lastDirName="Track"): _extDistSimDxyDzBS, _extDistSimDxyDzPV, _extDistSimHitsLayers, - _extDistSimPosDeltaR, + _extDistSimPos, + _extDistSimDeltaR, ] _summaryPlots = [ _summary, @@ -1680,7 +1693,7 @@ def create(self, tdirectory): return None iterMap = copy.copy(_collLabelMapHp) - del iterMap["generalTracks"] + del iterMap["generalTracks"] del iterMap["jetCoreRegionalStep"] # this is expensive per track on purpose if self._selectedTracks: renameBin = lambda bl: _summaryBinRename(bl, highPurity=True, byOriginalAlgo=False, byAlgoMask=True, ptCut=False, seeds=False) @@ -1917,5 +1930,3 @@ def headers(self): ], PlotFolder( _tplifetime, )) - - From a74051b05de52dc4525c1b1d930a70b22de50d98 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 8 May 2019 18:08:34 +0200 Subject: [PATCH 043/102] Implementation of the broken line fit (cms-patatrack#340) Create modifiers for enabling the broken line fit on the cpu and on the gpu. Use dinamically-sized-matrices: the advantage over statically-sized ones is that the code would also work with n>4); the switch can be easily done at the start of the file. Update Eigen tests with the features used by the broken line fit. --- .../PixelTrackFitting/interface/BrokenLine.h | 614 ++++++++++++++++++ .../PixelTrackFitting/interface/RiemannFit.h | 1 - .../python/PixelTracks_cff.py | 10 + .../PixelTrackFitting/test/BuildFile.xml | 26 +- .../test/testEigenGPUNoFit.cu | 112 ++-- 5 files changed, 708 insertions(+), 55 deletions(-) create mode 100644 RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h new file mode 100644 index 0000000000000..382ff5dec3fec --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h @@ -0,0 +1,614 @@ +#ifndef RECOPIXELVERTEXING_PIXELTRACKFITTING_BROKENLINE_H +#define RECOPIXELVERTEXING_PIXELTRACKFITTING_BROKENLINE_H + +#include +#include +#include +#include + +namespace BrokenLine { + + using namespace Eigen; + + constexpr unsigned int max_nop = 4; //!< In order to avoid use of dynamic memory + + // WARNING: USE STATIC DIMENSIONS ON GPUs. 
To do so, comment these definitions and uncomment the others following + + using MatrixNd = Eigen::Matrix; + using MatrixNplusONEd = Eigen::Matrix; + using Matrix3Nd = Eigen::Matrix; + using Matrix2xNd = Eigen::Matrix; + using Matrix3xNd = Eigen::Matrix; + using VectorNd = Eigen::Matrix; + using VectorNplusONEd = Eigen::Matrix; + using Matrix2x3d = Eigen::Matrix; + using Matrix5d = Eigen::Matrix; + using Vector5d = Eigen::Matrix; + using u_int = unsigned int; + + /* + using MatrixNd = Eigen::Matrix; + using MatrixNplusONEd = Eigen::Matrix; + using Matrix3Nd = Eigen::Matrix; + using Matrix2xNd = Eigen::Matrix; + using Matrix3xNd = Eigen::Matrix; + using VectorNd = Eigen::Matrix; + using VectorNplusONEd = Eigen::Matrix; + using Matrix2x3d = Eigen::Matrix; + using Matrix5d = Eigen::Matrix; + using Vector5d = Eigen::Matrix; + using u_int = unsigned int; + */ + + struct karimaki_circle_fit { + Vector3d par; //!< Karimäki's parameters: (phi, d, k=1/R) + Matrix3d cov; + /*!< covariance matrix: \n + |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n + |cov(phi, d )|cov( d , d )|cov( k , d )| \n + |cov(phi, k )|cov( d , k )|cov( k , k )| + */ + int q; //!< particle charge + double chi2; + }; + + struct line_fit { + Vector2d par; //!< parameters: (cotan(theta),Zip) + Matrix2d cov; + /*!< covariance matrix: \n + |cov(c_t,c_t)|cov(Zip,c_t)| \n + |cov(c_t,Zip)|cov(Zip,Zip)| + */ + double chi2; + }; + + struct helix_fit { + Vector5d par; //!< parameters: (phi,Tip,p_t,cotan(theta)),Zip) + Matrix5d cov; + /*!< covariance matrix: \n + |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n + |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n + |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n + |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n + |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| + */ + double chi2_circle; + double chi2_line; + Vector4d fast_fit; + int q; //!< particle charge + } __attribute__ ((aligned(16)) ); + + /*! + \brief data needed for the Broken Line fit procedure. + */ + struct PreparedBrokenLineData { + int q; //!< particle charge + Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + VectorNd s; //!< total distance traveled in the transverse plane starting from the pre-fitted closest approach + VectorNd S; //!< total distance traveled (three-dimensional) + VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane + VectorNd VarBeta; //!< kink angles in the SZ plane + }; + + /*! + \brief raise to square. + */ + __host__ __device__ inline double sqr(const double a) { + return a*a; + } + + /*! + \brief Computes the Coulomb multiple scattering variance of the planar angle. + + \param length length of the track in the material. + \param B magnetic field in Gev/cm/c. + \param R radius of curvature (needed to evaluate p). + \param Layer denotes which of the four layers of the detector is the endpoint of the multiple scattered track. For example, if Layer=3, then the particle has just gone through the material between the second and the third layer. + + \todo add another Layer variable to identify also the start point of the track, so if there are missing hits or multiple hits, the part of the detector that the particle has traversed can be exactly identified. + + \warning the formula used here assumes beta=1, and so neglects the dependence of theta_0 on the mass of the particle at fixed momentum. + + \return the variance of the planar angle ((theta_0)^2 /3). 
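+
+   \par Example (sketch only; the numerical values below are illustrative assumptions, not taken from this patch)
+   \code
+   double var = MultScatt(3.0,     // length traversed in the material [cm]
+                          0.0114,  // B in GeV/cm/c (roughly a 3.8 T field, assumed here for illustration)
+                          88.,     // radius of curvature [cm]
+                          2,       // endpoint layer of the scattered segment
+                          0.5);    // slope
+   // 'var' is the planar-angle variance (theta_0)^2/3, used by the fit as a kink-angle variance
+   \endcode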
+ */ + __host__ __device__ inline double MultScatt(const double& length, const double B, const double& R, int Layer, double slope) { + double XX_0; //!< radiation length of the material in cm + if(Layer==1) XX_0=16/0.06; + else XX_0=16/0.06; + XX_0*=1; + double geometry_factor=0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned + return geometry_factor*sqr((13.6/1000)/(1*B*R*sqrt(1+sqr(slope))))*(abs(length)/XX_0)*sqr(1+0.038*log(abs(length)/XX_0)); + } + + /*! + \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0. + + \param slope tangent of the angle of rotation. + + \return 2D rotation matrix. + */ + __host__ __device__ inline Matrix2d RotationMatrix(const double& slope) { + Matrix2d Rot; + Rot(0,0)=1/sqrt(1+sqr(slope)); + Rot(0,1)=slope*Rot(0,0); + Rot(1,0)=-Rot(0,1); + Rot(1,1)=Rot(0,0); + return Rot; + } + + /*! + \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a translation of the coordinate system, such that the old origin has coordinates (x0,y0) in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. + + \param circle circle fit in the old coordinate system. + \param x0 x coordinate of the translation vector. + \param y0 y coordinate of the translation vector. + \param Jacob passed by reference in order to save stack. + */ + __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, const double& x0, const double& y0, Matrix3d& Jacob) { + double A,U,BB,C,DO,DP,uu,xi,v,mu,lambda,zeta; + DP=x0*cos(circle.par(0))+y0*sin(circle.par(0)); + DO=x0*sin(circle.par(0))-y0*cos(circle.par(0))+circle.par(1); + uu=1+circle.par(2)*circle.par(1); + C=-circle.par(2)*y0+uu*cos(circle.par(0)); + BB=circle.par(2)*x0+uu*sin(circle.par(0)); + A=2*DO+circle.par(2)*(sqr(DO)+sqr(DP)); + U=sqrt(1+circle.par(2)*A); + xi=1/(sqr(BB)+sqr(C)); + v=1+circle.par(2)*DO; + lambda=(A/2)/(U*sqr(1+U)); + mu=1/(U*(1+U))+circle.par(2)*lambda; + zeta=sqr(DO)+sqr(DP); + + Jacob << xi*uu*v, -xi*sqr(circle.par(2))*DP, xi*DP, + 2*mu*uu*DP, 2*mu*v, mu*zeta-lambda*A, + 0, 0, 1; + + circle.par(0)=atan2(BB,C); + circle.par(1)=A/(1+U); + // circle.par(2)=circle.par(2); + + circle.cov=Jacob*circle.cov*Jacob.transpose(); + } + + /*! + \brief Compute cross product of two 2D vector (assuming z component 0), returning the z component of the result. + + \param a first 2D vector in the product. + \param b second 2D vector in the product. + + \return z component of the cross product. + */ + + __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) { + return a.x()*b.y()-a.y()*b.x(); + } + + /*! + \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param B magnetic field in Gev/cm/c. + \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). 
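+
+   \par Typical use (sketch, mirroring what Helix_fit() below does; 'hits', 'hits_cov', 'fast_fit' and 'B' are assumed to be provided by the caller)
+   \code
+   PreparedBrokenLineData data;
+   PrepareBrokenLineData(hits, hits_cov, fast_fit, B, data);  // filled once ...
+   // ... then reused by both BL_Circle_fit() and BL_Line_fit()
+   \endcode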
+ */ + __host__ __device__ inline void PrepareBrokenLineData(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const Vector4d& fast_fit, + const double B, + PreparedBrokenLineData & results) { + u_int n=hits.cols(); + u_int i; + Vector2d d; + Vector2d e; + results.radii=Matrix2xNd::Zero(2,n); + results.s=VectorNd::Zero(n); + results.S=VectorNd::Zero(n); + results.Z=VectorNd::Zero(n); + results.VarBeta=VectorNd::Zero(n); + + results.q=1; + d=hits.block(0,1,2,1)-hits.block(0,0,2,1); + e=hits.block(0,n-1,2,1)-hits.block(0,n-2,2,1); + if(cross2D(d,e)>0) results.q=-1; + + const double slope=-results.q/fast_fit(3); + + Matrix2d R=RotationMatrix(slope); + + // calculate radii and s + results.radii=hits.block(0,0,2,n)-fast_fit.head(2)*MatrixXd::Constant(1,n,1); + e=-fast_fit(2)*fast_fit.head(2)/fast_fit.head(2).norm(); + for(i=0;i1) C_U(i,i)+=1/(VarBeta(i-1)*sqr(S(i)-S(i-1))); + if(i>0 && i0 && i0 && i1) { + C_U(i,n)+=(s(i)-s(i-2))/(2*VarBeta(i-1)*(s(i)-s(i-1))); + C_U(n,i)=C_U(i,n); + } + if(i0 && i0 && i=0); + } + + /*! + \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u). + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param B magnetic field in Gev/cm/c. + \param data PreparedBrokenLineData. + \param line_results struct to be filled with the results in this form: + -par parameter of the line in this form: (cot(theta), Zip); \n + -cov covariance matrix of the fitted parameter; \n + -chi2 value of the cost function in the minimum. + + \details The function implements the steps 2 and 3 of the Broken Line fit without the curvature correction.\n + The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and their covariance matrix. + The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. + */ + + __host__ __device__ inline void BL_Line_fit(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const Vector4d& fast_fit, + const double B, + const PreparedBrokenLineData& data, + line_fit & line_results) { + u_int n=hits.cols(); + u_int i; + + const Matrix2xNd& radii=data.radii; + const VectorNd& S=data.S; + const VectorNd& Z=data.Z; + const VectorNd& VarBeta=data.VarBeta; + + const double slope=-data.q/fast_fit(3); + Matrix2d R=RotationMatrix(slope); + + Matrix3d V=Matrix3d::Zero(); // covariance matrix XYZ + Matrix2x3d JacobXYZtosZ=Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + VectorNd w=VectorNd::Zero(n); + for(i=0;i0 && i=0); + } + + /*! + \brief Helix fit by three step: + -fast pre-fit (see Fast_fit() for further info); \n + -circle fit of the hits projected in the transverse plane by Broken Line algorithm (see BL_Circle_fit() for further info); \n + -line fit of the hits projected on the (pre-fitted) cilinder surface by Broken Line algorithm (see BL_Line_fit() for further info); \n + Points must be passed ordered (from inner to outer layer). 
+ + \param hits Matrix3xNd hits coordinates in this form: \n + |x1|x2|x3|...|xn| \n + |y1|y2|y3|...|yn| \n + |z1|z2|z3|...|zn| + \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n + |(x1,x1)|(x2,x1)|(x3,x1)|(x4,x1)|.|(y1,x1)|(y2,x1)|(y3,x1)|(y4,x1)|.|(z1,x1)|(z2,x1)|(z3,x1)|(z4,x1)| \n + |(x1,x2)|(x2,x2)|(x3,x2)|(x4,x2)|.|(y1,x2)|(y2,x2)|(y3,x2)|(y4,x2)|.|(z1,x2)|(z2,x2)|(z3,x2)|(z4,x2)| \n + |(x1,x3)|(x2,x3)|(x3,x3)|(x4,x3)|.|(y1,x3)|(y2,x3)|(y3,x3)|(y4,x3)|.|(z1,x3)|(z2,x3)|(z3,x3)|(z4,x3)| \n + |(x1,x4)|(x2,x4)|(x3,x4)|(x4,x4)|.|(y1,x4)|(y2,x4)|(y3,x4)|(y4,x4)|.|(z1,x4)|(z2,x4)|(z3,x4)|(z4,x4)| \n + . . . . . . . . . . . . . . . \n + |(x1,y1)|(x2,y1)|(x3,y1)|(x4,y1)|.|(y1,y1)|(y2,y1)|(y3,x1)|(y4,y1)|.|(z1,y1)|(z2,y1)|(z3,y1)|(z4,y1)| \n + |(x1,y2)|(x2,y2)|(x3,y2)|(x4,y2)|.|(y1,y2)|(y2,y2)|(y3,x2)|(y4,y2)|.|(z1,y2)|(z2,y2)|(z3,y2)|(z4,y2)| \n + |(x1,y3)|(x2,y3)|(x3,y3)|(x4,y3)|.|(y1,y3)|(y2,y3)|(y3,x3)|(y4,y3)|.|(z1,y3)|(z2,y3)|(z3,y3)|(z4,y3)| \n + |(x1,y4)|(x2,y4)|(x3,y4)|(x4,y4)|.|(y1,y4)|(y2,y4)|(y3,x4)|(y4,y4)|.|(z1,y4)|(z2,y4)|(z3,y4)|(z4,y4)| \n + . . . . . . . . . . . . . . . \n + |(x1,z1)|(x2,z1)|(x3,z1)|(x4,z1)|.|(y1,z1)|(y2,z1)|(y3,z1)|(y4,z1)|.|(z1,z1)|(z2,z1)|(z3,z1)|(z4,z1)| \n + |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n + |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n + |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)| + \param B magnetic field in the center of the detector in Gev/cm/c, in order to perform the p_t calculation. + + \warning see BL_Circle_fit(), BL_Line_fit() and Fast_fit() warnings. + + \bug see BL_Circle_fit(), BL_Line_fit() and Fast_fit() bugs. + + \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. 
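+
+   \par Usage sketch (schematic; the inputs must be filled by the caller and the field value is only an assumed example)
+   \code
+   Matrix3xNd hits;          // ordered hit coordinates, inner to outer layer
+   Matrix3Nd  hits_cov;      // corresponding covariance matrix
+   const double B = 0.0114;  // field in GeV/cm/c (illustrative value, not taken from this patch)
+   helix_fit fit = Helix_fit(hits, hits_cov, B);
+   // fit.par = (phi, Tip, p_t, cot(theta), Zip); fit.cov; fit.chi2_circle; fit.chi2_line; fit.q
+   \endcode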
+ */ + + __host__ __device__ inline helix_fit Helix_fit(const Matrix3xNd& hits, + const Matrix3Nd& hits_cov, + const double B) { + helix_fit helix; + + helix.fast_fit=BL_Fast_fit(hits); + + PreparedBrokenLineData data; + karimaki_circle_fit circle; + line_fit line; + Matrix3d Jacob; + MatrixNplusONEd C_U; + + PrepareBrokenLineData(hits,hits_cov,helix.fast_fit,B,data); + BL_Line_fit(hits,hits_cov,helix.fast_fit,B,data,line); + BL_Circle_fit(hits,hits_cov,helix.fast_fit,B,data,circle,Jacob,C_U); + + // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix + Jacob << 1,0,0, + 0,1,0, + 0,0,-abs(circle.par(2))*B/(sqr(circle.par(2))*circle.par(2)); + circle.par(2)=B/abs(circle.par(2)); + circle.cov=Jacob*circle.cov*Jacob.transpose(); + + helix.par << circle.par, line.par; + helix.cov=MatrixXd::Zero(5, 5); + helix.cov.block(0,0,3,3)=circle.cov; + helix.cov.block(3,3,2,2)=line.cov; + helix.q=circle.q; + helix.chi2_circle=circle.chi2; + helix.chi2_line=line.chi2; + + return helix; + } + +} // namespace BrokenLine + +#endif diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 35cf3d3256b6b..9171df9cb9bfc 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -3,7 +3,6 @@ #include "FitResult.h" - namespace Rfit { diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index e868ff1921965..51bf679c91cf4 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -13,6 +13,7 @@ from RecoTracker.TkSeedingLayers.TTRHBuilderWithoutAngle4PixelTriplets_cfi import * from RecoPixelVertexing.PixelTrackFitting.pixelFitterByHelixProjections_cfi import pixelFitterByHelixProjections from RecoPixelVertexing.PixelTrackFitting.pixelFitterByRiemannParaboloid_cfi import pixelFitterByRiemannParaboloid +from RecoPixelVertexing.PixelTrackFitting.pixelFitterByBrokenLine_cfi import pixelFitterByBrokenLine from RecoPixelVertexing.PixelTrackFitting.pixelTrackFilterByKinematics_cfi import pixelTrackFilterByKinematics from RecoPixelVertexing.PixelTrackFitting.pixelTrackCleanerBySharedHits_cfi import pixelTrackCleanerBySharedHits from RecoPixelVertexing.PixelTrackFitting.pixelTracks_cfi import pixelTracks as _pixelTracks @@ -96,4 +97,13 @@ _pixelTracksTask_riemannFit.replace(pixelFitterByHelixProjections, pixelFitterByRiemannParaboloid) riemannFit.toReplaceWith(pixelTracksTask, _pixelTracksTask_riemannFit) +# Use BrokenLine fit and substitute previous Fitter producer with the BrokenLine one +from Configuration.ProcessModifiers.brokenLine_cff import brokenLine +from Configuration.ProcessModifiers.brokenLineGPU_cff import brokenLineGPU +brokenLine.toModify(pixelTracks, Fitter = "pixelFitterByBrokenLine") +brokenLineGPU.toModify(pixelTracks, runOnGPU = True) +_pixelTracksTask_brokenLine = pixelTracksTask.copy() +_pixelTracksTask_brokenLine.replace(pixelFitterByHelixProjections, pixelFitterByBrokenLine) +brokenLine.toReplaceWith(pixelTracksTask, _pixelTracksTask_brokenLine) + pixelTracksSequence = cms.Sequence(pixelTracksTask) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index e6c67be031342..c2a87db9444f7 100644 --- 
a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -1,17 +1,17 @@ - - - - - - - - - - - - - + + + + + + + + + + + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index 17413f4ef3e2a..ebaea2037eb2a 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -3,17 +3,19 @@ #include #include -#include "test_common.h" - +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "test_common.h" using namespace Eigen; -__host__ __device__ void eigenValues(Matrix3d * m, Eigen::SelfAdjointEigenSolver::RealVectorType * ret) { +using Matrix5d = Matrix; + +__host__ __device__ void eigenValues(Matrix3d *m, Eigen::SelfAdjointEigenSolver::RealVectorType *ret) { #if TEST_DEBUG - printf("Matrix(0,0): %f\n", (*m)(0,0)); - printf("Matrix(1,1): %f\n", (*m)(1,1)); - printf("Matrix(2,2): %f\n", (*m)(2,2)); + printf("Matrix(0,0): %f\n", (*m)(0, 0)); + printf("Matrix(1,1): %f\n", (*m)(1, 1)); + printf("Matrix(2,2): %f\n", (*m)(2, 2)); #endif SelfAdjointEigenSolver es; es.computeDirect(*m); @@ -21,31 +23,26 @@ __host__ __device__ void eigenValues(Matrix3d * m, Eigen::SelfAdjointEigenSolver return; } -__global__ void kernel(Matrix3d * m, Eigen::SelfAdjointEigenSolver::RealVectorType * ret) { +__global__ void kernel(Matrix3d *m, Eigen::SelfAdjointEigenSolver::RealVectorType *ret) { eigenValues(m, ret); } -__global__ void kernelInverse3x3(Matrix3d * in, Matrix3d * out) { - (*out) = in->inverse(); -} +__global__ void kernelInverse3x3(Matrix3d *in, Matrix3d *out) { (*out) = in->inverse(); } -__global__ void kernelInverse4x4(Matrix4d * in, Matrix4d * out) { - (*out) = in->inverse(); -} +__global__ void kernelInverse4x4(Matrix4d *in, Matrix4d *out) { (*out) = in->inverse(); } +__global__ void kernelInverse5x5(Matrix5d *in, Matrix5d *out) { (*out) = in->inverse(); } -template -__global__ void kernelMultiply(M1 * J, - M2 * C, - M3 * result) { +template +__global__ void kernelMultiply(M1 *J, M2 *C, M3 *result) { // Map res(result->data()); #if TEST_DEBUG printf("*** GPU IN ***\n"); #endif printIt(J); printIt(C); -// res.noalias() = (*J) * (*C); -// printIt(&res); + // res.noalias() = (*J) * (*C); + // printIt(&res); (*result) = (*J) * (*C); #if TEST_DEBUG printf("*** GPU OUT ***\n"); @@ -53,19 +50,20 @@ __global__ void kernelMultiply(M1 * J, return; } -template +template void testMultiply() { std::cout << "TEST MULTIPLY" << std::endl; - std::cout << "Product of type " << row1 << "x" << col1 - << " * " << row2 << "x" << col2 << std::endl; + std::cout << "Product of type " << row1 << "x" << col1 << " * " << row2 << "x" << col2 << std::endl; Eigen::Matrix J; fillMatrix(J); Eigen::Matrix C; fillMatrix(C); Eigen::Matrix multiply_result = J * C; #if TEST_DEBUG - std::cout << "Input J:" << std::endl; printIt(&J); - std::cout << "Input C:" << std::endl; printIt(&C); + std::cout << "Input J:" << std::endl; + printIt(&J); + std::cout << "Input C:" << std::endl; + printIt(&C); std::cout << "Output:" << std::endl; printIt(&multiply_result); #endif @@ -82,11 +80,11 @@ void testMultiply() { cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); - kernelMultiply<<<1,1>>>(JGPU, CGPU, 
multiply_resultGPU); + kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU); cudaDeviceSynchronize(); - cudaMemcpy(multiply_resultGPUret, multiply_resultGPU, - sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost); + cudaMemcpy( + multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost); printIt(multiply_resultGPUret); assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret))); } @@ -110,7 +108,7 @@ void testInverse3x3() { cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)); cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); - kernelInverse3x3<<<1,1>>>(mGPU, mGPUret); + kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost); @@ -139,7 +137,7 @@ void testInverse4x4() { cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)); cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice); - kernelInverse4x4<<<1,1>>>(mGPU, mGPUret); + kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost); @@ -149,16 +147,47 @@ void testInverse4x4() { assert(isEqualFuzzy(m_inv, *mCPUret)); } +void testInverse5x5() { + std::cout << "TEST INVERSE 5x5" << std::endl; + Matrix5d m; + fillMatrix(m); + m += m.transpose().eval(); + + Matrix5d m_inv = m.inverse(); + Matrix5d *mGPU = nullptr; + Matrix5d *mGPUret = nullptr; + Matrix5d *mCPUret = new Matrix5d(); + +#if TEST_DEBUG + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; +#endif + cudaMalloc((void **)&mGPU, sizeof(Matrix5d)); + cudaMalloc((void **)&mGPUret, sizeof(Matrix5d)); + cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice); + + kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret); + cudaDeviceSynchronize(); + + cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost); +#if TEST_DEBUG + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#endif + assert(isEqualFuzzy(m_inv, *mCPUret)); +} + void testEigenvalues() { std::cout << "TEST EIGENVALUES" << std::endl; Matrix3d m; fillMatrix(m); m += m.transpose().eval(); - Matrix3d * m_gpu = nullptr; - Matrix3d * mgpudebug = new Matrix3d(); - Eigen::SelfAdjointEigenSolver::RealVectorType *ret = new Eigen::SelfAdjointEigenSolver::RealVectorType; - Eigen::SelfAdjointEigenSolver::RealVectorType *ret1 = new Eigen::SelfAdjointEigenSolver::RealVectorType; + Matrix3d *m_gpu = nullptr; + Matrix3d *mgpudebug = new Matrix3d(); + Eigen::SelfAdjointEigenSolver::RealVectorType *ret = + new Eigen::SelfAdjointEigenSolver::RealVectorType; + Eigen::SelfAdjointEigenSolver::RealVectorType *ret1 = + new Eigen::SelfAdjointEigenSolver::RealVectorType; Eigen::SelfAdjointEigenSolver::RealVectorType *ret_gpu = nullptr; eigenValues(&m, ret); #if TEST_DEBUG @@ -170,26 +199,27 @@ void testEigenvalues() { cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType)); cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); - kernel<<<1,1>>>(m_gpu, ret_gpu); + kernel<<<1, 1>>>(m_gpu, ret_gpu); cudaDeviceSynchronize(); cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost); cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost); #if TEST_DEBUG -std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; -std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; -std::cout << 
"*************************\n\n" << std::endl; + std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; + std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; + std::cout << "*************************\n\n" << std::endl; #endif assert(isEqualFuzzy(*ret, *ret1)); } - -int main (int argc, char * argv[]) { +int main(int argc, char *argv[]) { exitSansCUDADevices(); testEigenvalues(); testInverse3x3(); testInverse4x4(); + testInverse5x5(); + testMultiply<1, 2, 2, 1>(); testMultiply<1, 2, 2, 2>(); testMultiply<1, 2, 2, 3>(); @@ -205,12 +235,12 @@ int main (int argc, char * argv[]) { testMultiply<2, 3, 3, 4>(); testMultiply<2, 3, 3, 5>(); testMultiply<3, 2, 2, 3>(); - testMultiply<2, 3, 3, 3>(); // DOES NOT COMPILE W/O PATCHING EIGEN + testMultiply<2, 3, 3, 3>(); // DOES NOT COMPILE W/O PATCHING EIGEN testMultiply<3, 3, 3, 3>(); testMultiply<8, 8, 8, 8>(); testMultiply<3, 4, 4, 3>(); testMultiply<2, 4, 4, 2>(); - testMultiply<3, 4, 4, 2>(); // DOES NOT COMPILE W/O PATCHING EIGEN + testMultiply<3, 4, 4, 2>(); // DOES NOT COMPILE W/O PATCHING EIGEN return 0; } From 874102ce11408a9ec56cae85c798887ff89e1d81 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 29 Jan 2019 15:52:25 +0100 Subject: [PATCH 044/102] Rework the Riemann fit and broken line fit (cms-patatrack#338) Merge the Riemann and broken line fits into single configurable pixel n-tuplet fitter, and extend it to work with up to 5 hits. Mmake the broken line fit the default algorithm. Try both triplets and quadruplets in the pixel "hole". Limit pT used to compute the multple scattering. Use the inline Cholesky decomposition. Generic clean up and improvements. --- .../PixelTrackFitting/interface/BrokenLine.h | 1144 ++++++++--------- .../PixelTrackFitting/interface/FitResult.h | 93 +- .../PixelTrackFitting/interface/FitUtils.h | 203 +++ .../interface/PixelNtupletsFitter.h | 27 + .../PixelTrackFitting/interface/RiemannFit.h | 172 +-- .../PixelTrackFitting/plugins/BuildFile.xml | 2 +- .../plugins/PixelNtupletsFitterProducer.cc | 51 + .../plugins/PixelTrackProducer.cc | 9 +- .../plugins/PixelTrackProducer.h | 4 - .../python/PixelTracks_cff.py | 26 +- .../python/pixelNtupletsFitter_cfi.py | 6 + .../src/PixelNtupletsFitter.cc | 117 ++ .../PixelTrackFitting/test/BuildFile.xml | 23 +- .../test/PixelTrackRiemannFit.cc | 21 +- .../PixelTrackFitting/test/testEigenGPU.cu | 262 +++- .../PixelTrackFitting/test/testRiemannFit.cpp | 125 +- .../plugins/BrokenLineFitOnGPU.cu | 236 ++++ .../PixelTriplets/plugins/CAConstants.h | 1 + .../PixelTriplets/plugins/GPUCACell.h | 42 +- .../{RiemannFitOnGPU.cc => HelixFitOnGPU.cc} | 9 +- .../{RiemannFitOnGPU.h => HelixFitOnGPU.h} | 34 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 194 ++- .../PixelTriplets/plugins/gpuFishbone.h | 2 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 11 +- 24 files changed, 1784 insertions(+), 1030 deletions(-) create mode 100644 RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py create mode 100644 RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu rename RecoPixelVertexing/PixelTriplets/plugins/{RiemannFitOnGPU.cc => HelixFitOnGPU.cc} (75%) rename 
RecoPixelVertexing/PixelTriplets/plugins/{RiemannFitOnGPU.h => HelixFitOnGPU.h} (55%) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h index 382ff5dec3fec..32894c8aa432a 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h @@ -1,614 +1,548 @@ #ifndef RECOPIXELVERTEXING_PIXELTRACKFITTING_BROKENLINE_H #define RECOPIXELVERTEXING_PIXELTRACKFITTING_BROKENLINE_H -#include -#include +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" + #include -#include namespace BrokenLine { - - using namespace Eigen; - - constexpr unsigned int max_nop = 4; //!< In order to avoid use of dynamic memory - - // WARNING: USE STATIC DIMENSIONS ON GPUs. To do so, comment these definitions and uncomment the others following - - using MatrixNd = Eigen::Matrix; - using MatrixNplusONEd = Eigen::Matrix; - using Matrix3Nd = Eigen::Matrix; - using Matrix2xNd = Eigen::Matrix; - using Matrix3xNd = Eigen::Matrix; - using VectorNd = Eigen::Matrix; - using VectorNplusONEd = Eigen::Matrix; - using Matrix2x3d = Eigen::Matrix; - using Matrix5d = Eigen::Matrix; - using Vector5d = Eigen::Matrix; - using u_int = unsigned int; - - /* - using MatrixNd = Eigen::Matrix; - using MatrixNplusONEd = Eigen::Matrix; - using Matrix3Nd = Eigen::Matrix; - using Matrix2xNd = Eigen::Matrix; - using Matrix3xNd = Eigen::Matrix; - using VectorNd = Eigen::Matrix; - using VectorNplusONEd = Eigen::Matrix; - using Matrix2x3d = Eigen::Matrix; - using Matrix5d = Eigen::Matrix; - using Vector5d = Eigen::Matrix; - using u_int = unsigned int; - */ - - struct karimaki_circle_fit { - Vector3d par; //!< Karimäki's parameters: (phi, d, k=1/R) - Matrix3d cov; - /*!< covariance matrix: \n - |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n - |cov(phi, d )|cov( d , d )|cov( k , d )| \n - |cov(phi, k )|cov( d , k )|cov( k , k )| - */ - int q; //!< particle charge - double chi2; - }; - - struct line_fit { - Vector2d par; //!< parameters: (cotan(theta),Zip) - Matrix2d cov; - /*!< covariance matrix: \n - |cov(c_t,c_t)|cov(Zip,c_t)| \n - |cov(c_t,Zip)|cov(Zip,Zip)| - */ - double chi2; - }; - - struct helix_fit { - Vector5d par; //!< parameters: (phi,Tip,p_t,cotan(theta)),Zip) - Matrix5d cov; - /*!< covariance matrix: \n - |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n - |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n - |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n - |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n - |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| - */ - double chi2_circle; - double chi2_line; - Vector4d fast_fit; - int q; //!< particle charge - } __attribute__ ((aligned(16)) ); - - /*! - \brief data needed for the Broken Line fit procedure. - */ - struct PreparedBrokenLineData { - int q; //!< particle charge - Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin - VectorNd s; //!< total distance traveled in the transverse plane starting from the pre-fitted closest approach - VectorNd S; //!< total distance traveled (three-dimensional) - VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane - VectorNd VarBeta; //!< kink angles in the SZ plane - }; - - /*! - \brief raise to square. - */ - __host__ __device__ inline double sqr(const double a) { - return a*a; - } - - /*! - \brief Computes the Coulomb multiple scattering variance of the planar angle. 
- - \param length length of the track in the material. - \param B magnetic field in Gev/cm/c. - \param R radius of curvature (needed to evaluate p). - \param Layer denotes which of the four layers of the detector is the endpoint of the multiple scattered track. For example, if Layer=3, then the particle has just gone through the material between the second and the third layer. - - \todo add another Layer variable to identify also the start point of the track, so if there are missing hits or multiple hits, the part of the detector that the particle has traversed can be exactly identified. - - \warning the formula used here assumes beta=1, and so neglects the dependence of theta_0 on the mass of the particle at fixed momentum. - - \return the variance of the planar angle ((theta_0)^2 /3). - */ - __host__ __device__ inline double MultScatt(const double& length, const double B, const double& R, int Layer, double slope) { - double XX_0; //!< radiation length of the material in cm - if(Layer==1) XX_0=16/0.06; - else XX_0=16/0.06; - XX_0*=1; - double geometry_factor=0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned - return geometry_factor*sqr((13.6/1000)/(1*B*R*sqrt(1+sqr(slope))))*(abs(length)/XX_0)*sqr(1+0.038*log(abs(length)/XX_0)); - } - - /*! - \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0. - - \param slope tangent of the angle of rotation. - - \return 2D rotation matrix. - */ - __host__ __device__ inline Matrix2d RotationMatrix(const double& slope) { - Matrix2d Rot; - Rot(0,0)=1/sqrt(1+sqr(slope)); - Rot(0,1)=slope*Rot(0,0); - Rot(1,0)=-Rot(0,1); - Rot(1,1)=Rot(0,0); - return Rot; - } - - /*! - \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a translation of the coordinate system, such that the old origin has coordinates (x0,y0) in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. - - \param circle circle fit in the old coordinate system. - \param x0 x coordinate of the translation vector. - \param y0 y coordinate of the translation vector. - \param Jacob passed by reference in order to save stack. - */ - __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, const double& x0, const double& y0, Matrix3d& Jacob) { - double A,U,BB,C,DO,DP,uu,xi,v,mu,lambda,zeta; - DP=x0*cos(circle.par(0))+y0*sin(circle.par(0)); - DO=x0*sin(circle.par(0))-y0*cos(circle.par(0))+circle.par(1); - uu=1+circle.par(2)*circle.par(1); - C=-circle.par(2)*y0+uu*cos(circle.par(0)); - BB=circle.par(2)*x0+uu*sin(circle.par(0)); - A=2*DO+circle.par(2)*(sqr(DO)+sqr(DP)); - U=sqrt(1+circle.par(2)*A); - xi=1/(sqr(BB)+sqr(C)); - v=1+circle.par(2)*DO; - lambda=(A/2)/(U*sqr(1+U)); - mu=1/(U*(1+U))+circle.par(2)*lambda; - zeta=sqr(DO)+sqr(DP); - - Jacob << xi*uu*v, -xi*sqr(circle.par(2))*DP, xi*DP, - 2*mu*uu*DP, 2*mu*v, mu*zeta-lambda*A, - 0, 0, 1; - - circle.par(0)=atan2(BB,C); - circle.par(1)=A/(1+U); - // circle.par(2)=circle.par(2); - - circle.cov=Jacob*circle.cov*Jacob.transpose(); - } - - /*! - \brief Compute cross product of two 2D vector (assuming z component 0), returning the z component of the result. - - \param a first 2D vector in the product. - \param b second 2D vector in the product. - - \return z component of the cross product. 
- */ - - __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) { - return a.x()*b.y()-a.y()*b.x(); - } - - /*! - \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. - - \param hits hits coordinates. - \param hits_cov hits covariance matrix. - \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). - \param B magnetic field in Gev/cm/c. - \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). - */ - __host__ __device__ inline void PrepareBrokenLineData(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const Vector4d& fast_fit, - const double B, - PreparedBrokenLineData & results) { - u_int n=hits.cols(); - u_int i; - Vector2d d; - Vector2d e; - results.radii=Matrix2xNd::Zero(2,n); - results.s=VectorNd::Zero(n); - results.S=VectorNd::Zero(n); - results.Z=VectorNd::Zero(n); - results.VarBeta=VectorNd::Zero(n); - - results.q=1; - d=hits.block(0,1,2,1)-hits.block(0,0,2,1); - e=hits.block(0,n-1,2,1)-hits.block(0,n-2,2,1); - if(cross2D(d,e)>0) results.q=-1; - - const double slope=-results.q/fast_fit(3); - - Matrix2d R=RotationMatrix(slope); - - // calculate radii and s - results.radii=hits.block(0,0,2,n)-fast_fit.head(2)*MatrixXd::Constant(1,n,1); - e=-fast_fit(2)*fast_fit.head(2)/fast_fit.head(2).norm(); - for(i=0;i1) C_U(i,i)+=1/(VarBeta(i-1)*sqr(S(i)-S(i-1))); - if(i>0 && i0 && i0 && i1) { - C_U(i,n)+=(s(i)-s(i-2))/(2*VarBeta(i-1)*(s(i)-s(i-1))); - C_U(n,i)=C_U(i,n); - } - if(i0 && i0 && i=0); - } - - /*! - \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u). - - \param hits hits coordinates. - \param hits_cov hits covariance matrix. - \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). - \param B magnetic field in Gev/cm/c. - \param data PreparedBrokenLineData. - \param line_results struct to be filled with the results in this form: - -par parameter of the line in this form: (cot(theta), Zip); \n - -cov covariance matrix of the fitted parameter; \n - -chi2 value of the cost function in the minimum. - - \details The function implements the steps 2 and 3 of the Broken Line fit without the curvature correction.\n - The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and their covariance matrix. - The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. - */ - - __host__ __device__ inline void BL_Line_fit(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const Vector4d& fast_fit, - const double B, - const PreparedBrokenLineData& data, - line_fit & line_results) { - u_int n=hits.cols(); - u_int i; - - const Matrix2xNd& radii=data.radii; - const VectorNd& S=data.S; - const VectorNd& Z=data.Z; - const VectorNd& VarBeta=data.VarBeta; - - const double slope=-data.q/fast_fit(3); - Matrix2d R=RotationMatrix(slope); - - Matrix3d V=Matrix3d::Zero(); // covariance matrix XYZ - Matrix2x3d JacobXYZtosZ=Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) - VectorNd w=VectorNd::Zero(n); - for(i=0;i0 && i=0); - } - - /*! 
- \brief Helix fit by three step: - -fast pre-fit (see Fast_fit() for further info); \n - -circle fit of the hits projected in the transverse plane by Broken Line algorithm (see BL_Circle_fit() for further info); \n - -line fit of the hits projected on the (pre-fitted) cilinder surface by Broken Line algorithm (see BL_Line_fit() for further info); \n - Points must be passed ordered (from inner to outer layer). - - \param hits Matrix3xNd hits coordinates in this form: \n - |x1|x2|x3|...|xn| \n - |y1|y2|y3|...|yn| \n - |z1|z2|z3|...|zn| - \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n - |(x1,x1)|(x2,x1)|(x3,x1)|(x4,x1)|.|(y1,x1)|(y2,x1)|(y3,x1)|(y4,x1)|.|(z1,x1)|(z2,x1)|(z3,x1)|(z4,x1)| \n - |(x1,x2)|(x2,x2)|(x3,x2)|(x4,x2)|.|(y1,x2)|(y2,x2)|(y3,x2)|(y4,x2)|.|(z1,x2)|(z2,x2)|(z3,x2)|(z4,x2)| \n - |(x1,x3)|(x2,x3)|(x3,x3)|(x4,x3)|.|(y1,x3)|(y2,x3)|(y3,x3)|(y4,x3)|.|(z1,x3)|(z2,x3)|(z3,x3)|(z4,x3)| \n - |(x1,x4)|(x2,x4)|(x3,x4)|(x4,x4)|.|(y1,x4)|(y2,x4)|(y3,x4)|(y4,x4)|.|(z1,x4)|(z2,x4)|(z3,x4)|(z4,x4)| \n - . . . . . . . . . . . . . . . \n - |(x1,y1)|(x2,y1)|(x3,y1)|(x4,y1)|.|(y1,y1)|(y2,y1)|(y3,x1)|(y4,y1)|.|(z1,y1)|(z2,y1)|(z3,y1)|(z4,y1)| \n - |(x1,y2)|(x2,y2)|(x3,y2)|(x4,y2)|.|(y1,y2)|(y2,y2)|(y3,x2)|(y4,y2)|.|(z1,y2)|(z2,y2)|(z3,y2)|(z4,y2)| \n - |(x1,y3)|(x2,y3)|(x3,y3)|(x4,y3)|.|(y1,y3)|(y2,y3)|(y3,x3)|(y4,y3)|.|(z1,y3)|(z2,y3)|(z3,y3)|(z4,y3)| \n - |(x1,y4)|(x2,y4)|(x3,y4)|(x4,y4)|.|(y1,y4)|(y2,y4)|(y3,x4)|(y4,y4)|.|(z1,y4)|(z2,y4)|(z3,y4)|(z4,y4)| \n - . . . . . . . . . . . . . . . \n - |(x1,z1)|(x2,z1)|(x3,z1)|(x4,z1)|.|(y1,z1)|(y2,z1)|(y3,z1)|(y4,z1)|.|(z1,z1)|(z2,z1)|(z3,z1)|(z4,z1)| \n - |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n - |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n - |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)| - \param B magnetic field in the center of the detector in Gev/cm/c, in order to perform the p_t calculation. - - \warning see BL_Circle_fit(), BL_Line_fit() and Fast_fit() warnings. - - \bug see BL_Circle_fit(), BL_Line_fit() and Fast_fit() bugs. - - \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. 
- */ - - __host__ __device__ inline helix_fit Helix_fit(const Matrix3xNd& hits, - const Matrix3Nd& hits_cov, - const double B) { - helix_fit helix; - - helix.fast_fit=BL_Fast_fit(hits); - - PreparedBrokenLineData data; - karimaki_circle_fit circle; - line_fit line; - Matrix3d Jacob; - MatrixNplusONEd C_U; - - PrepareBrokenLineData(hits,hits_cov,helix.fast_fit,B,data); - BL_Line_fit(hits,hits_cov,helix.fast_fit,B,data,line); - BL_Circle_fit(hits,hits_cov,helix.fast_fit,B,data,circle,Jacob,C_U); - - // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix - Jacob << 1,0,0, - 0,1,0, - 0,0,-abs(circle.par(2))*B/(sqr(circle.par(2))*circle.par(2)); - circle.par(2)=B/abs(circle.par(2)); - circle.cov=Jacob*circle.cov*Jacob.transpose(); - - helix.par << circle.par, line.par; - helix.cov=MatrixXd::Zero(5, 5); - helix.cov.block(0,0,3,3)=circle.cov; - helix.cov.block(3,3,2,2)=line.cov; - helix.q=circle.q; - helix.chi2_circle=circle.chi2; - helix.chi2_line=line.chi2; - - return helix; - } - + + using namespace Rfit; + + //!< Karimäki's parameters: (phi, d, k=1/R) + /*!< covariance matrix: \n + |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n + |cov(phi, d )|cov( d , d )|cov( k , d )| \n + |cov(phi, k )|cov( d , k )|cov( k , k )| + */ + using karimaki_circle_fit = Rfit::circle_fit; + + + /*! + \brief data needed for the Broken Line fit procedure. + */ + template + struct PreparedBrokenLineData { + int q; //!< particle charge + Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + VectorNd s; //!< total distance traveled in the transverse plane starting from the pre-fitted closest approach + VectorNd S; //!< total distance traveled (three-dimensional) + VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane + VectorNd VarBeta; //!< kink angles in the SZ plane + }; + + /*! + \brief Computes the Coulomb multiple scattering variance of the planar angle. + + \param length length of the track in the material. + \param B magnetic field in Gev/cm/c. + \param R radius of curvature (needed to evaluate p). + \param Layer denotes which of the four layers of the detector is the endpoint of the multiple scattered track. For example, if Layer=3, then the particle has just gone through the material between the second and the third layer. + + \todo add another Layer variable to identify also the start point of the track, so if there are missing hits or multiple hits, the part of the detector that the particle has traversed can be exactly identified. + + \warning the formula used here assumes beta=1, and so neglects the dependence of theta_0 on the mass of the particle at fixed momentum. + + \return the variance of the planar angle ((theta_0)^2 /3). + */ + __host__ __device__ inline double MultScatt(const double& length, const double B, const double R, int Layer, double slope) { + // limit R to 20GeV... + auto pt2 = std::min(20.,B*R); + pt2 *=pt2; + constexpr double XXI_0 = 0.06/16.; //!< inverse of radiation length of the material in cm + //if(Layer==1) XXI_0=0.06/16.; + // else XXI_0=0.06/16.; + //XX_0*=1; + constexpr double geometry_factor=0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned + constexpr double fact = geometry_factor*sqr(13.6/1000.); + return fact/(pt2*(1.+sqr(slope))) + *(std::abs(length)*XXI_0)*sqr(1.+0.038*log(std::abs(length)*XXI_0)); + } + + /*! + \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0. 
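+
+   Written out (matching the implementation below): \n
+   |  1/sqrt(1+slope^2)      slope/sqrt(1+slope^2) | \n
+   | -slope/sqrt(1+slope^2)  1/sqrt(1+slope^2)     |
+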
+ + \param slope tangent of the angle of rotation. + + \return 2D rotation matrix. + */ + __host__ __device__ inline Matrix2d RotationMatrix(double slope) { + Matrix2d Rot; + Rot(0,0)=1./sqrt(1.+sqr(slope)); + Rot(0,1)=slope*Rot(0,0); + Rot(1,0)=-Rot(0,1); + Rot(1,1)=Rot(0,0); + return Rot; + } + + /*! + \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a translation of the coordinate system, such that the old origin has coordinates (x0,y0) in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. + + \param circle circle fit in the old coordinate system. + \param x0 x coordinate of the translation vector. + \param y0 y coordinate of the translation vector. + \param Jacob passed by reference in order to save stack. + */ + __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, double x0, double y0, Matrix3d& Jacob) { + double A,U,BB,C,DO,DP,uu,xi,v,mu,lambda,zeta; + DP=x0*cos(circle.par(0))+y0*sin(circle.par(0)); + DO=x0*sin(circle.par(0))-y0*cos(circle.par(0))+circle.par(1); + uu=1+circle.par(2)*circle.par(1); + C=-circle.par(2)*y0+uu*cos(circle.par(0)); + BB=circle.par(2)*x0+uu*sin(circle.par(0)); + A=2.*DO+circle.par(2)*(sqr(DO)+sqr(DP)); + U=sqrt(1.+circle.par(2)*A); + xi=1./(sqr(BB)+sqr(C)); + v=1.+circle.par(2)*DO; + lambda=(0.5*A)/(U*sqr(1.+U)); + mu=1./(U*(1.+U))+circle.par(2)*lambda; + zeta=sqr(DO)+sqr(DP); + + Jacob << xi*uu*v, -xi*sqr(circle.par(2))*DP, xi*DP, + 2.*mu*uu*DP, 2.*mu*v, mu*zeta-lambda*A, + 0, 0, 1.; + + circle.par(0)=atan2(BB,C); + circle.par(1)=A/(1+U); + // circle.par(2)=circle.par(2); + + circle.cov=Jacob*circle.cov*Jacob.transpose(); + } + + + /*! + \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param B magnetic field in Gev/cm/c. + \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). + */ + template + __host__ __device__ inline void prepareBrokenLineData(const M3xN& hits, + const V4& fast_fit, + const double B, + PreparedBrokenLineData & results) { + constexpr auto n = N; + u_int i; + Vector2d d; + Vector2d e; + + d=hits.block(0,1,2,1)-hits.block(0,0,2,1); + e=hits.block(0,n-1,2,1)-hits.block(0,n-2,2,1); + results.q = cross2D(d,e)>0 ? 
-1 : 1; + + const double slope=-results.q/fast_fit(3); + + Matrix2d R=RotationMatrix(slope); + + // calculate radii and s + results.radii=hits.block(0,0,2,n)-fast_fit.head(2)*MatrixXd::Constant(1,n,1); + e=-fast_fit(2)*fast_fit.head(2)/fast_fit.head(2).norm(); + for(i=0;i z=hits.block(2,0,1,n).transpose(); + + //calculate S and Z + Matrix2xNd pointsSZ=Matrix2xNd::Zero(); + for(i=0;i + __host__ __device__ inline MatrixNd MatrixC_u(const VectorNd& w, const VectorNd& S, const VectorNd& VarBeta) { + constexpr u_int n=N; + u_int i; + + MatrixNd C_U=MatrixNd::Zero(); + for(i=0;i1) C_U(i,i)+=1./(VarBeta(i-1)*sqr(S(i)-S(i-1))); + if(i>0 && i0 && i + __host__ __device__ inline void BL_Fast_fit(const M3xN& hits, V4 & result) + { + + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; // get the number of hits + + const Vector2d a=hits.block(0,n/2,2,1)-hits.block(0,0,2,1); + const Vector2d b=hits.block(0,n-1,2,1)-hits.block(0,n/2,2,1); + const Vector2d c=hits.block(0,0,2,1)-hits.block(0,n-1,2,1); + + auto tmp = 0.5/cross2D(c,a); + result(0)=hits(0,0)-(a(1)*c.squaredNorm()+c(1)*a.squaredNorm())*tmp; + result(1)=hits(1,0)+(a(0)*c.squaredNorm()+c(0)*a.squaredNorm())*tmp; + // check Wikipedia for these formulas + + result(2)=sqrt(a.squaredNorm()*b.squaredNorm()*c.squaredNorm())/(2.*std::abs(cross2D(b,a))); + // Using Math Olympiad's formula R=abc/(4A) + + const Vector2d d=hits.block(0,0,2,1)-result.head(2); + const Vector2d e=hits.block(0,n-1,2,1)-result.head(2); + + result(3)=result(2)*atan2(cross2D(d, e), d.dot(e))/(hits(2,n-1)-hits(2,0)); + // ds/dz slope between last and first point + + } + + /*! + \brief Performs the Broken Line fit in the curved track case (that is, the fit parameters are the interceptions u and the curvature correction \Delta\kappa). + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param B magnetic field in Gev/cm/c. + \param data PreparedBrokenLineData. + \param circle_results struct to be filled with the results in this form: + -par parameter of the line in this form: (phi, d, k); \n + -cov covariance matrix of the fitted parameter; \n + -chi2 value of the cost function in the minimum. + + \details The function implements the steps 2 and 3 of the Broken Line fit with the curvature correction.\n + The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and \Delta\kappa and their covariance matrix. + The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. + */ + template + __host__ __device__ inline void BL_Circle_fit(const M3xN& hits, + const M6xN & hits_ge, + const V4& fast_fit, + const double B, + PreparedBrokenLineData& data, + karimaki_circle_fit & circle_results + ) { + + constexpr u_int n = N; + u_int i; + + circle_results.q=data.q; + auto & radii=data.radii; + const auto & s=data.s; + const auto & S=data.S; + auto & Z=data.Z; + auto & VarBeta=data.VarBeta; + const double slope=-circle_results.q/fast_fit(3); + VarBeta*=1.+sqr(slope); // the kink angles are projected! 
+ + for(i=0;i w; // weights + Matrix2d RR; // rotation matrix point by point + //double Slope; // slope of the circle point by point + for(i=0;i r_u; + r_u(n)=0; + for(i=0;i C_U; + C_U.block(0,0,n,n)=MatrixC_u(w,s,VarBeta); + C_U(n,n) =0; + //add the border to the C_u matrix + for(i=0;i0 && i1) { + C_U(i,n)+=(s(i)-s(i-2))/(2.*VarBeta(i-1)*(s(i)-s(i-1))); + } + if(i0 && i I; + choleskyInversion::invert(C_U,I); + // MatrixNplusONEd I = C_U.inverse(); +#ifdef CPP_DUMP + std::cout << "I5\n" << I << std::endl; +#endif + + + VectorNplusONEd u = I*r_u; // obtain the fitted parameters by solving the linear system + + // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... + + radii.block(0,0,2,1)/=radii.block(0,0,2,1).norm(); + radii.block(0,1,2,1)/=radii.block(0,1,2,1).norm(); + + Vector2d d=hits.block(0,0,2,1)+(-Z(0)+u(0))*radii.block(0,0,2,1); + Vector2d e=hits.block(0,1,2,1)+(-Z(1)+u(1))*radii.block(0,1,2,1); + + circle_results.par << atan2((e-d)(1),(e-d)(0)), + -circle_results.q*(fast_fit(2)-sqrt(sqr(fast_fit(2))- 0.25*(e-d).squaredNorm())), + circle_results.q*(1./fast_fit(2)+u(n)); + + assert(circle_results.q*circle_results.par(1)<=0); + + Vector2d eMinusd=e-d; + double tmp1=eMinusd.squaredNorm(); + + Matrix3d Jacob; + Jacob << (radii(1,0)*eMinusd(0)-eMinusd(1)*radii(0,0))/tmp1,(radii(1,1)*eMinusd(0)-eMinusd(1)*radii(0,1))/tmp1,0, + (circle_results.q/2)*(eMinusd(0)*radii(0,0)+eMinusd(1)*radii(1,0))/sqrt(sqr(2*fast_fit(2))-tmp1),(circle_results.q/2)*(eMinusd(0)*radii(0,1)+eMinusd(1)*radii(1,1))/sqrt(sqr(2*fast_fit(2))-tmp1),0, + 0,0,circle_results.q; + + circle_results.cov << I(0,0), I(0,1), I(0,n), + I(1,0), I(1,1), I(1,n), + I(n,0), I(n,1), I(n,n); + + circle_results.cov=Jacob*circle_results.cov*Jacob.transpose(); + + //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... + + TranslateKarimaki(circle_results,0.5*(e-d)(0),0.5*(e-d)(1),Jacob); + circle_results.cov(0,0)+=(1+sqr(slope))*MultScatt(S(1)-S(0),B,fast_fit(2),2,slope); + + //...And translate back to the original system + + TranslateKarimaki(circle_results,d(0),d(1),Jacob); + + // compute chi2 + circle_results.chi2=0; + for(i=0;i0 && i=0); + } + + /*! + \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u). + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param B magnetic field in Gev/cm/c. + \param data PreparedBrokenLineData. + \param line_results struct to be filled with the results in this form: + -par parameter of the line in this form: (cot(theta), Zip); \n + -cov covariance matrix of the fitted parameter; \n + -chi2 value of the cost function in the minimum. + + \details The function implements the steps 2 and 3 of the Broken Line fit without the curvature correction.\n + The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and their covariance matrix. + The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. 
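To make step 2 concrete for the straight-line case, here is a minimal standalone sketch with invented numbers: the kink (multiple-scattering) couplings are dropped so that the weight matrix reduces to a diagonal, the linear system is solved for the interceptions u, and the slope and intercept are then derived from u(0), u(1) and the arc lengths in the same way BL_Line_fit reads them off; Eigen's LDLT solver stands in for choleskyInversion to keep the example self-contained.

#include <Eigen/Dense>
#include <iostream>

int main() {
  // Invented inputs: four hits along an (almost) straight track.
  Eigen::Vector4d S(0., 3., 6., 9.);                    // arc lengths of the hits
  Eigen::Vector4d Z(0.010, 0.018, 0.041, 0.052);        // residuals w.r.t. the pre-fitted line
  Eigen::Vector4d w = Eigen::Vector4d::Constant(1.e4);  // per-hit weights (inverse variances)

  // Without the scattering terms the Broken Line matrix reduces to diag(w);
  // the full MatrixC_u() adds the kink-angle couplings on top of this.
  Eigen::Matrix4d C_u = Eigen::Matrix4d::Zero();
  C_u.diagonal() = w;
  Eigen::Vector4d r_u = w.cwiseProduct(Z);

  // Step 2: solve the linear system for the fitted interceptions u.
  Eigen::Vector4d u = C_u.ldlt().solve(r_u);

  // Step 3 (frame of the first hit): slope and intercept of the fitted line.
  double slope = (u(1) - u(0)) / (S(1) - S(0));
  double intercept = u(0) - slope * S(0);
  std::cout << "slope: " << slope << "  intercept: " << intercept << std::endl;
}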
+ */ + template + __host__ __device__ inline void BL_Line_fit(const M6xN & hits_ge, + const V4& fast_fit, + const double B, + const PreparedBrokenLineData& data, + line_fit & line_results) { + constexpr u_int n = N; + u_int i; + + const auto & radii=data.radii; + const auto & S=data.S; + const auto & Z=data.Z; + const auto& VarBeta=data.VarBeta; + + const double slope=-data.q/fast_fit(3); + Matrix2d R=RotationMatrix(slope); + + Matrix3d V=Matrix3d::Zero(); // covariance matrix XYZ + Matrix2x3d JacobXYZtosZ=Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + VectorNd w=VectorNd::Zero(); + for(i=0;i r_u; + for(i=0;i I; choleskyInversion::invert(MatrixC_u(w,S,VarBeta),I); + // MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); +#ifdef CPP_DUMP + std::cout << "I4\n" << I << std::endl; +#endif + + VectorNd u=I*r_u; // obtain the fitted parameters by solving the linear system + + // line parameters in the system in which the first hit is the origin and with axis along SZ + line_results.par << (u(1)-u(0))/(S(1)-S(0)), u(0); + auto idiff = 1./(S(1)-S(0)); + line_results.cov << (I(0,0)-2*I(0,1)+I(1,1))*sqr(idiff)+MultScatt(S(1)-S(0),B,fast_fit(2),2,slope), + (I(0,1)-I(0,0))*idiff, + (I(0,1)-I(0,0))*idiff, I(0,0); + + // translate to the original SZ system + Matrix2d Jacob; + Jacob(0,0)=1.; + Jacob(0,1)=0; + Jacob(1,0)=-S(0); + Jacob(1,1)=1.; + line_results.par(1)+=-line_results.par(0)*S(0); + line_results.cov=Jacob*line_results.cov*Jacob.transpose(); + + // rotate to the original sz system + auto tmp=R(0,0)-line_results.par(0)*R(0,1); + Jacob(1,1)=1./tmp; + Jacob(0,0)=Jacob(1,1)*Jacob(1,1); + Jacob(0,1)=0; + Jacob(1,0)=line_results.par(1)*R(0,1)*Jacob(0,0); + line_results.par(1)=line_results.par(1)*Jacob(1,1); + line_results.par(0)=(R(0,1)+line_results.par(0)*R(0,0))*Jacob(1,1); + line_results.cov=Jacob*line_results.cov*Jacob.transpose(); + + // compute chi2 + line_results.chi2=0; + for(i=0;i0 && i=0); + } + + /*! + \brief Helix fit by three step: + -fast pre-fit (see Fast_fit() for further info); \n + -circle fit of the hits projected in the transverse plane by Broken Line algorithm (see BL_Circle_fit() for further info); \n + -line fit of the hits projected on the (pre-fitted) cilinder surface by Broken Line algorithm (see BL_Line_fit() for further info); \n + Points must be passed ordered (from inner to outer layer). + + \param hits Matrix3xNd hits coordinates in this form: \n + |x1|x2|x3|...|xn| \n + |y1|y2|y3|...|yn| \n + |z1|z2|z3|...|zn| + \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n + |(x1,x1)|(x2,x1)|(x3,x1)|(x4,x1)|.|(y1,x1)|(y2,x1)|(y3,x1)|(y4,x1)|.|(z1,x1)|(z2,x1)|(z3,x1)|(z4,x1)| \n + |(x1,x2)|(x2,x2)|(x3,x2)|(x4,x2)|.|(y1,x2)|(y2,x2)|(y3,x2)|(y4,x2)|.|(z1,x2)|(z2,x2)|(z3,x2)|(z4,x2)| \n + |(x1,x3)|(x2,x3)|(x3,x3)|(x4,x3)|.|(y1,x3)|(y2,x3)|(y3,x3)|(y4,x3)|.|(z1,x3)|(z2,x3)|(z3,x3)|(z4,x3)| \n + |(x1,x4)|(x2,x4)|(x3,x4)|(x4,x4)|.|(y1,x4)|(y2,x4)|(y3,x4)|(y4,x4)|.|(z1,x4)|(z2,x4)|(z3,x4)|(z4,x4)| \n + . . . . . . . . . . . . . . . \n + |(x1,y1)|(x2,y1)|(x3,y1)|(x4,y1)|.|(y1,y1)|(y2,y1)|(y3,x1)|(y4,y1)|.|(z1,y1)|(z2,y1)|(z3,y1)|(z4,y1)| \n + |(x1,y2)|(x2,y2)|(x3,y2)|(x4,y2)|.|(y1,y2)|(y2,y2)|(y3,x2)|(y4,y2)|.|(z1,y2)|(z2,y2)|(z3,y2)|(z4,y2)| \n + |(x1,y3)|(x2,y3)|(x3,y3)|(x4,y3)|.|(y1,y3)|(y2,y3)|(y3,x3)|(y4,y3)|.|(z1,y3)|(z2,y3)|(z3,y3)|(z4,y3)| \n + |(x1,y4)|(x2,y4)|(x3,y4)|(x4,y4)|.|(y1,y4)|(y2,y4)|(y3,x4)|(y4,y4)|.|(z1,y4)|(z2,y4)|(z3,y4)|(z4,y4)| \n + . . . . . . . . . . . . . . . 
\n + |(x1,z1)|(x2,z1)|(x3,z1)|(x4,z1)|.|(y1,z1)|(y2,z1)|(y3,z1)|(y4,z1)|.|(z1,z1)|(z2,z1)|(z3,z1)|(z4,z1)| \n + |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n + |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n + |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)| + \param B magnetic field in the center of the detector in Gev/cm/c, in order to perform the p_t calculation. + + \warning see BL_Circle_fit(), BL_Line_fit() and Fast_fit() warnings. + + \bug see BL_Circle_fit(), BL_Line_fit() and Fast_fit() bugs. + + \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. + */ + template + inline helix_fit BL_Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B) +{ + helix_fit helix; + Vector4d fast_fit; + BL_Fast_fit(hits,fast_fit); + + PreparedBrokenLineData data; + karimaki_circle_fit circle; + line_fit line; + Matrix3d Jacob; + + prepareBrokenLineData(hits,fast_fit,B,data); + BL_Line_fit(hits_ge,fast_fit,B,data,line); + BL_Circle_fit(hits,hits_ge,fast_fit,B,data,circle); + + // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix + Jacob << 1.,0,0, + 0,1.,0, + 0,0,-std::abs(circle.par(2))*B/(sqr(circle.par(2))*circle.par(2)); + circle.par(2)=B/std::abs(circle.par(2)); + circle.cov=Jacob*circle.cov*Jacob.transpose(); + + helix.par << circle.par, line.par; + helix.cov=MatrixXd::Zero(5, 5); + helix.cov.block(0,0,3,3)=circle.cov; + helix.cov.block(3,3,2,2)=line.cov; + helix.q=circle.q; + helix.chi2_circle=circle.chi2; + helix.chi2_line=line.chi2; + + return helix; + } + } // namespace BrokenLine #endif diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h index ba0f0aa13e1a6..0d9be5a346d0a 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h @@ -2,97 +2,58 @@ #define RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h #include +#include #include #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" namespace Rfit { -constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) -constexpr unsigned int max_nop = 4; //!< In order to avoid use of dynamic memory + using Vector2d = Eigen::Vector2d; + using Vector3d = Eigen::Vector3d; + using Vector4d = Eigen::Vector4d; + using Vector5d = Eigen::Matrix; + using Matrix2d = Eigen::Matrix2d; + using Matrix3d = Eigen::Matrix3d; + using Matrix4d = Eigen::Matrix4d; + using Matrix5d = Eigen::Matrix; + using Matrix6d = Eigen::Matrix; -using VectorXd = Eigen::VectorXd; -using MatrixXd = Eigen::MatrixXd; -template -using MatrixNd = Eigen::Matrix; -template -using ArrayNd = Eigen::Array; -template -using Matrix2Nd = Eigen::Matrix; -template -using Matrix3Nd = Eigen::Matrix; -template -using Matrix2xNd = Eigen::Matrix; -template -using Array2xNd = Eigen::Array; -template -using Matrix3xNd = Eigen::Matrix; -template -using MatrixNx3d = Eigen::Matrix; -template -using MatrixNx5d = Eigen::Matrix; -template -using VectorNd = Eigen::Matrix; -template -using Vector2Nd = Eigen::Matrix; -template -using Vector3Nd = Eigen::Matrix; -template -using RowVectorNd = Eigen::Matrix; -template -using RowVector2Nd = Eigen::Matrix; + template + using Matrix3xNd = Eigen::Matrix; // 
used for inputs hits -using Vector2d = Eigen::Vector2d; -using Vector3d = Eigen::Vector3d; -using Vector4d = Eigen::Vector4d; -using Matrix2d = Eigen::Matrix2d; -using Matrix3d = Eigen::Matrix3d; -using Matrix4d = Eigen::Matrix4d; -using Matrix5d = Eigen::Matrix; -using Matrix6d = Eigen::Matrix; -using Vector5d = Eigen::Matrix; - -using Matrix3f = Eigen::Matrix3f; -using Vector3f = Eigen::Vector3f; -using Vector4f = Eigen::Vector4f; -using Vector6f = Eigen::Matrix; - -using u_int = unsigned int; - - -struct circle_fit -{ + struct circle_fit + { Vector3d par; //!< parameter: (X0,Y0,R) Matrix3d cov; /*!< covariance matrix: \n |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n |cov(X0, R)|cov(Y0, R)|cov( R, R)| - */ + */ int32_t q; //!< particle charge float chi2 = 0.0; -}; - -struct line_fit -{ + }; + + struct line_fit + { Vector2d par; //!<(cotan(theta),Zip) Matrix2d cov; /*!< |cov(c_t,c_t)|cov(Zip,c_t)| \n |cov(c_t,Zip)|cov(Zip,Zip)| - */ + */ double chi2 = 0.0; -}; - -struct helix_fit -{ + }; + + struct helix_fit + { Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) Matrix5d cov; /*!< ()->cov() \n @@ -101,12 +62,12 @@ struct helix_fit |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| - */ + */ float chi2_circle; float chi2_line; -// Vector4d fast_fit; + // Vector4d fast_fit; int32_t q; //!< particle charge -} __attribute__((aligned(16))); + }; // __attribute__((aligned(16))); } // namespace RFit #endif diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h new file mode 100644 index 0000000000000..03ccb011645ec --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h @@ -0,0 +1,203 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h +#define RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h + + +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "DataFormats/Math/interface/choleskyInversion.h" + + +namespace Rfit +{ + + + constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) + + + + using VectorXd = Eigen::VectorXd; + using MatrixXd = Eigen::MatrixXd; + template + using MatrixNd = Eigen::Matrix; + template + using MatrixNplusONEd = Eigen::Matrix; + template + using ArrayNd = Eigen::Array; + template + using Matrix2Nd = Eigen::Matrix; + template + using Matrix3Nd = Eigen::Matrix; + template + using Matrix2xNd = Eigen::Matrix; + template + using Array2xNd = Eigen::Array; + template + using MatrixNx3d = Eigen::Matrix; + template + using MatrixNx5d = Eigen::Matrix; + template + using VectorNd = Eigen::Matrix; + template + using VectorNplusONEd = Eigen::Matrix; + template + using Vector2Nd = Eigen::Matrix; + template + using Vector3Nd = Eigen::Matrix; + template + using RowVectorNd = Eigen::Matrix; + template + using RowVector2Nd = Eigen::Matrix; + + + using Matrix2x3d = Eigen::Matrix; + + + using Matrix3f = Eigen::Matrix3f; + using Vector3f = Eigen::Vector3f; + using Vector4f = Eigen::Vector4f; + using Vector6f = Eigen::Matrix; + + + + + using u_int = unsigned int; + + + + + template + __host__ __device__ void printIt(C* m, const char* prefix = "") + { +#ifdef RFIT_DEBUG + for (u_int r = 0; r < m->rows(); ++r) + { + for (u_int c = 0; c < m->cols(); ++c) + { + printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, 
(*m)(r, c)); + } + } +#endif + } + + /*! + \brief raise to square. + */ + template + constexpr T sqr(const T a) + { + return a * a; + } + + /*! + \brief Compute cross product of two 2D vector (assuming z component 0), + returning z component of the result. + \param a first 2D vector in the product. + \param b second 2D vector in the product. + \return z component of the cross product. + */ + + __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) + { + return a.x() * b.y() - a.y() * b.x(); + } + + /*! + * load error in CMSSW format to our formalism + * + */ + template + __host__ __device__ void loadCovariance2D(M6xNf const & ge, M2Nd & hits_cov) { + // Index numerology: + // i: index of the hits/point (0,..,3) + // j: index of space component (x,y,z) + // l: index of space components (x,y,z) + // ge is always in sync with the index i and is formatted as: + // ge[] ==> [xx, xy, yy, xz, yz, zz] + // in (j,l) notation, we have: + // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] + // so the index ge_idx corresponds to the matrix elements: + // | 0 1 3 | + // | 1 2 4 | + // | 3 4 5 | + constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; + for (uint32_t i=0; i< hits_in_fit; ++i) { + auto ge_idx = 0; auto j=0; auto l=0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 2; j=1; l=1; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 1; j=1; l=0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + } + + template + __host__ __device__ void loadCovariance(M6xNf const & ge, M3xNd & hits_cov) { + + // Index numerology: + // i: index of the hits/point (0,..,3) + // j: index of space component (x,y,z) + // l: index of space components (x,y,z) + // ge is always in sync with the index i and is formatted as: + // ge[] ==> [xx, xy, yy, xz, yz, zz] + // in (j,l) notation, we have: + // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] + // so the index ge_idx corresponds to the matrix elements: + // | 0 1 3 | + // | 1 2 4 | + // | 3 4 5 | + constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; + for (uint32_t i=0; i + +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackingRecHit/interface/TrackingRecHit.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelFitterBase.h" +#include "RecoTracker/TkTrackingRegions/interface/TrackingRegion.h" + +class PixelNtupletsFitter final : public PixelFitterBase { +public: + explicit PixelNtupletsFitter(float nominalB, const MagneticField *field, bool useRiemannFit); + ~PixelNtupletsFitter() override = default; + std::unique_ptr run(const std::vector& hits, + const TrackingRegion& region, + const edm::EventSetup& setup) const override; + +private: + float nominalB_; + const MagneticField *field_; + bool useRiemannFit_; +}; + +#endif // RecoPixelVertexing_PixelTrackFitting_PixelNtupletsFitter_H diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 9171df9cb9bfc..656047aababf9 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -1,112 +1,11 @@ #ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h #define 
RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h -#include "FitResult.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" namespace Rfit { -template -__host__ __device__ void printIt(C* m, const char* prefix = "") -{ -#ifdef RFIT_DEBUG - for (u_int r = 0; r < m->rows(); ++r) - { - for (u_int c = 0; c < m->cols(); ++c) - { - printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r, c)); - } - } -#endif -} - -/*! - \brief raise to square. -*/ -template -__host__ __device__ inline T sqr(const T a) -{ - return a * a; -} - -/*! - \brief Compute cross product of two 2D vector (assuming z component 0), - returning z component of the result. - \param a first 2D vector in the product. - \param b second 2D vector in the product. - \return z component of the cross product. -*/ - -__host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) -{ - return a.x() * b.y() - a.y() * b.x(); -} - -/*! - * load error in CMSSW format to our formalism - * - */ - template - __host__ __device__ void loadCovariance2D(M6x4f const & ge, M2Nd & hits_cov) { - // Index numerology: - // i: index of the hits/point (0,..,3) - // j: index of space component (x,y,z) - // l: index of space components (x,y,z) - // ge is always in sync with the index i and is formatted as: - // ge[] ==> [xx, xy, yy, xz, yz, zz] - // in (j,l) notation, we have: - // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] - // so the index ge_idx corresponds to the matrix elements: - // | 0 1 3 | - // | 1 2 4 | - // | 3 4 5 | - constexpr uint32_t hits_in_fit = 4; // Fixme - for (uint32_t i=0; i< hits_in_fit; ++i) { - auto ge_idx = 0; auto j=0; auto l=0; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 2; j=1; l=1; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 1; j=1; l=0; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - } - } - - template - __host__ __device__ void loadCovariance(M6x4f const & ge, Matrix3Nd & hits_cov) { - - // Index numerology: - // i: index of the hits/point (0,..,3) - // j: index of space component (x,y,z) - // l: index of space components (x,y,z) - // ge is always in sync with the index i and is formatted as: - // ge[] ==> [xx, xy, yy, xz, yz, zz] - // in (j,l) notation, we have: - // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] - // so the index ge_idx corresponds to the matrix elements: - // | 0 1 3 | - // | 1 2 4 | - // | 3 4 5 | - constexpr uint32_t hits_in_fit = 4; // Fixme - for (uint32_t i=0; i rad_lengths_S; @@ -236,7 +134,7 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, VectorNd const& rad, double B) { - u_int n = N; + constexpr u_int n = N; double p_t = std::min(20.,fast_fit(2) * B); // limit pt to avoid too small error!!! double p_2 = p_t * p_t * (1. + 1. 
/ (fast_fit(3) * fast_fit(3))); double theta = atan(fast_fit(3)); @@ -292,7 +190,7 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, printf("Address of p2D: %p\n", &p2D); #endif printIt(&p2D, "cov_radtocart - p2D:"); - u_int n = p2D.cols(); + constexpr u_int n = N; Matrix2Nd cov_cart = Matrix2Nd::Zero(); VectorNd rad_inv = rad.cwiseInverse(); printIt(&rad_inv, "cov_radtocart - rad_inv:"); @@ -328,7 +226,7 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, const Matrix2Nd& cov_cart, const VectorNd& rad) { - u_int n = p2D.cols(); + constexpr u_int n = N; VectorNd cov_rad; const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); for (u_int i = 0; i < n; ++i) @@ -361,7 +259,7 @@ template V4& fast_fit, const VectorNd& rad) { - u_int n = p2D.cols(); + constexpr u_int n = N; VectorNd cov_rad; for (u_int i = 0; i < n; ++i) { @@ -413,34 +311,6 @@ template return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > 0)? -1 : 1; } -/*! - \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,p_t) and - consequently covariance matrix. - \param circle_uvr parameter (X0,Y0,R), covariance matrix to - be transformed and particle charge. - \param B magnetic field in Gev/cm/c unit. - \param error flag for errors computation. -*/ - -__host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, const bool error) -{ - Vector3d par_pak; - const double temp0 = circle.par.head(2).squaredNorm(); - const double temp1 = sqrt(temp0); - par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), - circle.q * (temp1 - circle.par(2)), circle.par(2) * B; - if (error) - { - const double temp2 = sqr(circle.par(0)) * 1. / temp0; - const double temp3 = 1. / temp1 * circle.q; - Matrix3d J4; - J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., - circle.par(0) * temp3, circle.par(1) * temp3, -circle.q, - 0., 0., B; - circle.cov = J4 * circle.cov * J4.transpose(); - } - circle.par = par_pak; -} /*! \brief Compute the eigenvector associated to the minimum eigenvalue. @@ -527,7 +397,8 @@ __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) template __host__ __device__ inline void Fast_fit(const M3xN& hits, V4 & result) { - u_int n = hits.cols(); // get the number of hits + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; // get the number of hits printIt(&hits, "Fast_fit - hits: "); // CIRCLE FIT @@ -617,7 +488,7 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, #endif // INITIALIZATION Matrix2Nd V = hits_cov2D; - u_int n = hits2D.cols(); + constexpr u_int n = N; printIt(&hits2D, "circle_fit - hits2D:"); printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); @@ -640,7 +511,8 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, printIt(&V, "circle_fit - V:"); cov_rad += scatter_cov_rad; printIt(&cov_rad, "circle_fit - cov_rad:"); - G = cov_rad.inverse(); + choleskyInversion::invert(cov_rad,G); + // G = cov_rad.inverse(); renorm = G.sum(); G *= 1. 
/ renorm; weight = Weight_circle(G); @@ -803,7 +675,7 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, printIt(&C0, "circle_fit - C0:"); const MatrixNd W = weight * weight.transpose(); - const MatrixNd H = MatrixXd::Identity(n, n).rowwise() - weight.transpose(); + const MatrixNd H = MatrixNd::Identity().rowwise() - weight.transpose(); const MatrixNx3d s_v = H * p3D.transpose(); printIt(&W, "circle_fit - W:"); printIt(&H, "circle_fit - H:"); @@ -952,7 +824,7 @@ inline line_fit Line_fit(const M3xN& hits, const bool error) { constexpr uint32_t N = M3xN::ColsAtCompileTime; - auto n = hits.cols(); + constexpr auto n = N; double theta = -circle.q*atan(fast_fit(3)); theta = theta < 0. ? theta + M_PI : theta; @@ -987,7 +859,7 @@ inline line_fit Line_fit(const M3xN& hits, // associated Jacobian, used in weights and errors computation Matrix6d Cov = Matrix6d::Zero(); - Matrix2d cov_sz[4]; // FIXME: should be "N" + Matrix2d cov_sz[N]; for (u_int i = 0; i < n; ++i) { Vector2d p = hits.block(0, i, 2, 1) - o; @@ -1057,11 +929,12 @@ inline line_fit Line_fit(const M3xN& hits, #endif // Build A^T V-1 A, where V-1 is the covariance of only the Y components. - MatrixNd Vy_inv = cov_with_ms.inverse(); - Eigen::Matrix Inv_Cov = A*Vy_inv*A.transpose(); - + MatrixNd Vy_inv; choleskyInversion::invert(cov_with_ms,Vy_inv); + // MatrixNd Vy_inv = cov_with_ms.inverse(); + Eigen::Matrix Cov_params = A*Vy_inv*A.transpose(); // Compute the Covariance Matrix of the fit parameters - Eigen::Matrix Cov_params = Inv_Cov.inverse(); + choleskyInversion::invert(Cov_params,Cov_params); + // Now Compute the Parameters in the form [2,1] // The first component is q. @@ -1084,7 +957,6 @@ inline line_fit Line_fit(const M3xN& hits, VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; double chi2 = res.transpose()*Vy_inv*res; - chi2 = chi2 / float(n); line_fit line; line.par << m, q; @@ -1139,16 +1011,16 @@ inline line_fit Line_fit(const M3xN& hits, */ template -inline helix_fit Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B, +inline helix_fit Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B, const bool error) { - u_int n = hits.cols(); + constexpr u_int n = N; VectorNd<4> rad = (hits.block(0, 0, 2, n).colwise().norm()); // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. 
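+  // The three-point pre-fit determines the circle through the first, middle and last
+  // hit (their circumcircle) and estimates tan(theta) as the transverse arc length
+  // between the first and last hit divided by their separation in z.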
Vector4d fast_fit; Fast_fit(hits,fast_fit); - Rfit::Matrix2Nd<4> hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); Rfit::loadCovariance2D(hits_ge,hits_cov); circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), hits_cov, diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml index d8177a0e9447c..62a8e8541aa64 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml @@ -3,6 +3,6 @@ - + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc new file mode 100644 index 0000000000000..eeca145ab93e3 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc @@ -0,0 +1,51 @@ +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/global/EDProducer.h" + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/ESHandle.h" + +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelFitter.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h" + +#include "MagneticField/Engine/interface/MagneticField.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +class PixelNtupletsFitterProducer: public edm::global::EDProducer<> { +public: + explicit PixelNtupletsFitterProducer(const edm::ParameterSet& iConfig) + : useRiemannFit_(iConfig.getParameter("useRiemannFit")) + { + produces(); + } + ~PixelNtupletsFitterProducer() override {} + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); + descriptions.add("pixelNtupletsFitterDefault", desc); + } + +private: + bool useRiemannFit_; + void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; +}; + + +void PixelNtupletsFitterProducer::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::ESHandle fieldESH; + iSetup.get().get(fieldESH); + float bField = 1 / PixelRecoUtilities::fieldInInvGev(iSetup); + auto impl = std::make_unique(bField, + fieldESH.product(), useRiemannFit_); + auto prod = std::make_unique(std::move(impl)); + iEvent.put(std::move(prod)); +} + +DEFINE_FWK_MODULE(PixelNtupletsFitterProducer); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc index 7f13c7218eafa..57c6d6ec0e806 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc @@ -24,9 +24,7 @@ using namespace pixeltrackfitting; using edm::ParameterSet; PixelTrackProducer::PixelTrackProducer(const ParameterSet& cfg) - : runOnGPU_(cfg.getParameter("runOnGPU")), - theReconstruction(cfg, consumesCollector()), - theGPUReconstruction(cfg, 
consumesCollector()) + : theReconstruction(cfg, consumesCollector()) { edm::LogInfo("PixelTrackProducer")<<" construction..."; produces(); @@ -40,7 +38,6 @@ void PixelTrackProducer::fillDescriptions(edm::ConfigurationDescriptions& descri edm::ParameterSetDescription desc; desc.add("passLabel", "pixelTracks"); // What is this? It is not used anywhere in this code. - desc.add("runOnGPU", false); PixelTrackReconstruction::fillDescriptions(desc); descriptions.add("pixelTracks", desc); @@ -51,11 +48,7 @@ void PixelTrackProducer::produce(edm::Event& ev, const edm::EventSetup& es) LogDebug("PixelTrackProducer, produce")<<"event# :"< httopo; es.get().get(httopo); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h index 7e0d5d73b03fc..8852c884c7cc5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h @@ -4,8 +4,6 @@ #include "FWCore/Framework/interface/stream/EDProducer.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackReconstruction.h" -#include "PixelTrackReconstructionGPU.h" - namespace edm { class Event; class EventSetup; class ParameterSet; class ConfigurationDescriptions; } class TrackerTopology; @@ -21,9 +19,7 @@ class PixelTrackProducer : public edm::stream::EDProducer<> { void produce(edm::Event& ev, const edm::EventSetup& es) override; private: - bool runOnGPU_; PixelTrackReconstruction theReconstruction; - PixelTrackReconstructionGPU theGPUReconstruction; }; #endif // PixelTrackProducer_h diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index 51bf679c91cf4..ef6d5d16fb329 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -12,8 +12,7 @@ from RecoTracker.TkSeedingLayers.PixelLayerTriplets_cfi import * from RecoTracker.TkSeedingLayers.TTRHBuilderWithoutAngle4PixelTriplets_cfi import * from RecoPixelVertexing.PixelTrackFitting.pixelFitterByHelixProjections_cfi import pixelFitterByHelixProjections -from RecoPixelVertexing.PixelTrackFitting.pixelFitterByRiemannParaboloid_cfi import pixelFitterByRiemannParaboloid -from RecoPixelVertexing.PixelTrackFitting.pixelFitterByBrokenLine_cfi import pixelFitterByBrokenLine +from RecoPixelVertexing.PixelTrackFitting.pixelNtupletsFitter_cfi import pixelNtupletsFitter from RecoPixelVertexing.PixelTrackFitting.pixelTrackFilterByKinematics_cfi import pixelTrackFilterByKinematics from RecoPixelVertexing.PixelTrackFitting.pixelTrackCleanerBySharedHits_cfi import pixelTrackCleanerBySharedHits from RecoPixelVertexing.PixelTrackFitting.pixelTracks_cfi import pixelTracks as _pixelTracks @@ -88,22 +87,11 @@ _pixelTracksTask_lowPU.replace(pixelTracksHitQuadruplets, pixelTracksHitTriplets) trackingLowPU.toReplaceWith(pixelTracksTask, _pixelTracksTask_lowPU) -# Use Riemann fit and substitute previous Fitter producer with the Riemann one -from Configuration.ProcessModifiers.riemannFit_cff import riemannFit -from Configuration.ProcessModifiers.riemannFitGPU_cff import riemannFitGPU -riemannFit.toModify(pixelTracks, Fitter = "pixelFitterByRiemannParaboloid") -riemannFitGPU.toModify(pixelTracks, runOnGPU = True) -_pixelTracksTask_riemannFit = pixelTracksTask.copy() -_pixelTracksTask_riemannFit.replace(pixelFitterByHelixProjections, pixelFitterByRiemannParaboloid) 
-riemannFit.toReplaceWith(pixelTracksTask, _pixelTracksTask_riemannFit) - -# Use BrokenLine fit and substitute previous Fitter producer with the BrokenLine one -from Configuration.ProcessModifiers.brokenLine_cff import brokenLine -from Configuration.ProcessModifiers.brokenLineGPU_cff import brokenLineGPU -brokenLine.toModify(pixelTracks, Fitter = "pixelFitterByBrokenLine") -brokenLineGPU.toModify(pixelTracks, runOnGPU = True) -_pixelTracksTask_brokenLine = pixelTracksTask.copy() -_pixelTracksTask_brokenLine.replace(pixelFitterByHelixProjections, pixelFitterByBrokenLine) -brokenLine.toReplaceWith(pixelTracksTask, _pixelTracksTask_brokenLine) +# Use ntuple fit and substitute previous Fitter producer with the ntuple one +from Configuration.ProcessModifiers.pixelNtupleFit_cff import pixelNtupleFit as ntupleFit +ntupleFit.toModify(pixelTracks, Fitter = "pixelNtupletsFitter") +_pixelTracksTask_ntupleFit = pixelTracksTask.copy() +_pixelTracksTask_ntupleFit.replace(pixelFitterByHelixProjections, pixelNtupletsFitter) +ntupleFit.toReplaceWith(pixelTracksTask, _pixelTracksTask_ntupleFit) pixelTracksSequence = cms.Sequence(pixelTracksTask) diff --git a/RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py b/RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py new file mode 100644 index 0000000000000..10e1e3852e9c4 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py @@ -0,0 +1,6 @@ +import FWCore.ParameterSet.Config as cms + +from RecoPixelVertexing.PixelTrackFitting.pixelNtupletsFitterDefault_cfi import pixelNtupletsFitterDefault + +pixelNtupletsFitter = pixelNtupletsFitterDefault.clone() + diff --git a/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc new file mode 100644 index 0000000000000..92b8cac8f8fe9 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc @@ -0,0 +1,117 @@ +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" + +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#include "FWCore/Framework/interface/EventSetup.h" + +#include "DataFormats/GeometryCommonDetAlgo/interface/GlobalError.h" +#include "DataFormats/GeometryVector/interface/GlobalPoint.h" +#include "DataFormats/GeometryVector/interface/LocalPoint.h" + +#include "DataFormats/TrackingRecHit/interface/TrackingRecHit.h" +#include "Geometry/CommonDetUnit/interface/GeomDet.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +#include "DataFormats/GeometryCommonDetAlgo/interface/Measurement1D.h" + +#include "Geometry/CommonDetUnit/interface/GeomDetType.h" + +#include "MagneticField/Engine/interface/MagneticField.h" + +#include "FWCore/MessageLogger/interface/MessageLogger.h" + +#include "DataFormats/GeometryVector/interface/Pi.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackBuilder.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackErrorParam.h" + +#include "CommonTools/Utils/interface/DynArray.h" + +using namespace std; + + +PixelNtupletsFitter::PixelNtupletsFitter(float nominalB, const MagneticField* field, + bool useRiemannFit) + : nominalB_(nominalB), field_(field), + useRiemannFit_(useRiemannFit) {} + +std::unique_ptr PixelNtupletsFitter::run( + const std::vector& hits, const TrackingRegion& region, 
const edm::EventSetup& ) const { + + using namespace Rfit; + + std::unique_ptr ret; + + unsigned int nhits = hits.size(); + + if (nhits < 2) return ret; + + declareDynArray(GlobalPoint, nhits, points); + declareDynArray(GlobalError, nhits, errors); + declareDynArray(bool, nhits, isBarrel); + + for (unsigned int i = 0; i != nhits; ++i) { + auto const& recHit = hits[i]; + points[i] = GlobalPoint(recHit->globalPosition().basicVector() - region.origin().basicVector()); + errors[i] = recHit->globalPositionError(); + isBarrel[i] = recHit->detUnit()->type().isBarrel(); + } + + assert(nhits==4); + Rfit::Matrix3xNd<4> hits_gp; + + Eigen::Matrix hits_ge = Eigen::Matrix::Zero(); + + for (unsigned int i = 0; i < nhits; ++i) { + hits_gp.col(i) << points[i].x(), points[i].y(), points[i].z(); + + hits_ge.col(i) << errors[i].cxx(), errors[i].cyx(), errors[i].cyy(), + errors[i].czx(), errors[i].czy(), errors[i].czz(); + } + + + helix_fit fittedTrack = useRiemannFit_ ? + Rfit::Helix_fit(hits_gp, hits_ge, nominalB_, true) + : BrokenLine::BL_Helix_fit(hits_gp, hits_ge, nominalB_); + + int iCharge = fittedTrack.q; + + // parameters are: + // 0: phi + // 1: tip + // 2: curvature + // 3: cottheta + // 4: zip + float valPhi = fittedTrack.par(0); + + float valTip = fittedTrack.par(1); + + float valCotTheta = fittedTrack.par(3); + + float valZip = fittedTrack.par(4); + float valPt = fittedTrack.par(2); + // + // PixelTrackErrorParam param(valEta, valPt); + float errValPhi = std::sqrt(fittedTrack.cov(0, 0)); + float errValTip = std::sqrt(fittedTrack.cov(1, 1)); + + float errValPt = std::sqrt(fittedTrack.cov(2, 2)); + + float errValCotTheta = std::sqrt(fittedTrack.cov(3, 3)); + float errValZip = std::sqrt(fittedTrack.cov(4, 4)); + + float chi2 = fittedTrack.chi2_line + fittedTrack.chi2_circle; + + PixelTrackBuilder builder; + Measurement1D phi(valPhi, errValPhi); + Measurement1D tip(valTip, errValTip); + + Measurement1D pt(valPt, errValPt); + Measurement1D cotTheta(valCotTheta, errValCotTheta); + Measurement1D zip(valZip, errValZip); + + ret.reset( + builder.build(pt, phi, cotTheta, tip, zip, chi2, iCharge, hits, field_, region.origin())); + return ret; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index c2a87db9444f7..83ebf7a577711 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -21,13 +21,19 @@ + + + + + + - + @@ -35,6 +41,14 @@ + + + + + + + + @@ -50,6 +64,13 @@ + + + + + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc index adcabd7dde508..f19618e23d252 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc @@ -10,8 +10,11 @@ #include #include +#ifdef USE_BL +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" +#else #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" -//#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" +#endif using namespace std; using namespace Eigen; @@ -335,7 +338,6 @@ void computePull(std::array & fit, const char * label, void test_helix_fit(bool getcin) { int n_; - bool return_err; const double B_field = 3.8 * c_speed / pow(10, 9) / 100; Matrix gen_par; Vector5d true_par; @@ -343,7 +345,7 @@ void test_helix_fit(bool getcin) { generator.seed(1); std::cout << 
std::setprecision(6); cout << "_________________________________________________________________________\n"; - cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration return_err debug" << endl; + cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration debug" << endl; if (getcin) { cout << "hits: "; cin >> n_; @@ -368,14 +370,11 @@ void test_helix_fit(bool getcin) { gen_par(4) = 10.; // R (p_t) gen_par(5) = 1.; // eta } - return_err = true; const int iteration = 5000; gen_par = New_par(gen_par, 1, B_field); true_par = True_par(gen_par, 1, B_field); - // Matrix3xNd<4> hits; std::array helixRiemann_fit; -// std::array helixBrokenLine_fit; std::cout << "\nTrue parameters: " << "phi: " << true_par(0) << " " @@ -396,11 +395,14 @@ void test_helix_fit(bool getcin) { // gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; // gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; delta -= std::chrono::high_resolution_clock::now()-start; - helixRiemann_fit[i%iteration] = Rfit::Helix_fit(gen.hits, gen.hits_ge, B_field, return_err); + helixRiemann_fit[i%iteration] = +#ifdef USE_BL + BrokenLine::BL_Helix_fit(gen.hits, gen.hits_ge, B_field); +#else + Rfit::Helix_fit(gen.hits, gen.hits_ge, B_field, true); +#endif delta += std::chrono::high_resolution_clock::now()-start; -// helixBrokenLine_fit[i] = BrokenLine::Helix_fit(gen.hits, gen.hits_cov, B_field); - if (helixRiemann_fit[i%iteration].par(0)>10.) std::cout << "error" << std::endl; if (0==i) cout << std::setprecision(6) @@ -423,7 +425,6 @@ void test_helix_fit(bool getcin) { } std::cout << "elapsted time " << double(std::chrono::duration_cast(delta).count())/1.e6 << std::endl; computePull(helixRiemann_fit, "Riemann", n_, iteration, true_par); -// computePull(helixBrokenLine_fit, "BrokenLine", n_, iteration, true_par); } int main(int nargs, char**) { diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index 3917de89a8185..88ba8139f01ae 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -3,32 +3,108 @@ #include #include +#ifdef USE_BL +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" +#else +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#endif + + #include "test_common.h" + #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" using namespace Eigen; namespace Rfit { constexpr uint32_t maxNumberOfTracks() { return 5*1024; } constexpr uint32_t stride() { return maxNumberOfTracks();} - using Matrix3x4d = Eigen::Matrix; - using Map3x4d = Eigen::Map >; - using Matrix6x4f = Eigen::Matrix; - using Map6x4f = Eigen::Map >; + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map,0,Eigen::Stride<3*stride(),stride()> >; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map,0,Eigen::Stride<6*stride(),stride()> >; + // fast fit using Map4d = Eigen::Map >; } + + +template +__global__ +void kernelPrintSizes(double * __restrict__ phits, + float * __restrict__ phits_ge + ) { + auto i = blockIdx.x*blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits+i,3,4); + Rfit::Map6xNf hits_ge(phits_ge+i,6,4); + if (i!=0) return; + printf("GPU sizes %lu %lu %lu %lu 
%lu\n",sizeof(hits[i]),sizeof(hits_ge[i]), + sizeof(Vector4d),sizeof(Rfit::line_fit),sizeof(Rfit::circle_fit)); +} + + +template __global__ void kernelFastFit(double * __restrict__ phits, double * __restrict__ presults) { auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3x4d hits(phits+i,3,4); + Rfit::Map3xNd hits(phits+i,3,N); Rfit::Map4d result(presults+i,4); +#ifdef USE_BL + BrokenLine::BL_Fast_fit(hits, result); +#else Rfit::Fast_fit(hits, result); +#endif +} + +#ifdef USE_BL + +template +__global__ +void kernelBrokenLineFit(double * __restrict__ phits, + float * __restrict__ phits_ge, + double * __restrict__ pfast_fit_input, + double B, + Rfit::circle_fit * circle_fit, + Rfit::line_fit * line_fit + ) { + auto i = blockIdx.x*blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits+i,3,N); + Rfit::Map4d fast_fit_input(pfast_fit_input+i,4); + Rfit::Map6xNf hits_ge(phits_ge+i,6,N); + + BrokenLine::PreparedBrokenLineData data; + Rfit::Matrix3d Jacob; + + auto & line_fit_results = line_fit[i]; + auto & circle_fit_results = circle_fit[i]; + + BrokenLine::prepareBrokenLineData(hits,fast_fit_input,B,data); + BrokenLine::BL_Line_fit(hits_ge,fast_fit_input,B,data,line_fit_results); + BrokenLine::BL_Circle_fit(hits,hits_ge,fast_fit_input,B,data,circle_fit_results); + Jacob << 1.,0,0, + 0,1.,0, + 0,0,-B/std::copysign(Rfit::sqr(circle_fit_results.par(2)),circle_fit_results.par(2)); + circle_fit_results.par(2)=B/std::abs(circle_fit_results.par(2)); + circle_fit_results.cov=Jacob*circle_fit_results.cov*Jacob.transpose(); + +#ifdef TEST_DEBUG +if (0==i) { + printf("Circle param %f,%f,%f\n",circle_fit[i].par(0),circle_fit[i].par(1),circle_fit[i].par(2)); + } +#endif } +#else + +template __global__ void kernelCircleFit(double * __restrict__ phits, float * __restrict__ phits_ge, @@ -36,16 +112,14 @@ void kernelCircleFit(double * __restrict__ phits, double B, Rfit::circle_fit * circle_fit_resultsGPU) { -auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3x4d hits(phits+i,3,4); + auto i = blockIdx.x*blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits+i,3,N); Rfit::Map4d fast_fit_input(pfast_fit_input+i,4); - Rfit::Map6x4f hits_ge(phits_ge+i,6,4); + Rfit::Map6xNf hits_ge(phits_ge+i,6,N); - constexpr uint32_t N = Rfit::Map3x4d::ColsAtCompileTime; constexpr auto n = N; - - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); Rfit::loadCovariance2D(hits_ge,hits_cov); @@ -77,59 +151,87 @@ if (0==i) { #endif } +template __global__ void kernelLineFit(double * __restrict__ phits, float * __restrict__ phits_ge, double B, Rfit::circle_fit * circle_fit, - double * __restrict__ pfast_fit, + double * __restrict__ pfast_fit_input, Rfit::line_fit * line_fit) { auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3x4d hits(phits+i,3,4); - Rfit::Map4d fast_fit(pfast_fit+i,4); - Rfit::Map6x4f hits_ge(phits_ge+i,6,4); - line_fit[i] = Rfit::Line_fit(hits, hits_ge, circle_fit[i], fast_fit, B, true); + Rfit::Map3xNd hits(phits+i,3,N); + Rfit::Map4d fast_fit_input(pfast_fit_input+i,4); + Rfit::Map6xNf hits_ge(phits_ge+i,6,N); + line_fit[i] = Rfit::Line_fit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true); } +#endif -template +template __device__ __host__ -void fillHitsAndHitsCov(M3x4 & hits, M6x4 & hits_ge) { - hits << 1.98645, 4.72598, 7.65632, 11.3151, - 2.18002, 4.88864, 7.75845, 11.3134, - 2.46338, 6.99838, 11.808, 17.793; +void fillHitsAndHitsCov(M3xN & hits, M6xN & 
hits_ge) { + + constexpr uint32_t N = M3xN::ColsAtCompileTime; + + if (N==5) { + hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, + 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, + -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; + hits_ge.col(0) << 1.424715e-07,-4.996975e-07,1.752614e-06,3.660689e-11,1.644638e-09,7.346080e-05; + hits_ge.col(1) << 6.899177e-08,-1.873414e-07,5.087101e-07,-2.078806e-10,-2.210498e-11,4.346079e-06; + hits_ge.col(2) << 1.406273e-06,4.042467e-07,6.391180e-07,-3.141497e-07,6.513821e-08,1.163863e-07; + hits_ge.col(3) << 1.176358e-06,2.154100e-07,5.072816e-07,-8.161219e-08,1.437878e-07,5.951832e-08; + hits_ge.col(4) << 2.852843e-05,7.956492e-06,3.117701e-06,-1.060541e-06,8.777413e-09,1.426417e-07; + return; + } + + + + if (N>3) + hits << 1.98645, 4.72598, 7.65632, 11.3151, + 2.18002, 4.88864, 7.75845, 11.3134, + 2.46338, 6.99838, 11.808, 17.793; + else + hits << 1.98645, 4.72598, 7.65632, + 2.18002, 4.88864, 7.75845, + 2.46338, 6.99838, 11.808; + hits_ge.col(0)[0] = 7.14652e-06; hits_ge.col(1)[0] = 2.15789e-06; hits_ge.col(2)[0] = 1.63328e-06; - hits_ge.col(3)[0] = 6.27919e-06; + if (N>3) hits_ge.col(3)[0] = 6.27919e-06; hits_ge.col(0)[2] = 6.10348e-06; hits_ge.col(1)[2] = 2.08211e-06; hits_ge.col(2)[2] = 1.61672e-06; - hits_ge.col(3)[2] = 6.28081e-06; + if (N>3) hits_ge.col(3)[2] = 6.28081e-06; hits_ge.col(0)[5] = 5.184e-05; hits_ge.col(1)[5] = 1.444e-05; hits_ge.col(2)[5] = 6.25e-06; - hits_ge.col(3)[5] = 3.136e-05; + if (N>3) hits_ge.col(3)[5] = 3.136e-05; hits_ge.col(0)[1] = -5.60077e-06; hits_ge.col(1)[1] = -1.11936e-06; hits_ge.col(2)[1] = -6.24945e-07; - hits_ge.col(3)[1] = -5.28e-06; + if (N>3) hits_ge.col(3)[1] = -5.28e-06; } + +template __global__ void kernelFillHitsAndHitsCov(double * __restrict__ phits, float * phits_ge) { auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3x4d hits(phits+i,3,4); - Rfit::Map6x4f hits_ge(phits_ge+i,6,4); - hits_ge = MatrixXf::Zero(6,4); + Rfit::Map3xNd hits(phits+i,3,N); + Rfit::Map6xNf hits_ge(phits_ge+i,6,N); + hits_ge = MatrixXf::Zero(6,N); fillHitsAndHitsCov(hits,hits_ge); } +template void testFit() { constexpr double B = 0.0113921; - Rfit::Matrix3xNd<4> hits; - Rfit::Matrix6x4f hits_ge = MatrixXf::Zero(6,4); + Rfit::Matrix3xNd hits; + Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6,N); double * hitsGPU = nullptr;; float * hits_geGPU = nullptr; double * fast_fit_resultsGPU = nullptr; @@ -137,32 +239,45 @@ void testFit() { Rfit::circle_fit * circle_fit_resultsGPU = nullptr; Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); Rfit::line_fit * line_fit_resultsGPU = nullptr; + Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); fillHitsAndHitsCov(hits, hits_ge); - std::cout << "sizes " << sizeof(hits) << ' ' << sizeof(hits_ge) - << ' ' << sizeof(Vector4d)<< std::endl; + std::cout << "sizes " << N << ' ' + << sizeof(hits) << ' ' << sizeof(hits_ge) + << ' ' << sizeof(Vector4d) + << ' ' << sizeof(Rfit::line_fit) + << ' ' << sizeof(Rfit::circle_fit) + << std::endl; std::cout << "Generated hits:\n" << hits << std::endl; std::cout << "Generated cov:\n" << hits_ge << std::endl; // FAST_FIT_CPU +#ifdef USE_BL + Vector4d fast_fit_results; BrokenLine::BL_Fast_fit(hits, fast_fit_results); +#else Vector4d fast_fit_results; Rfit::Fast_fit(hits, fast_fit_results); +#endif std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; // for timing purposes we fit 4096 tracks constexpr uint32_t Ntracks = 4096; - cudaCheck(cudaMalloc(&hitsGPU, 
Rfit::maxNumberOfTracks()*sizeof(Rfit::Matrix3xNd<4>))); - cudaCheck(cudaMalloc(&hits_geGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::Matrix6x4f))); + cudaCheck(cudaMalloc(&hitsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::Matrix3xNd))); + cudaCheck(cudaMalloc(&hits_geGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::Matrix6xNf))); cudaCheck(cudaMalloc(&fast_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Vector4d))); - cudaCheck(cudaMalloc((void **)&line_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::line_fit))); - cudaCheck(cudaMalloc((void **)&circle_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::circle_fit))); + cudaCheck(cudaMalloc(&line_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::line_fit))); + cudaCheck(cudaMalloc(&circle_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::circle_fit))); + cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, Rfit::maxNumberOfTracks()*sizeof(Vector4d))); + cudaCheck(cudaMemset(line_fit_resultsGPU, 0, Rfit::maxNumberOfTracks()*sizeof(Rfit::line_fit))); - kernelFillHitsAndHitsCov<<>>(hitsGPU,hits_geGPU); + + kernelPrintSizes<<>>(hitsGPU,hits_geGPU); + kernelFillHitsAndHitsCov<<>>(hitsGPU,hits_geGPU); // FAST_FIT GPU - kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); + kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); cudaDeviceSynchronize(); cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Vector4d), cudaMemcpyDeviceToHost); @@ -170,43 +285,69 @@ void testFit() { std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl; assert(isEqualFuzzy(fast_fit_results, fast_fit)); + +#ifdef USE_BL + // CIRCLE AND LINE FIT CPU + BrokenLine::PreparedBrokenLineData data; + BrokenLine::karimaki_circle_fit circle_fit_results; + Rfit::line_fit line_fit_results; + Rfit::Matrix3d Jacob; + BrokenLine::prepareBrokenLineData(hits,fast_fit_results,B,data); + BrokenLine::BL_Line_fit(hits_ge,fast_fit_results,B,data,line_fit_results); + BrokenLine::BL_Circle_fit(hits,hits_ge,fast_fit_results,B,data,circle_fit_results); + Jacob << 1.,0,0, + 0,1.,0, + 0,0,-B/std::copysign(Rfit::sqr(circle_fit_results.par(2)),circle_fit_results.par(2)); + circle_fit_results.par(2)=B/std::abs(circle_fit_results.par(2)); + circle_fit_results.cov=Jacob*circle_fit_results.cov*Jacob.transpose(); + + // fit on GPU + kernelBrokenLineFit<<>>(hitsGPU, hits_geGPU, + fast_fit_resultsGPU, B, + circle_fit_resultsGPU, + line_fit_resultsGPU); + cudaDeviceSynchronize(); + + +#else // CIRCLE_FIT CPU - constexpr uint32_t N = Rfit::Map3x4d::ColsAtCompileTime; - constexpr auto n = N; - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); Rfit::loadCovariance2D(hits_ge,hits_cov); - Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), + Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); - std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; // CIRCLE_FIT GPU - - kernelCircleFit<<>>(hitsGPU, hits_geGPU, + kernelCircleFit<<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU); cudaDeviceSynchronize(); - - cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, - sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); - std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << 
std::endl; - assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); - + // LINE_FIT CPU Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); - std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; - // LINE_FIT GPU - Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); - kernelLineFit<<>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); + kernelLineFit<<>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); cudaDeviceSynchronize(); +#endif + + std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; + + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, + sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); + std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); + + + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; + // LINE_FIT GPU cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; - assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par)); + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N==5 ? 1e-4 : 1e-6)); // requires fma on CPU + std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; std::cout << "Fitted cov (CircleFit) GPU:\n" << circle_fit_resultsGPUret->cov << std::endl; @@ -217,7 +358,10 @@ void testFit() { int main (int argc, char * argv[]) { exitSansCUDADevices(); - testFit(); + testFit<4>(); + testFit<3>(); + testFit<5>(); + std::cout << "TEST FIT, NO ERRORS" << std::endl; return 0; diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp index af4a3e52f46fa..a1e1049392ad0 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp +++ b/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp @@ -3,7 +3,11 @@ #include #include +#ifdef USE_BL +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" +#else #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#endif #include "test_common.h" @@ -12,77 +16,148 @@ using namespace Eigen; namespace Rfit { constexpr uint32_t maxNumberOfTracks() { return 5*1024; } constexpr uint32_t stride() { return maxNumberOfTracks();} - using Matrix3x4d = Eigen::Matrix; - using Map3x4d = Eigen::Map >; - using Matrix6x4f = Eigen::Matrix; - using Map6x4f = Eigen::Map >; + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map,0,Eigen::Stride<3*stride(),stride()> >; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map,0,Eigen::Stride<6*stride(),stride()> >; + // fast fit using Map4d = Eigen::Map >; } -template -void fillHitsAndHitsCov(M3x4 & hits, M6x4 & hits_ge) { - hits << 1.98645, 4.72598, 7.65632, 11.3151, - 2.18002, 4.88864, 7.75845, 11.3134, - 2.46338, 6.99838, 11.808, 17.793; +/* +Hit global: 641,0 2: 2.934787,0.773211,-10.980247 +Error: 641,0 2: 1.424715e-07,-4.996975e-07,1.752614e-06,3.660689e-11,1.644638e-09,7.346080e-05 +Hit global: 641,1 104: 
6.314229,1.816356,-23.162731 +Error: 641,1 104: 6.899177e-08,-1.873414e-07,5.087101e-07,-2.078806e-10,-2.210498e-11,4.346079e-06 +Hit global: 641,2 1521: 8.936963,2.765734,-32.759060 +Error: 641,2 1521: 1.406273e-06,4.042467e-07,6.391180e-07,-3.141497e-07,6.513821e-08,1.163863e-07 +Hit global: 641,3 1712: 10.360559,3.330824,-38.061260 +Error: 641,3 1712: 1.176358e-06,2.154100e-07,5.072816e-07,-8.161219e-08,1.437878e-07,5.951832e-08 +Hit global: 641,4 1824: 12.856387,4.422212,-47.518867 +Error: 641,4 1824: 2.852843e-05,7.956492e-06,3.117701e-06,-1.060541e-06,8.777413e-09,1.426417e-07 +*/ + +template +void fillHitsAndHitsCov(M3xN & hits, M6xN & hits_ge) { + + constexpr uint32_t N = M3xN::ColsAtCompileTime; + + if (N==5) { + hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, + 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, + -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; + hits_ge.col(0) << 1.424715e-07,-4.996975e-07,1.752614e-06,3.660689e-11,1.644638e-09,7.346080e-05; + hits_ge.col(1) << 6.899177e-08,-1.873414e-07,5.087101e-07,-2.078806e-10,-2.210498e-11,4.346079e-06; + hits_ge.col(2) << 1.406273e-06,4.042467e-07,6.391180e-07,-3.141497e-07,6.513821e-08,1.163863e-07; + hits_ge.col(3) << 1.176358e-06,2.154100e-07,5.072816e-07,-8.161219e-08,1.437878e-07,5.951832e-08; + hits_ge.col(4) << 2.852843e-05,7.956492e-06,3.117701e-06,-1.060541e-06,8.777413e-09,1.426417e-07; + return; + } + + + if (N>3) + hits << 1.98645, 4.72598, 7.65632, 11.3151, + 2.18002, 4.88864, 7.75845, 11.3134, + 2.46338, 6.99838, 11.808, 17.793; + else + hits << 1.98645, 4.72598, 7.65632, + 2.18002, 4.88864, 7.75845, + 2.46338, 6.99838, 11.808; + hits_ge.col(0)[0] = 7.14652e-06; hits_ge.col(1)[0] = 2.15789e-06; hits_ge.col(2)[0] = 1.63328e-06; - hits_ge.col(3)[0] = 6.27919e-06; + if (N>3) hits_ge.col(3)[0] = 6.27919e-06; hits_ge.col(0)[2] = 6.10348e-06; hits_ge.col(1)[2] = 2.08211e-06; hits_ge.col(2)[2] = 1.61672e-06; - hits_ge.col(3)[2] = 6.28081e-06; + if (N>3) hits_ge.col(3)[2] = 6.28081e-06; hits_ge.col(0)[5] = 5.184e-05; hits_ge.col(1)[5] = 1.444e-05; hits_ge.col(2)[5] = 6.25e-06; - hits_ge.col(3)[5] = 3.136e-05; + if (N>3) hits_ge.col(3)[5] = 3.136e-05; hits_ge.col(0)[1] = -5.60077e-06; hits_ge.col(1)[1] = -1.11936e-06; hits_ge.col(2)[1] = -6.24945e-07; - hits_ge.col(3)[1] = -5.28e-06; + if (N>3) hits_ge.col(3)[1] = -5.28e-06; } + +template void testFit() { constexpr double B = 0.0113921; - Rfit::Matrix3xNd<4> hits; - Rfit::Matrix6x4f hits_ge = MatrixXf::Zero(6,4); + Rfit::Matrix3xNd hits; + Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6,N); fillHitsAndHitsCov(hits, hits_ge); - std::cout << "sizes " << sizeof(hits) << ' ' << sizeof(hits_ge) + std::cout << "sizes " << N << ' ' + < rad = (hits.block(0, 0, 2, n).colwise().norm()); - Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + +#ifdef USE_BL + BrokenLine::PreparedBrokenLineData data; + BrokenLine::karimaki_circle_fit circle_fit_results; + Rfit::Matrix3d Jacob; + + BrokenLine::prepareBrokenLineData(hits,fast_fit_results,B,data); + Rfit::line_fit line_fit_results; + BrokenLine::BL_Line_fit(hits_ge,fast_fit_results,B,data,line_fit_results); + BrokenLine::BL_Circle_fit(hits,hits_ge,fast_fit_results,B,data,circle_fit_results); + Jacob << 1.,0,0, + 0,1.,0, + 0,0,-B/std::copysign(Rfit::sqr(circle_fit_results.par(2)),circle_fit_results.par(2)); + circle_fit_results.par(2)=B/std::abs(circle_fit_results.par(2)); + circle_fit_results.cov=Jacob*circle_fit_results.cov*Jacob.transpose(); +#else + Rfit::VectorNd rad = (hits.block(0, 0, 2, 
N).colwise().norm()); + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); Rfit::loadCovariance2D(hits_ge,hits_cov); - Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, n), + Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); - std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; - // LINE_FIT CPU Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); - std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; + Rfit::par_uvrtopak(circle_fit_results, B, true); + +#endif + + std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par + << "\nchi2 " << circle_fit_results.chi2 << std::endl; + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par + << "\nchi2 " << line_fit_results.chi2 << std::endl; std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; } int main (int argc, char * argv[]) { - testFit(); + testFit<4>(); + testFit<3>(); + testFit<5>(); + return 0; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu new file mode 100644 index 0000000000000..a0b2b9e56f59c --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -0,0 +1,236 @@ +// +// Author: Felice Pantaleo, CERN +// + +#include "HelixFitOnGPU.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" + + +using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + +using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; +using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; + +using namespace Eigen; + + +// #define BL_DUMP_HITS + +template +__global__ +void kernelBLFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, + CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, + HitsOnGPU const * __restrict__ hhp, + double * __restrict__ phits, + float * __restrict__ phits_ge, + double * __restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) +{ + + constexpr uint32_t hitsInFit = N; + + assert(hitsInFit<=nHits); + + assert(pfast_fit); assert(foundNtuplets); + + // look in bin for this hit multiplicity + auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); + +#ifdef BROKENLINE_DEBUG + if (0==local_start) printf("%d Ntuple of size %d for %d hits to fit\n",tupleMultiplicity->size(nHits), nHits, hitsInFit); +#endif + + auto tuple_start = local_start + offset; + if (tuple_start >= tupleMultiplicity->size(nHits)) return; + + // get it from the ntuple container (one to one to helix) + auto helix_start = *(tupleMultiplicity->begin(nHits)+tuple_start); + assert (helix_start < foundNtuplets->nbins()); + + assert (foundNtuplets->size(helix_start)==nHits); + + Rfit::Map3xNd hits(phits+local_start); + Rfit::Map4d fast_fit(pfast_fit+local_start); + Rfit::Map6xNf hits_ge(phits_ge+local_start); + +#ifdef BL_DUMP_HITS + __shared__ int done; + done = 0; + __syncthreads(); + bool dump = 
(foundNtuplets->size(helix_start)==5 && + 0 == atomicAdd(&done,1)); +#endif + + // Prepare data structure + auto const * hitId = foundNtuplets->begin(helix_start); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + float ge[6]; + hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge); +#ifdef BL_DUMP_HITS + if (dump){ + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detInd_d[hit],i,hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detInd_d[hit],i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + } +#endif + hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit]; + hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; + } + BrokenLine::BL_Fast_fit(hits,fast_fit); + + // no NaN here.... + assert(fast_fit(0)==fast_fit(0)); + assert(fast_fit(1)==fast_fit(1)); + assert(fast_fit(2)==fast_fit(2)); + assert(fast_fit(3)==fast_fit(3)); + +} + +template +__global__ +void kernelBLFit( + CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, + double B, + Rfit::helix_fit *results, + double * __restrict__ phits, + float * __restrict__ phits_ge, + double * __restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) +{ + + assert(N<=nHits); + + assert(results); assert(pfast_fit); + + + // same as above... + + // look in bin for this hit multiplicity + auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); + auto tuple_start = local_start + offset; + if (tuple_start >= tupleMultiplicity->size(nHits)) return; + + // get it for the ntuple container (one to one to helix) + auto helix_start = *(tupleMultiplicity->begin(nHits)+tuple_start); + + + Rfit::Map3xNd hits(phits+local_start); + Rfit::Map4d fast_fit(pfast_fit+local_start); + Rfit::Map6xNf hits_ge(phits_ge+local_start); + + BrokenLine::PreparedBrokenLineData data; + Rfit::Matrix3d Jacob; + + BrokenLine::karimaki_circle_fit circle; + Rfit::line_fit line; + + BrokenLine::prepareBrokenLineData(hits,fast_fit,B,data); + BrokenLine::BL_Line_fit(hits_ge,fast_fit,B,data,line); + BrokenLine::BL_Circle_fit(hits,hits_ge,fast_fit,B,data,circle); + Jacob << 1,0,0, + 0,1,0, + 0,0,-B/std::copysign(Rfit::sqr(circle.par(2)),circle.par(2)); + circle.par(2)=B/std::abs(circle.par(2)); + circle.cov=Jacob*circle.cov*Jacob.transpose(); + + + // Grab helix_fit from the proper location in the output vector + auto & helix = results[helix_start]; + helix.par << circle.par, line.par; + + helix.cov = Rfit::Matrix5d::Zero(); + helix.cov.block(0, 0, 3, 3) = circle.cov; + helix.cov.block(3, 3, 2, 2) = line.cov; + + helix.q = circle.q; + helix.chi2_circle = circle.chi2; + helix.chi2_line = line.chi2; + +#ifdef BROKENLINE_DEBUG + if ( !(circle.chi2>=0) || !(line.chi2>=0) ) printf("kernelBLFit failed! 
%f/%f\n", helix.chi2_circle,helix.chi2_line); + printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N,nHits, helix_start, + circle.par(0), circle.par(1), circle.par(2)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", helix_start, line.par(0),line.par(1)); + printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n",helix.chi2_circle,helix.chi2_line, + helix.cov(0,0),helix.cov(1,1),helix.cov(2,2),helix.cov(3,3),helix.cov(4,4)); +#endif +} + + +void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t cudaStream) +{ + assert(tuples_d); assert(fast_fit_resultsGPU_); + + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + for (uint32_t offset=0; offset<<>>( + tuples_d, tupleMultiplicity_d, hh.gpu_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + 3, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<3><<>>( + tupleMultiplicity_d, bField_, helix_fit_results_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + 3, offset); + cudaCheck(cudaGetLastError()); + + // fit quads + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hh.gpu_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + 4, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<4><<>>( + tupleMultiplicity_d, bField_, helix_fit_results_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + 4, offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // fit penta (only first 4) + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hh.gpu_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + 5, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<4><<>>( + tupleMultiplicity_d, bField_, helix_fit_results_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + 5, offset); + cudaCheck(cudaGetLastError()); + } else { + // fit penta (all 5) + kernelBLFastFit<5><<>>( + tuples_d, tupleMultiplicity_d, hh.gpu_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + 5, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<5><<>>( + tupleMultiplicity_d, bField_, helix_fit_results_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + 5, offset); + cudaCheck(cudaGetLastError()); + } + + } // loop on concurrent fits + + +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 942404a9313e3..1c7777ee37b3e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -25,6 +25,7 @@ namespace CAConstants { using OuterHitOfCell = GPU::VecArray< uint32_t, maxCellsPerHit()>; using TuplesContainer = OneToManyAssoc; using HitToTuple = OneToManyAssoc; // 3.5 should be enough + using TupleMultiplicity = OneToManyAssoc; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index dbd4eecbaab3c..c9abaae27ab8b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -29,6 +29,8 @@ class GPUCACell { using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; + using TupleMultiplicity = CAConstants::TupleMultiplicity; + GPUCACell() = default; #ifdef __CUDACC__ @@ -115,7 +117,7 @@ class GPUCACell { __device__ - bool + inline bool dcaCut(Hits const & hh, GPUCACell const & otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { @@ -137,16 +139,30 @@ class GPUCACell { } + __device__ + 
inline bool + hole(Hits const & hh, GPUCACell const & innerCell) const { + constexpr float r4 = 16.f; + auto ri = innerCell.get_inner_r(hh); + auto zi = innerCell.get_inner_z(hh); + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); + auto z4 = std::abs(zi + (r4-ri)*(zo-zi)/(ro-ri)); + return z4>25.f && z4<33.f; + } + + // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. -// #ifdef __CUDACC__ - + template __device__ inline void find_ntuplets( + Hits const & hh, GPUCACell * __restrict__ cells, TuplesOnGPU::Container & foundNtuplets, AtomicPairCounter & apc, + CM & tupleMultiplicity, TmpTuple & tmpNtuplet, const unsigned int minHitsPerNtuplet) const { @@ -159,19 +175,23 @@ class GPUCACell { tmpNtuplet.push_back_unsafe(theDoubletId); assert(tmpNtuplet.size()<=4); - if(theOuterNeighbors.size()>0) { // continue + if(theOuterNeighbors.size()>0) { for (int j = 0; j < theOuterNeighbors.size(); ++j) { auto otherCell = theOuterNeighbors[j]; - cells[otherCell].find_ntuplets(cells, foundNtuplets, apc, tmpNtuplet, - minHitsPerNtuplet); + cells[otherCell].find_ntuplets(hh, cells, foundNtuplets, apc, tupleMultiplicity, + tmpNtuplet, minHitsPerNtuplet); } } else { // if long enough save... if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet-1) { - hindex_type hits[6]; auto nh=0U; - for (auto c : tmpNtuplet) hits[nh++] = cells[c].theInnerHitId; - hits[nh] = theOuterHitId; - uint16_t it = foundNtuplets.bulkFill(apc,hits,tmpNtuplet.size()+1); - for (auto c : tmpNtuplet) cells[c].theTracks.push_back(it); + // triplets accepted only pointing to the hole + if (tmpNtuplet.size()>=3 || hole(hh, cells[tmpNtuplet[0]])) { + hindex_type hits[6]; auto nh=0U; + for (auto c : tmpNtuplet) hits[nh++] = cells[c].theInnerHitId; + hits[nh] = theOuterHitId; + uint16_t it = foundNtuplets.bulkFill(apc,hits,tmpNtuplet.size()+1); + for (auto c : tmpNtuplet) cells[c].theTracks.push_back(it); + tupleMultiplicity.countDirect(tmpNtuplet.size()+1); + } } } tmpNtuplet.pop_back(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc similarity index 75% rename from RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc rename to RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index fe95e10a48b5a..393eb9b020c3d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,12 +1,13 @@ -#include "RiemannFitOnGPU.h" +#include "HelixFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -void RiemannFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const * tuples, Rfit::helix_fit * helix_fit_results) { +void HelixFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMultiplicity const * tupleMultiplicity, Rfit::helix_fit * helix_fit_results) { tuples_d = tuples; + tupleMultiplicity_d = tupleMultiplicity; helix_fit_results_d = helix_fit_results; - assert(tuples_d); assert(helix_fit_results_d); + assert(tuples_d); assert(tupleMultiplicity_d); assert(helix_fit_results_d); cudaCheck(cudaMalloc(&hitsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); cudaCheck(cudaMemset(hitsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); @@ -22,7 +23,7 @@ void RiemannFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const * tuples, Rfit: } -void RiemannFitOnGPU::deallocateOnGPU() { +void 
HelixFitOnGPU::deallocateOnGPU() { cudaFree(hitsGPU_); cudaFree(hits_geGPU_); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h similarity index 55% rename from RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h rename to RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index fac88ac2c2bd4..62e7e2a5fddd7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_RiemannFitOnGPU_h -#define RecoPixelVertexing_PixelTrackFitting_plugins_RiemannFitOnGPU_h +#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h +#define RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" @@ -15,26 +15,40 @@ namespace Rfit { using Map3x4d = Eigen::Map >; using Matrix6x4f = Eigen::Matrix; using Map6x4f = Eigen::Map >; + + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map,0,Eigen::Stride<3*stride(),stride()> >; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map,0,Eigen::Stride<6*stride(),stride()> >; + // fast fit using Map4d = Eigen::Map >; } -class RiemannFitOnGPU { +class HelixFitOnGPU { public: using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; + using TupleMultiplicity = CAConstants::TupleMultiplicity; - RiemannFitOnGPU() = default; - ~RiemannFitOnGPU() { deallocateOnGPU();} + explicit HelixFitOnGPU(bool fit5as4) : fit5as4_(fit5as4) {} + ~HelixFitOnGPU() { deallocateOnGPU();} void setBField(double bField) { bField_ = bField;} - void launchKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void allocateOnGPU(TuplesOnGPU::Container const * tuples, Rfit::helix_fit * helix_fit_results); + void allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMultiplicity const * tupleMultiplicity, Rfit::helix_fit * helix_fit_results); void deallocateOnGPU(); @@ -44,17 +58,19 @@ class RiemannFitOnGPU { // fowarded TuplesOnGPU::Container const * tuples_d = nullptr; + TupleMultiplicity const * tupleMultiplicity_d = nullptr; double bField_; Rfit::helix_fit * helix_fit_results_d = nullptr; - - // Riemann Fit internals double *hitsGPU_ = nullptr; float *hits_geGPU_ = nullptr; double *fast_fit_resultsGPU_ = nullptr; Rfit::circle_fit *circle_fit_resultsGPU_ = nullptr; + const bool fit5as4_; + + }; #endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 1bcfb847d2ae8..e9111cb0b5db1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -2,7 +2,7 @@ // Author: Felice Pantaleo, CERN // -#include "RiemannFitOnGPU.h" +#include "HelixFitOnGPU.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" #include 
@@ -21,35 +21,48 @@ using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using namespace Eigen; +template __global__ -void kernelFastFitAllHits(TuplesOnGPU::Container const * __restrict__ foundNtuplets, +void kernelFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, + CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, + uint32_t nHits, HitsOnGPU const * __restrict__ hhp, - int hits_in_fit, double * __restrict__ phits, float * __restrict__ phits_ge, double * __restrict__ pfast_fit, uint32_t offset) { - assert(hits_in_fit==4); // FixMe later template + constexpr uint32_t hitsInFit = N; - assert(pfast_fit); assert(foundNtuplets); + assert(hitsInFit<=nHits); + assert(pfast_fit); assert(foundNtuplets); assert(tupleMultiplicity); + + // look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - auto helix_start = local_start + offset; - if (helix_start>=foundNtuplets->nbins()) return; - if (foundNtuplets->size(helix_start)size(nHits), nHits, hitsInFit); +#endif + + auto tuple_start = local_start + offset; + if (tuple_start >= tupleMultiplicity->size(nHits)) return; + + // get it from the ntuple container (one to one to helix) + auto helix_start = *(tupleMultiplicity->begin(nHits)+tuple_start); + assert (helix_start < foundNtuplets->nbins()); + + assert (foundNtuplets->size(helix_start)==nHits); - Rfit::Map3x4d hits(phits+local_start); + Rfit::Map3xNd hits(phits+local_start); Rfit::Map4d fast_fit(pfast_fit+local_start); - Rfit::Map6x4f hits_ge(phits_ge+local_start); + Rfit::Map6xNf hits_ge(phits_ge+local_start); + // Prepare data structure auto const * hitId = foundNtuplets->begin(helix_start); - for (unsigned int i = 0; i < hits_in_fit; ++i) { + for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); float ge[6]; @@ -69,9 +82,11 @@ void kernelFastFitAllHits(TuplesOnGPU::Container const * __restrict__ foundNtupl } +template __global__ -void kernelCircleFitAllHits(TuplesOnGPU::Container const * __restrict__ foundNtuplets, - int hits_in_fit, +void kernelCircleFit( + CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, + uint32_t nHits, double B, double * __restrict__ phits, float * __restrict__ phits_ge, @@ -79,66 +94,74 @@ void kernelCircleFitAllHits(TuplesOnGPU::Container const * __restrict__ foundNtu Rfit::circle_fit *circle_fit, uint32_t offset) { + assert(circle_fit); + assert(N<=nHits); - auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - auto helix_start = local_start + offset; + // same as above... - if (helix_start>=foundNtuplets->nbins()) return; - if (foundNtuplets->size(helix_start)= tupleMultiplicity->size(nHits)) return; - Rfit::Map3x4d hits(phits+local_start); - Rfit::Map4d fast_fit(pfast_fit_input+local_start); - Rfit::Map6x4f hits_ge(phits_ge+local_start); + // get it for the ntuple container (one to one to helix) + auto helix_start = *(tupleMultiplicity->begin(nHits)+tuple_start); - constexpr uint32_t N = Rfit::Map3x4d::ColsAtCompileTime; - constexpr auto n = N; - assert(4==n); // later will be templated... 
+ Rfit::Map3xNd hits(phits+local_start); + Rfit::Map4d fast_fit(pfast_fit_input+local_start); + Rfit::Map6xNf hits_ge(phits_ge+local_start); - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd<4>::Zero(); + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); Rfit::loadCovariance2D(hits_ge,hits_cov); circle_fit[local_start] = - Rfit::Circle_fit(hits.block(0, 0, 2, n), + Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); -#ifdef GPU_DEBUG -// printf("kernelCircleFitAllHits circle.par(0,1,2): %d %f,%f,%f\n", helix_start, +#ifdef RIEMANN_DEBUG +// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", helix_start, // circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); #endif } + +template __global__ -void kernelLineFitAllHits(TuplesOnGPU::Container const * __restrict__ foundNtuplets, - int hits_in_fit, +void kernelLineFit( + CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, + uint32_t nHits, double B, Rfit::helix_fit *results, double * __restrict__ phits, float * __restrict__ phits_ge, - double * __restrict__ pfast_fit, + double * __restrict__ pfast_fit_input, Rfit::circle_fit * __restrict__ circle_fit, uint32_t offset) { assert(results); assert(circle_fit); + assert(N<=nHits); + + // same as above... + // look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - auto helix_start = local_start + offset; + auto tuple_start = local_start + offset; + if (tuple_start >= tupleMultiplicity->size(nHits)) return; - if (helix_start>=foundNtuplets->nbins()) return; - if (foundNtuplets->size(helix_start)begin(nHits)+tuple_start); + + + Rfit::Map3xNd hits(phits+local_start); + Rfit::Map4d fast_fit(pfast_fit_input+local_start); + Rfit::Map6xNf hits_ge(phits_ge+local_start); - Rfit::Map3x4d hits(phits+local_start); - Rfit::Map4d fast_fit(pfast_fit+local_start); - Rfit::Map6x4f hits_ge(phits_ge+local_start); auto const & line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_start], fast_fit, B, true); par_uvrtopak(circle_fit[local_start], B, true); @@ -157,39 +180,100 @@ void kernelLineFitAllHits(TuplesOnGPU::Container const * __restrict__ foundNtupl helix.chi2_circle = circle_fit[local_start].chi2; helix.chi2_line = line_fit.chi2; -#ifdef GPU_DEBUG - printf("kernelLineFitAllHits circle.par(0,1,2): %d %f,%f,%f\n", helix_start, +#ifdef RIEMANN_DEBUG + printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N,nHits, helix_start, circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); - printf("kernelLineFitAllHits line.par(0,1): %d %f,%f\n", helix_start, line_fit.par(0),line_fit.par(1)); - printf("kernelLineFitAllHits chi2 cov %f/%f %f,%f,%f,%f,%f\n",helix.chi2_circle,helix.chi2_line, + printf("kernelLineFit line.par(0,1): %d %f,%f\n", helix_start, line_fit.par(0),line_fit.par(1)); + printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n",helix.chi2_circle,helix.chi2_line, helix.cov(0,0),helix.cov(1,1),helix.cov(2,2),helix.cov(3,3),helix.cov(4,4)); #endif } -void RiemannFitOnGPU::launchKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream) +void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream) { assert(tuples_d); assert(fast_fit_resultsGPU_); - auto blockSize = 128; + 
auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; for (uint32_t offset=0; offset>>( - tuples_d, hh.gpu_d, 4, + + // triplets + kernelFastFit<3><<>>( + tuples_d, tupleMultiplicity_d, 3, + hh.gpu_d, hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); cudaCheck(cudaGetLastError()); - kernelCircleFitAllHits<<>>( - tuples_d, 4, bField_, + kernelCircleFit<3><<>>( + tupleMultiplicity_d, 3, bField_, hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); + kernelLineFit<3><<>>( + tupleMultiplicity_d, 3, bField_, helix_fit_results_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + // quads + kernelFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, 4, + hh.gpu_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFit<4><<>>( + tupleMultiplicity_d, 4, bField_, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + cudaCheck(cudaGetLastError()); - kernelLineFitAllHits<<>>( - tuples_d, 4, bField_, helix_fit_results_d, + kernelLineFit<4><<>>( + tupleMultiplicity_d, 4, bField_, helix_fit_results_d, hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // penta + kernelFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, 5, + hh.gpu_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFit<4><<>>( + tupleMultiplicity_d, 5, bField_, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + cudaCheck(cudaGetLastError()); + + kernelLineFit<4><<>>( + tupleMultiplicity_d, 5, bField_, helix_fit_results_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } else { + // penta all 5 + kernelFastFit<5><<>>( + tuples_d, tupleMultiplicity_d, 5, + hh.gpu_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFit<5><<>>( + tupleMultiplicity_d, 5, bField_, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + cudaCheck(cudaGetLastError()); + + kernelLineFit<5><<>>( + tupleMultiplicity_d, 5, bField_, helix_fit_results_d, + hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + } + } + } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 796241eaf50ff..4368bc12ab5f8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -54,7 +54,7 @@ namespace gpuPixelDoublets { auto sg=0; for (uint32_t ic=0; ic Date: Mon, 25 Feb 2019 11:46:16 +0100 Subject: [PATCH 045/102] Improve pixel doublets and CA, and extend debugging functionality (cms-patatrack#338) Improve pixel doublets and CA: - add pixel cluster size and shape cuts in doublets; - add triplet cleaner; - improved cluster size studies - implement layer-dependent cuts in the CA. Add counters in GPU code and possibility to test full doublet combinatorics. Update python notebook and include z0 resolution. 
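
A minimal standalone sketch of the doublet-level cluster-size cut described above, assuming the thresholds that appear in the gpuPixelDoublets.h hunk of this patch; the function name, the layer numbering convention (0-3 barrel, 4-9 forward) and the exact comparisons against the minimum-size constants are illustrative assumptions, not verbatim production code.

// Illustrative sketch of the cluster-length cut applied when building doublets.
// ysize is the cluster length along local y (z in the barrel), in 1/8 pixel
// units; a negative value means the length could not be measured.
#include <cstdint>
#include <cstdlib>

inline bool passClusterSizeCut(int innerLayer, int outerLayer, bool isOuterLadder,
                               int32_t ysizeInner, int32_t ysizeOuter) {
  constexpr int32_t minYsizeB1  = 36;   // minimum length for a BPix1 hit
  constexpr int32_t minYsizeB2  = 28;   // minimum length for a BPix2 hit
  constexpr int32_t maxDYsize12 = 28;   // max length difference, BPix1-BPix2 pairs
  constexpr int32_t maxDYsize   = 20;   // max length difference, other barrel pairs

  // reject short clusters on the innermost layers when the outer hit is in a forward disk
  if (innerLayer == 0 && outerLayer > 3 && isOuterLadder && ysizeInner > 0 && ysizeInner < minYsizeB1)
    return false;
  if (innerLayer == 1 && outerLayer > 3 && ysizeInner > 0 && ysizeInner < minYsizeB2)
    return false;

  // for barrel-barrel pairs require the two cluster lengths to be compatible
  bool onlyBarrel = outerLayer < 4;
  int32_t maxDy = (innerLayer == 0) ? (isOuterLadder ? maxDYsize12 : 100) : maxDYsize;
  if (onlyBarrel && ysizeInner > 0 && ysizeOuter > 0 && std::abs(ysizeOuter - ysizeInner) > maxDy)
    return false;

  return true;
}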
--- .../PixelTriplets/plugins/CAConstants.h | 13 ++- .../PixelTriplets/plugins/GPUCACell.h | 20 +++-- .../PixelTriplets/plugins/HelixFitOnGPU.h | 2 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 90 ++++++++++++++++--- 4 files changed, 100 insertions(+), 25 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 1c7777ee37b3e..48a9ec7adf2f7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -8,16 +8,23 @@ #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h" +// #define ONLY_PHICUT namespace CAConstants { // constants - constexpr uint32_t maxNumberOfQuadruplets() { return 10000; } + + constexpr uint32_t maxNumberOfQuadruplets() { return 6*1024; } +#ifndef ONLY_PHICUT + constexpr uint32_t maxNumberOfDoublets() { return 262144; } constexpr uint32_t maxCellsPerHit() { return 128; } +#else + constexpr uint32_t maxNumberOfDoublets() { return 6*262144; } + constexpr uint32_t maxCellsPerHit() { return 4*128; } +#endif constexpr uint32_t maxNumberOfLayerPairs() { return 13; } constexpr uint32_t maxNumberOfLayers() { return 10; } - constexpr uint32_t maxNumberOfDoublets() { return 262144; } - constexpr uint32_t maxTuples() { return 10000;} + constexpr uint32_t maxTuples() { return 6*1024;} // types using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index c9abaae27ab8b..b3e23792a4083 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -18,8 +18,8 @@ class GPUCACell { public: - static constexpr int maxCellsPerHit = 128; // was 256 - using OuterHitOfCell = GPU::VecArray< unsigned int, maxCellsPerHit>; + static constexpr int maxCellsPerHit = CAConstants::maxCellsPerHit(); + using OuterHitOfCell = CAConstants::OuterHitOfCell; using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; @@ -93,7 +93,8 @@ class GPUCACell { auto r1 = otherCell.get_inner_r(hh); auto z1 = otherCell.get_inner_z(hh); - bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, 0.003f); // 2.f*thetaCut); // FIXME tune cuts + auto isBarrel = otherCell.get_outer_detId(hh)<1184; + bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, isBarrel ? 0.002f : 0.003f); // 2.f*thetaCut); // FIXME tune cuts return (aligned && dcaCut(hh, otherCell, otherCell.get_inner_detId(hh)<96 ? 0.15f : 0.25f, hardCurvCut)); // FIXME tune cuts // region_origin_radius_plus_tolerance, hardCurvCut)); } @@ -183,14 +184,19 @@ class GPUCACell { } } else { // if long enough save... if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet-1) { +#ifndef ALL_TRIPLETS // triplets accepted only pointing to the hole - if (tmpNtuplet.size()>=3 || hole(hh, cells[tmpNtuplet[0]])) { + if (tmpNtuplet.size()>=3 || hole(hh, cells[tmpNtuplet[0]])) +#endif + { hindex_type hits[6]; auto nh=0U; for (auto c : tmpNtuplet) hits[nh++] = cells[c].theInnerHitId; hits[nh] = theOuterHitId; - uint16_t it = foundNtuplets.bulkFill(apc,hits,tmpNtuplet.size()+1); - for (auto c : tmpNtuplet) cells[c].theTracks.push_back(it); - tupleMultiplicity.countDirect(tmpNtuplet.size()+1); + auto it = foundNtuplets.bulkFill(apc,hits,tmpNtuplet.size()+1); + if (it>=0) { // if negative is overflow.... 
+ for (auto c : tmpNtuplet) cells[c].theTracks.push_back(it); + tupleMultiplicity.countDirect(tmpNtuplet.size()+1); + } } } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 62e7e2a5fddd7..40a62c4c1c723 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -9,7 +9,7 @@ namespace siPixelRecHitsHeterogeneousProduct { } namespace Rfit { - constexpr uint32_t maxNumberOfConcurrentFits() { return 2*1024;} + constexpr uint32_t maxNumberOfConcurrentFits() { return 6*1024;} constexpr uint32_t stride() { return maxNumberOfConcurrentFits();} using Matrix3x4d = Eigen::Matrix; using Map3x4d = Eigen::Map >; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 0e67c4cc3e28b..7007554fce9e6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -15,6 +15,12 @@ #include "GPUCACell.h" #include "CAConstants.h" + +// useful for benchmark +// #define ONLY_PHICUT +// #define USE_ZCUT +// #define NO_CLSCUT + namespace gpuPixelDoublets { constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant @@ -32,10 +38,21 @@ namespace gpuPixelDoublets { siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, GPUCACell::OuterHitOfCell * isOuterHitOfCell, int16_t const * __restrict__ phicuts, +#ifdef USE_ZCUT float const * __restrict__ minz, float const * __restrict__ maxz, - float const * __restrict__ maxr) +#endif + float const * __restrict__ maxr, bool ideal_cond) { + +#ifndef NO_CLSCUT + // ysize cuts (z in the barrel) times 8 + constexpr int minYsizeB1=36; + constexpr int minYsizeB2=28; + constexpr int maxDYsize12=28; + constexpr int maxDYsize=20; +#endif + auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; // nPairsMax to be optimized later (originally was 64). @@ -79,10 +96,33 @@ namespace gpuPixelDoublets { assert(i < offsets[inner+1]); // found hit corresponding to our cuda thread, now do the job - auto mep = iphi[i]; auto mez = __ldg(hh.zg_d+i); - auto mer = __ldg(hh.rg_d+i); +#ifdef USE_ZCUT + // this statement is responsible for a 10% slow down of the kernel once all following cuts are optimized... + if (mezmaxz[pairLayerId]) continue; +#endif + +#ifndef NO_CLSCUT + auto mes = __ldg(hh.ysize_d+i); + + // if ideal treat inner ladder as outer + auto mi = __ldg(hh.detInd_d+i); + if (inner==0) assert(mi<96); + const bool isOuterLadder = ideal_cond ? true : 0 == (mi/8)%2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... + + // auto mesx = __ldg(hh.xsize_d+i); + // if (mesx<0) continue; // remove edges in x as overlap will take care + + if (inner==0 && outer>3 && isOuterLadder) // B1 and F1 + if (mes>0 && mes3) // B2 and F1 + if (mes>0 && mes z0cut*dr; }; +#ifndef NO_CLSCUT + auto zsizeCut = [&](int j) { + auto onlyBarrel = outer<4; + auto so = __ldg(hh.ysize_d+j); + //auto sox = __ldg(hh.xsize_d+j); + auto dy = inner==0 ? ( isOuterLadder ? 
maxDYsize12: 100 ) : maxDYsize; + return onlyBarrel && mes>0 && so>0 && std::abs(so-mes)>dy; + }; +#endif + auto iphicut = phicuts[pairLayerId]; auto kl = Hist::bin(int16_t(mep-iphicut)); @@ -131,7 +181,12 @@ namespace gpuPixelDoublets { if (std::min(std::abs(int16_t(iphi[oi]-mep)), std::abs(int16_t(mep-iphi[oi]))) > iphicut) continue; +#ifndef ONLY_PHICUT +#ifndef NO_CLSCUT + if (zsizeCut(oi)) continue; +#endif if (z0cutoff(oi) || ptcut(oi)) continue; +#endif auto ind = atomicAdd(nCells, 1); if (ind>=MaxNumOfDoublets) {atomicSub(nCells, 1); break; } // move to SimpleVector?? // int layerPairId, int doubletId, int innerHitId, int outerHitId) @@ -158,14 +213,15 @@ namespace gpuPixelDoublets { void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, - GPUCACell::OuterHitOfCell * isOuterHitOfCell) + GPUCACell::OuterHitOfCell * isOuterHitOfCell, + bool ideal_cond) { constexpr int nPairs = 13; constexpr const uint8_t layerPairs[2*nPairs] = { 0, 1, 1, 2, 2, 3, - // 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, - 0, 7, 1, 7, 2, 7, 7, 8, 8, 9, - 0, 4, 1, 4, 2, 4, 4, 5, 5, 6 + // 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, + 0, 7, 1, 7, 2, 7, 7, 8, 8, 9, // neg + 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, // pos }; constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); @@ -178,17 +234,19 @@ namespace gpuPixelDoublets { phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 }; +#ifdef USE_ZCUT float const minz[nPairs] = { - 0., 0., 0., - 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0. + -20., -22., -22., + -30., -30.,-30., -70., -70., + 0., 10., 15., -70., -70. }; float const maxz[nPairs] = { - 20., 15., 12., - 30., 20., 20., 50., 50., - 30., 20., 20., 50., 50. + 20., 22., 22., + 0., -10., -15., 70., 70., + 30., 30., 30., 70., 70. }; +#endif float const maxr[nPairs] = { 20., 20., 20., @@ -200,7 +258,11 @@ namespace gpuPixelDoublets { doubletsFromHisto(layerPairs, nPairs, cells, nCells, hh.iphi_d, *hh.hist_d, hh.hitsLayerStart_d, hh, isOuterHitOfCell, - phicuts, minz, maxz, maxr); + phicuts, +#ifdef USE_ZCUT + minz, maxz, +#endif + maxr , ideal_cond); } From 260c0b21ea0bb009c12f5e062e4f73763c45fe94 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 29 Mar 2019 06:50:23 -0400 Subject: [PATCH 046/102] Migrate the pixel rechits producer and CA to the new heterogeneous framework (cms-patatrack#338) Use cleaned hits. Use pixel layer and ladders geometry, and use pixel triplets in the gaps. Optimise GPU memory usage: - reduce the number of memory allocations - fix the size of the cub workspace - allocate memory per event via the caching allocator - use constant memory for geometry and parameters - use shared memory where the content is the same for every thread Optimise kernel launches, and add a protection for empty events and overflows. 
--- .../customizePixelTracksForProfiling.py | 2 - .../PixelTrackFitting/interface/FitResult.h | 4 +- .../plugins/BrokenLineFitOnGPU.cu | 68 +++-- .../PixelTriplets/plugins/BuildFile.xml | 2 - .../PixelTriplets/plugins/CAConstants.h | 30 +- .../PixelTriplets/plugins/GPUCACell.h | 94 ++++-- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 17 -- .../PixelTriplets/plugins/HelixFitOnGPU.h | 22 +- .../PixelTriplets/plugins/RecHitsMap.h | 3 + .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 80 +++--- .../PixelTriplets/plugins/gpuFishbone.h | 6 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 271 +++--------------- .../plugins/gpuPixelDoubletsAlgos.h | 220 ++++++++++++++ 13 files changed, 455 insertions(+), 364 deletions(-) create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 58935e9a6991c..1021918c0ce6c 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -21,7 +21,6 @@ def customizePixelTracksForProfilingDisableConversion(process): process = customizePixelTracksForProfiling(process) # Disable conversions to legacy - process.siPixelRecHitsPreSplitting.gpuEnableConversion = False process.pixelTracksHitQuadruplets.gpuEnableConversion = False process.pixelTracks.gpuEnableConversion = False process.pixelVertices.gpuEnableConversion = False @@ -32,7 +31,6 @@ def customizePixelTracksForProfilingDisableTransfer(process): process = customizePixelTracksForProfilingDisableConversion(process) # Disable "unnecessary" transfers to CPU - process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False process.pixelTracksHitQuadruplets.gpuEnableTransfer = False process.pixelVertices.gpuEnableTransfer = False diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h index 0d9be5a346d0a..e6ab9f93ca306 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h @@ -38,7 +38,7 @@ namespace Rfit |cov(X0, R)|cov(Y0, R)|cov( R, R)| */ int32_t q; //!< particle charge - float chi2 = 0.0; + float chi2; }; struct line_fit @@ -49,7 +49,7 @@ namespace Rfit |cov(c_t,c_t)|cov(Zip,c_t)| \n |cov(c_t,Zip)|cov(Zip,Zip)| */ - double chi2 = 0.0; + double chi2; }; struct helix_fit diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index a0b2b9e56f59c..894a80616af02 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -11,12 +11,12 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" -using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; - -using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; +using HitsOnGPU = TrackingRecHit2DSOAView; 
using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using namespace Eigen; @@ -75,14 +75,14 @@ void kernelBLFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; float ge[6]; - hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge); + hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); #ifdef BL_DUMP_HITS if (dump){ - printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detInd_d[hit],i,hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); - printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detInd_d[hit],i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detectorIndex(hit),i,hhp->xGlobal(hit),hhp->yGlobal(hit),hhp->zGlobal(hit)); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detetectorIndex(hit),i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); } #endif - hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit]; + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; } BrokenLine::BL_Fast_fit(hits,fast_fit); @@ -167,65 +167,71 @@ void kernelBLFit( } -void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t cudaStream) +void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream) { - assert(tuples_d); assert(fast_fit_resultsGPU_); + assert(tuples_d); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - for (uint32_t offset=0; offset cs; + auto hitsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream); + auto hits_geGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream); + auto fast_fit_resultsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream); + + for (uint32_t offset=0; offset<<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<3><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<3><<>>( + kernelBLFit<3><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); // fit quads - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>( + kernelBLFit<4><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<4><<>>( + 
tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>( + kernelBLFit<4><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); } else { // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + kernelBLFastFit<5><<>>( + tuples_d, tupleMultiplicity_d, hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<5><<>>( + kernelBLFit<5><<>>( tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 3c8397cf572f6..d4140692181bf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -11,8 +11,6 @@ - - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 48a9ec7adf2f7..a33c613402dcf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -6,32 +6,52 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" -#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" // #define ONLY_PHICUT namespace CAConstants { // constants - - constexpr uint32_t maxNumberOfQuadruplets() { return 6*1024; } +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumberOfTuples() { return 3*1024;} +#else + constexpr uint32_t maxNumberOfTuples() { return 6*1024;} +#endif + constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT +#ifndef GPU_SMALL_EVENTS constexpr uint32_t maxNumberOfDoublets() { return 262144; } constexpr uint32_t maxCellsPerHit() { return 128; } +#else + constexpr uint32_t maxNumberOfDoublets() { return 262144/2; } + constexpr uint32_t maxCellsPerHit() { return 128/2; } +#endif #else constexpr uint32_t maxNumberOfDoublets() { return 6*262144; } constexpr uint32_t maxCellsPerHit() { return 4*128; } #endif + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets()/4;} + + constexpr uint32_t maxNumberOfLayerPairs() { return 13; } constexpr uint32_t maxNumberOfLayers() { return 10; } - constexpr uint32_t maxTuples() { return 6*1024;} + constexpr uint32_t maxTuples() { return maxNumberOfTuples();} // types using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct using tindex_type = uint16_t; // for tuples + + using CellNeighbors = GPU::VecArray< uint32_t, 36>; + using CellTracks = GPU::VecArray< tindex_type, 42>; + + using CellNeighborsVector = GPU::SimpleVector; + using CellTracksVector = GPU::SimpleVector; + using OuterHitOfCell = GPU::VecArray< uint32_t, maxCellsPerHit()>; using TuplesContainer = OneToManyAssoc; - using HitToTuple = OneToManyAssoc; // 
3.5 should be enough + using HitToTuple = OneToManyAssoc; // 3.5 should be enough using TupleMultiplicity = OneToManyAssoc; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index b3e23792a4083..f20a96dc79b73 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -6,24 +6,28 @@ #include +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" - #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" class GPUCACell { public: + using ptrAsInt = unsigned long long; + static constexpr int maxCellsPerHit = CAConstants::maxCellsPerHit(); using OuterHitOfCell = CAConstants::OuterHitOfCell; + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; - - using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + using Hits = TrackingRecHit2DSOAView; + using hindex_type = Hits::hindex_type; using TmpTuple = GPU::VecArray; @@ -35,7 +39,8 @@ class GPUCACell { #ifdef __CUDACC__ __device__ __forceinline__ - void init(Hits const & hh, + void init(CellNeighborsVector & cellNeighbors, CellTracksVector & cellTracks, + Hits const & hh, int layerPairId, int doubletId, hindex_type innerHitId, hindex_type outerHitId) { @@ -44,23 +49,45 @@ class GPUCACell { theDoubletId = doubletId; theLayerPairId = layerPairId; - theInnerZ = __ldg(hh.zg_d+innerHitId); - theInnerR = __ldg(hh.rg_d+innerHitId); - theOuterNeighbors.reset(); - theTracks.reset(); + theInnerZ = hh.zGlobal(innerHitId); + theInnerR = hh.rGlobal(innerHitId); + + outerNeighbors().reset(); + tracks().reset(); + assert(outerNeighbors().empty()); + assert(tracks().empty()); + } - __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); } - __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); } - __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); } - __device__ __forceinline__ float get_inner_detId(Hits const & hh) const { return __ldg(hh.detInd_d+theInnerHitId); } - __device__ __forceinline__ float get_outer_detId(Hits const & hh) const { return __ldg(hh.detInd_d+theOuterHitId); } + 
__device__ __forceinline__ + int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector & cellNeighbors) { + return outerNeighbors().push_back(t); + } + + __device__ __forceinline__ + int addTrack(CellTracks::value_t t, CellTracksVector & cellTracks) { + return tracks().push_back(t); + } + + __device__ __forceinline__ CellTracks & tracks() { return theTracks;} + __device__ __forceinline__ CellTracks const & tracks() const { return theTracks;} + __device__ __forceinline__ CellNeighbors & outerNeighbors() { return theOuterNeighbors;} + __device__ __forceinline__ CellNeighbors const & outerNeighbors() const { return theOuterNeighbors;} + __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return hh.xGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return hh.xGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return hh.yGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return hh.yGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return hh.zGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return hh.rGlobal(theOuterHitId); } + + __device__ __forceinline__ auto get_inner_iphi(Hits const & hh) const { return hh.iphi(theInnerHitId); } + __device__ __forceinline__ auto get_outer_iphi(Hits const & hh) const { return hh.iphi(theOuterHitId); } + + __device__ __forceinline__ float get_inner_detId(Hits const & hh) const { return hh.detectorIndex(theInnerHitId); } + __device__ __forceinline__ float get_outer_detId(Hits const & hh) const { return hh.detectorIndex(theOuterHitId); } constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; @@ -143,13 +170,19 @@ class GPUCACell { __device__ inline bool hole(Hits const & hh, GPUCACell const & innerCell) const { - constexpr float r4 = 16.f; + int p = get_outer_iphi(hh); + if (p<0) p+=std::numeric_limits::max(); + p = (64*p)/std::numeric_limits::max(); + p %=2; + float r4 = p==0 ? 
15.815 : 16.146; // later on from geom auto ri = innerCell.get_inner_r(hh); auto zi = innerCell.get_inner_z(hh); auto ro = get_outer_r(hh); auto zo = get_outer_z(hh); auto z4 = std::abs(zi + (r4-ri)*(zo-zi)/(ro-ri)); - return z4>25.f && z4<33.f; + auto zm = z4-6.7*int(z4/6.7); + auto h = zm<0.2 || zm>6.5; + return h || ( z4>26 && z4<32.f); } @@ -161,6 +194,7 @@ class GPUCACell { inline void find_ntuplets( Hits const & hh, GPUCACell * __restrict__ cells, + CellTracksVector & cellTracks, TuplesOnGPU::Container & foundNtuplets, AtomicPairCounter & apc, CM & tupleMultiplicity, @@ -176,10 +210,10 @@ class GPUCACell { tmpNtuplet.push_back_unsafe(theDoubletId); assert(tmpNtuplet.size()<=4); - if(theOuterNeighbors.size()>0) { - for (int j = 0; j < theOuterNeighbors.size(); ++j) { - auto otherCell = theOuterNeighbors[j]; - cells[otherCell].find_ntuplets(hh, cells, foundNtuplets, apc, tupleMultiplicity, + if(outerNeighbors().size()>0) { + for (int j = 0; j < outerNeighbors().size(); ++j) { + auto otherCell = outerNeighbors()[j]; + cells[otherCell].find_ntuplets(hh, cells, cellTracks, foundNtuplets, apc, tupleMultiplicity, tmpNtuplet, minHitsPerNtuplet); } } else { // if long enough save... @@ -194,7 +228,7 @@ class GPUCACell { hits[nh] = theOuterHitId; auto it = foundNtuplets.bulkFill(apc,hits,tmpNtuplet.size()+1); if (it>=0) { // if negative is overflow.... - for (auto c : tmpNtuplet) cells[c].theTracks.push_back(it); + for (auto c : tmpNtuplet) cells[c].addTrack(it,cellTracks); tupleMultiplicity.countDirect(tmpNtuplet.size()+1); } } @@ -206,9 +240,11 @@ class GPUCACell { #endif // __CUDACC__ - GPU::VecArray< uint32_t, 36> theOuterNeighbors; - GPU::VecArray< uint16_t, 42> theTracks; +private: + CellNeighbors theOuterNeighbors; + CellTracks theTracks; +public: int32_t theDoubletId; int32_t theLayerPairId; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index 393eb9b020c3d..b2eab7626279e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -9,27 +9,10 @@ void HelixFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMu assert(tuples_d); assert(tupleMultiplicity_d); assert(helix_fit_results_d); - cudaCheck(cudaMalloc(&hitsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); - cudaCheck(cudaMemset(hitsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>))); - - cudaCheck(cudaMalloc(&hits_geGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f))); - cudaCheck(cudaMemset(hits_geGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f))); - - cudaCheck(cudaMalloc(&fast_fit_resultsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d))); - cudaCheck(cudaMemset(fast_fit_resultsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d))); - - cudaCheck(cudaMalloc(&circle_fit_resultsGPU_, maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit))); - cudaCheck(cudaMemset(circle_fit_resultsGPU_, 0x00, maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit))); - } void HelixFitOnGPU::deallocateOnGPU() { - cudaFree(hitsGPU_); - cudaFree(hits_geGPU_); - cudaFree(fast_fit_resultsGPU_); - cudaFree(circle_fit_resultsGPU_); - } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 40a62c4c1c723..a50116c87321d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -4,9 +4,11 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" -namespace siPixelRecHitsHeterogeneousProduct { - struct HitsOnCPU; -} +#include + +class TrackingRecHit2DSOAView; +class TrackingRecHit2DCUDA; + namespace Rfit { constexpr uint32_t maxNumberOfConcurrentFits() { return 6*1024;} @@ -35,8 +37,8 @@ namespace Rfit { class HelixFitOnGPU { public: - using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; - using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using TupleMultiplicity = CAConstants::TupleMultiplicity; @@ -45,8 +47,8 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU();} void setBField(double bField) { bField_ = bField;} - void launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); + void launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); void allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMultiplicity const * tupleMultiplicity, Rfit::helix_fit * helix_fit_results); void deallocateOnGPU(); @@ -62,12 +64,6 @@ class HelixFitOnGPU { double bField_; Rfit::helix_fit * helix_fit_results_d = nullptr; - // Riemann Fit internals - double *hitsGPU_ = nullptr; - float *hits_geGPU_ = nullptr; - double *fast_fit_resultsGPU_ = nullptr; - Rfit::circle_fit *circle_fit_resultsGPU_ = nullptr; - const bool fit5as4_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h index d27a639a5a9bf..f7538fc822011 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h @@ -8,6 +8,9 @@ #include #include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" +#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/SiStripCluster/interface/SiStripCluster.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" // store T for each cluster diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index e9111cb0b5db1..ea801b1b46389 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -11,12 +11,14 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; -using HitsOnGPU = 
siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + +using HitsOnGPU = TrackingRecHit2DSOAView; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using namespace Eigen; @@ -66,10 +68,10 @@ void kernelFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, auto hit = hitId[i]; // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); float ge[6]; - hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge); + hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); - hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit]; + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; } Rfit::Fast_fit(hits,fast_fit); @@ -190,85 +192,93 @@ void kernelLineFit( } -void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream) +void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream) { - assert(tuples_d); assert(fast_fit_resultsGPU_); + assert(tuples_d); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + // Fit internals + edm::Service cs; + auto hitsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream); + auto hits_geGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream); + auto fast_fit_resultsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream); + auto circle_fit_resultsGPU_holder = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit),stream); + Rfit::circle_fit * circle_fit_resultsGPU_ = (Rfit::circle_fit*)(circle_fit_resultsGPU_holder.get()); + for (uint32_t offset=0; offset<<>>( + kernelFastFit<3><<>>( tuples_d, tupleMultiplicity_d, 3, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<3><<>>( + kernelCircleFit<3><<>>( tupleMultiplicity_d, 3, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<3><<>>( + kernelLineFit<3><<>>( tupleMultiplicity_d, 3, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); // quads - kernelFastFit<4><<>>( + kernelFastFit<4><<>>( tuples_d, tupleMultiplicity_d, 4, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>( + kernelCircleFit<4><<>>( tupleMultiplicity_d, 4, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>( + 
kernelLineFit<4><<>>( tupleMultiplicity_d, 4, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // penta - kernelFastFit<4><<>>( + kernelFastFit<4><<>>( tuples_d, tupleMultiplicity_d, 5, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>( + kernelCircleFit<4><<>>( tupleMultiplicity_d, 5, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>( + kernelLineFit<4><<>>( tupleMultiplicity_d, 5, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernelFastFit<5><<>>( + kernelFastFit<5><<>>( tuples_d, tupleMultiplicity_d, 5, - hh.gpu_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,offset); + hh.view(), + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<5><<>>( + kernelCircleFit<5><<>>( tupleMultiplicity_d, 5, bField_, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, offset); + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<5><<>>( + kernelLineFit<5><<>>( tupleMultiplicity_d, 5, bField_, helix_fit_results_d, - hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_, circle_fit_resultsGPU_, + hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 4368bc12ab5f8..e5f8406bd31b8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -10,7 +10,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "DataFormats/Math/interface/approx_atan2.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" #include "GPUCACell.h" @@ -32,8 +31,7 @@ namespace gpuPixelDoublets { auto const & hh = *hhp; - uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d; - auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);}; + auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id);}; // x run faster... 
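As a rough illustration of the chunked launch pattern used in launchRiemannKernels above (one kernel launch per slice of at most maxNumberOfConcurrentFits_ tuples, each launch taking an offset argument), here is a minimal standalone CUDA sketch; the kernel name, sizes and data are made up for illustration only.

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// toy kernel: each launch handles one chunk starting at `offset`
__global__ void scale(float* data, unsigned int total, unsigned int offset) {
  unsigned int i = offset + blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= total) return;  // guard the tail of the last chunk
  data[i] *= 2.f;
}

int main() {
  constexpr unsigned int kTotal = 10000;         // e.g. number of tuples to fit
  constexpr unsigned int kMaxConcurrent = 4096;  // plays the role of maxNumberOfConcurrentFits_
  constexpr unsigned int kBlockSize = 64;

  std::vector<float> host(kTotal, 1.f);
  float* dev = nullptr;
  cudaMalloc(&dev, kTotal * sizeof(float));
  cudaMemcpy(dev, host.data(), kTotal * sizeof(float), cudaMemcpyHostToDevice);

  // one launch per chunk, mirroring the offset loop in the fit code
  unsigned int blocks = (kMaxConcurrent + kBlockSize - 1) / kBlockSize;
  for (unsigned int offset = 0; offset < kTotal; offset += kMaxConcurrent)
    scale<<<blocks, kBlockSize>>>(dev, kTotal, offset);

  cudaMemcpy(host.data(), dev, kTotal * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(dev);
  std::printf("first=%f last=%f\n", host.front(), host.back());
  return 0;
}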
auto idy = threadIdx.y + blockIdx.y * blockDim.y; @@ -54,7 +52,7 @@ namespace gpuPixelDoublets { auto sg=0; for (uint32_t ic=0; ic -#include -#include -#include -#include +#include "RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h" -#include "DataFormats/Math/interface/approx_atan2.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" - -#include "GPUCACell.h" -#include "CAConstants.h" - - -// useful for benchmark -// #define ONLY_PHICUT -// #define USE_ZCUT -// #define NO_CLSCUT +#define CONSTANT_VAR __constant__ namespace gpuPixelDoublets { - constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant - - template - __device__ - __forceinline__ - void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, - uint32_t nPairs, - GPUCACell * cells, - uint32_t * nCells, - int16_t const * __restrict__ iphi, - Hist const & __restrict__ hist, - uint32_t const * __restrict__ offsets, - siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, - GPUCACell::OuterHitOfCell * isOuterHitOfCell, - int16_t const * __restrict__ phicuts, -#ifdef USE_ZCUT - float const * __restrict__ minz, - float const * __restrict__ maxz, -#endif - float const * __restrict__ maxr, bool ideal_cond) - { - -#ifndef NO_CLSCUT - // ysize cuts (z in the barrel) times 8 - constexpr int minYsizeB1=36; - constexpr int minYsizeB2=28; - constexpr int maxDYsize12=28; - constexpr int maxDYsize=20; -#endif - - auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; - - // nPairsMax to be optimized later (originally was 64). - // If it should be much bigger, consider using a block-wide parallel prefix scan, - // e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html - const int nPairsMax = 16; - assert(nPairs <= nPairsMax); - uint32_t innerLayerCumulativeSize[nPairsMax]; - innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); - for (uint32_t i = 1; i < nPairs; ++i) { - innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]); - } - auto ntot = innerLayerCumulativeSize[nPairs-1]; - - // x runs faster - auto idy = blockIdx.y * blockDim.y + threadIdx.y; - auto first = threadIdx.x; - auto stride = blockDim.x; - for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) { - - uint32_t pairLayerId=0; - while (j >= innerLayerCumulativeSize[pairLayerId++]); - --pairLayerId; // move to lower_bound ?? - - assert(pairLayerId < nPairs); - assert(j < innerLayerCumulativeSize[pairLayerId]); - assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]); - - uint8_t inner = layerPairs[2*pairLayerId]; - uint8_t outer = layerPairs[2*pairLayerId+1]; - assert(outer > inner); - - auto hoff = Hist::histOff(outer); - - auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; - i += offsets[inner]; - - // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); - - assert(i >= offsets[inner]); - assert(i < offsets[inner+1]); - - // found hit corresponding to our cuda thread, now do the job - auto mez = __ldg(hh.zg_d+i); - -#ifdef USE_ZCUT - // this statement is responsible for a 10% slow down of the kernel once all following cuts are optimized... 
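The doubletsFromHisto kernel above flattens all (layer pair, inner hit) combinations into a single index range and recovers the pair id with a search over innerLayerCumulativeSize. A small host-side sketch of that decomposition, with made-up layer sizes, is:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int nPairs = 4;
  uint32_t layerSize[nPairs] = {5, 3, 7, 2};  // hypothetical inner-layer sizes per pair

  uint32_t cumulative[nPairs];
  cumulative[0] = layerSize[0];
  for (int i = 1; i < nPairs; ++i)
    cumulative[i] = cumulative[i - 1] + layerSize[i];
  uint32_t ntot = cumulative[nPairs - 1];  // 17 flattened indices in total

  for (uint32_t j = 0; j < ntot; ++j) {
    uint32_t pairId = 0;
    while (j >= cumulative[pairId++])
      ;
    --pairId;  // same linear search as in the kernel
    uint32_t local = (pairId == 0) ? j : j - cumulative[pairId - 1];
    assert(local < layerSize[pairId]);
    std::printf("j=%2u -> pair %u, local hit %u\n", unsigned(j), unsigned(pairId), unsigned(local));
  }
  return 0;
}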
- if (mezmaxz[pairLayerId]) continue; -#endif - -#ifndef NO_CLSCUT - auto mes = __ldg(hh.ysize_d+i); - - // if ideal treat inner ladder as outer - auto mi = __ldg(hh.detInd_d+i); - if (inner==0) assert(mi<96); - const bool isOuterLadder = ideal_cond ? true : 0 == (mi/8)%2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... - - // auto mesx = __ldg(hh.xsize_d+i); - // if (mesx<0) continue; // remove edges in x as overlap will take care - - if (inner==0 && outer>3 && isOuterLadder) // B1 and F1 - if (mes>0 && mes3) // B2 and F1 - if (mes>0 && mes (ro-ri)*(ro-ri); - }; - auto z0cutoff = [&](int j) { - auto zo = __ldg(hh.zg_d+j); - auto ro = __ldg(hh.rg_d+j); - auto dr = ro-mer; - return dr > maxr[pairLayerId] || - dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; - }; - -#ifndef NO_CLSCUT - auto zsizeCut = [&](int j) { - auto onlyBarrel = outer<4; - auto so = __ldg(hh.ysize_d+j); - //auto sox = __ldg(hh.xsize_d+j); - auto dy = inner==0 ? ( isOuterLadder ? maxDYsize12: 100 ) : maxDYsize; - return onlyBarrel && mes>0 && so>0 && std::abs(so-mes)>dy; - }; -#endif - - auto iphicut = phicuts[pairLayerId]; - - auto kl = Hist::bin(int16_t(mep-iphicut)); - auto kh = Hist::bin(int16_t(mep+iphicut)); - auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; - -#ifdef GPU_DEBUG - int tot = 0; - int nmin = 0; - int tooMany=0; -#endif - - auto khh = kh; - incr(khh); - for (auto kk = kl; kk != khh; incr(kk)) { -#ifdef GPU_DEBUG - if (kk != kl && kk != kh) - nmin += hist.size(kk+hoff); -#endif - auto const * __restrict__ p = hist.begin(kk+hoff); - auto const * __restrict__ e = hist.end(kk+hoff); - p+=first; - for (;p < e; p+=stride) { - auto oi=__ldg(p); - assert(oi>=offsets[outer]); - assert(oi iphicut) - continue; -#ifndef ONLY_PHICUT -#ifndef NO_CLSCUT - if (zsizeCut(oi)) continue; -#endif - if (z0cutoff(oi) || ptcut(oi)) continue; -#endif - auto ind = atomicAdd(nCells, 1); - if (ind>=MaxNumOfDoublets) {atomicSub(nCells, 1); break; } // move to SimpleVector?? - // int layerPairId, int doubletId, int innerHitId, int outerHitId) - cells[ind].init(hh, pairLayerId, ind, i, oi); - isOuterHitOfCell[oi].push_back(ind); -#ifdef GPU_DEBUG - if (isOuterHitOfCell[oi].full()) ++tooMany; - ++tot; -#endif - } - } -#ifdef GPU_DEBUG - if (tooMany > 0) - printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); -#endif - } // loop in block... - } + using namespace gpuPixelDoubletsAlgos; - constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y - constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; - - __global__ - __launch_bounds__(getDoubletsFromHistoMaxBlockSize,getDoubletsFromHistoMinBlocksPerMP) - void getDoubletsFromHisto(GPUCACell * cells, - uint32_t * nCells, - siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, - GPUCACell::OuterHitOfCell * isOuterHitOfCell, - bool ideal_cond) - { constexpr int nPairs = 13; - constexpr const uint8_t layerPairs[2*nPairs] = { + CONSTANT_VAR const uint8_t layerPairs[2*nPairs] = { 0, 1, 1, 2, 2, 3, // 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, 0, 7, 1, 7, 2, 7, 7, 8, 8, 9, // neg @@ -228,41 +21,71 @@ namespace gpuPixelDoublets { constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); constexpr int16_t phi0p07 = 730; // round(730.12648...) 
= phi2short(0.07); - constexpr const int16_t phicuts[nPairs] { + CONSTANT_VAR const int16_t phicuts[nPairs] { phi0p05, phi0p05, phi0p06, phi0p07, phi0p06, phi0p06, phi0p05, phi0p05, phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 }; -#ifdef USE_ZCUT - float const minz[nPairs] = { + CONSTANT_VAR float const minz[nPairs] = { -20., -22., -22., -30., -30.,-30., -70., -70., 0., 10., 15., -70., -70. }; - float const maxz[nPairs] = { + CONSTANT_VAR float const maxz[nPairs] = { 20., 22., 22., 0., -10., -15., 70., 70., 30., 30., 30., 70., 70. }; -#endif - float const maxr[nPairs] = { + CONSTANT_VAR float const maxr[nPairs] = { 20., 20., 20., 9., 7., 6., 5., 5., 9., 7., 6., 5., 5. }; + + constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant + + constexpr uint32_t MaxNumOfActiveDoublets = CAConstants::maxNumOfActiveDoublets(); + + + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; + + __global__ + void initDoublets(GPUCACell::OuterHitOfCell * isOuterHitOfCell, int nHits, + CellNeighborsVector * cellNeighbors, CellNeighbors * cellNeighborsContainer, + CellTracksVector * cellTracks, CellTracks * cellTracksContainer + ) + { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i=first; i +#include +#include +#include +#include + +#include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + +#include "GPUCACell.h" +#include "CAConstants.h" + + +// useful for benchmark +// #define ONLY_PHICUT +// #define NO_ZCUT +// #define NO_CLSCUT + +namespace gpuPixelDoubletsAlgos { + + constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant + + constexpr uint32_t MaxNumOfActiveDoublets = CAConstants::maxNumOfActiveDoublets(); + + + using CellNeighbors = CAConstants::CellNeighbors; + using CellTracks = CAConstants::CellTracks; + using CellNeighborsVector = CAConstants::CellNeighborsVector; + using CellTracksVector = CAConstants::CellTracksVector; + + __device__ + __forceinline__ + void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, + uint32_t nPairs, + GPUCACell * cells, + uint32_t * nCells, + CellNeighborsVector * cellNeighbors, CellTracksVector * cellTracks, + TrackingRecHit2DSOAView const & __restrict__ hh, + GPUCACell::OuterHitOfCell * isOuterHitOfCell, + int16_t const * __restrict__ phicuts, + float const * __restrict__ minz, + float const * __restrict__ maxz, + float const * __restrict__ maxr, + bool ideal_cond) + { + +#ifndef NO_CLSCUT + // ysize cuts (z in the barrel) times 8 + constexpr int minYsizeB1=36; + constexpr int minYsizeB2=28; + constexpr int maxDYsize12=28; + constexpr int maxDYsize=20; +#endif + + using Hist = TrackingRecHit2DSOAView::Hist; + + auto const & __restrict__ hist = hh.phiBinner(); + uint32_t const * __restrict__ offsets = hh.hitsLayerStart(); + assert(offsets); + + auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; + + // nPairsMax to be optimized later (originally was 64). + // If it should be much bigger, consider using a block-wide parallel prefix scan, + // e.g. 
see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html + const int nPairsMax = 16; + assert(nPairs <= nPairsMax); + __shared__ uint32_t innerLayerCumulativeSize[nPairsMax]; + __shared__ uint32_t ntot; + if (threadIdx.y==0 && threadIdx.x==0) { + innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); + for (uint32_t i = 1; i < nPairs; ++i) { + innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]); + } + ntot = innerLayerCumulativeSize[nPairs-1]; + } + __syncthreads(); + + // x runs faster + auto idy = blockIdx.y * blockDim.y + threadIdx.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) { + + uint32_t pairLayerId=0; + while (j >= innerLayerCumulativeSize[pairLayerId++]); + --pairLayerId; // move to lower_bound ?? + + assert(pairLayerId < nPairs); + assert(j < innerLayerCumulativeSize[pairLayerId]); + assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]); + + uint8_t inner = layerPairs[2*pairLayerId]; + uint8_t outer = layerPairs[2*pairLayerId+1]; + assert(outer > inner); + + auto hoff = Hist::histOff(outer); + + auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; + i += offsets[inner]; + + // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); + + assert(i >= offsets[inner]); + assert(i < offsets[inner+1]); + + // found hit corresponding to our cuda thread, now do the job + auto mez = hh.zGlobal(i); + +#ifndef NO_ZCUT + if (mezmaxz[pairLayerId]) continue; +#endif + +#ifndef NO_CLSCUT + auto mes = hh.clusterSizeY(i); + + // if ideal treat inner ladder as outer + auto mi = hh.detectorIndex(i); + if (inner==0) assert(mi<96); + const bool isOuterLadder = ideal_cond ? true : 0 == (mi/8)%2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... + + + if (inner==0 && outer>3 && isOuterLadder) // B1 and F1 + if (mes>0 && mes3) // B2 and F1 + if (mes>0 && mes (ro-ri)*(ro-ri); + }; + auto z0cutoff = [&](int j) { + auto zo = hh.zGlobal(j);; + auto ro = hh.rGlobal(j); + auto dr = ro-mer; + return dr > maxr[pairLayerId] || + dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; + }; + +#ifndef NO_CLSCUT + auto zsizeCut = [&](int j) { + auto onlyBarrel = outer<4; + auto so = hh.clusterSizeY(j); + auto dy = inner==0 ? ( isOuterLadder ? maxDYsize12: 100 ) : maxDYsize; + return onlyBarrel && mes>0 && so>0 && std::abs(so-mes)>dy; + }; +#endif + + auto iphicut = phicuts[pairLayerId]; + + auto kl = Hist::bin(int16_t(mep-iphicut)); + auto kh = Hist::bin(int16_t(mep+iphicut)); + auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; + +#ifdef GPU_DEBUG + int tot = 0; + int nmin = 0; + int tooMany=0; +#endif + + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { +#ifdef GPU_DEBUG + if (kk != kl && kk != kh) + nmin += hist.size(kk+hoff); +#endif + auto const * __restrict__ p = hist.begin(kk+hoff); + auto const * __restrict__ e = hist.end(kk+hoff); + p+=first; + for (;p < e; p+=stride) { + auto oi=__ldg(p); + assert(oi>=offsets[outer]); + assert(oi iphicut) + continue; +#ifndef ONLY_PHICUT +#ifndef NO_CLSCUT + if (zsizeCut(oi)) continue; +#endif + if (z0cutoff(oi) || ptcut(oi)) continue; +#endif + auto ind = atomicAdd(nCells, 1); + if (ind>=MaxNumOfDoublets) {atomicSub(nCells, 1); break; } // move to SimpleVector?? 
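The phi window above is scanned from bin kl to bin kh inclusive with a modular increment, so a window that straddles the phi = ±pi boundary wraps around the histogram. A tiny sketch of that wrap-around iteration, with a made-up bin count, is:

#include <cstdio>

constexpr int nbins = 8;  // hypothetical number of phi bins

int main() {
  auto incr = [](int& k) { return k = (k + 1) % nbins; };

  int kl = 6, kh = 1;  // a window crossing the wrap-around point
  int khh = kh;
  incr(khh);  // one past the last bin, as in the kernel
  for (int kk = kl; kk != khh; incr(kk))
    std::printf("visiting bin %d\n", kk);  // prints bins 6, 7, 0, 1
  return 0;
}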
+ // int layerPairId, int doubletId, int innerHitId, int outerHitId) + cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, ind, i, oi); + isOuterHitOfCell[oi].push_back(ind); +#ifdef GPU_DEBUG + if (isOuterHitOfCell[oi].full()) ++tooMany; + ++tot; +#endif + } + } +#ifdef GPU_DEBUG + if (tooMany > 0) + printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); +#endif + } // loop in block... + } + +} // namespace end + +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h From 548f5cf4615af66c6439b1057ed9cccac24b8503 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 14 May 2019 23:31:50 +0200 Subject: [PATCH 047/102] Clean up by clang-format (cms-patatrack#338) --- .../PixelTrackFitting/interface/BrokenLine.h | 629 ++++++------ .../PixelTrackFitting/interface/FitResult.h | 28 +- .../PixelTrackFitting/interface/FitUtils.h | 183 ++-- .../interface/PixelNtupletsFitter.h | 12 +- .../PixelTrackFitting/interface/RiemannFit.h | 970 +++++++++--------- .../plugins/PixelNtupletsFitterProducer.cc | 27 +- .../plugins/PixelTrackProducer.cc | 42 +- .../plugins/PixelTrackProducer.h | 16 +- .../src/PixelNtupletsFitter.cc | 65 +- .../test/PixelTrackRiemannFit.cc | 317 +++--- .../PixelTrackFitting/test/testEigenGPU.cu | 413 ++++---- .../PixelTrackFitting/test/testRiemannFit.cpp | 128 ++- .../plugins/BrokenLineFitOnGPU.cu | 346 ++++--- .../PixelTriplets/plugins/CAConstants.h | 68 +- .../PixelTriplets/plugins/GPUCACell.h | 236 ++--- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 19 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 97 +- .../PixelTriplets/plugins/RecHitsMap.h | 133 +-- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 416 ++++---- .../PixelTriplets/plugins/gpuFishbone.h | 102 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 154 +-- .../plugins/gpuPixelDoubletsAlgos.h | 182 ++-- 22 files changed, 2291 insertions(+), 2292 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h index 32894c8aa432a..01d59e7af2100 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h @@ -1,12 +1,12 @@ -#ifndef RECOPIXELVERTEXING_PIXELTRACKFITTING_BROKENLINE_H -#define RECOPIXELVERTEXING_PIXELTRACKFITTING_BROKENLINE_H - -#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h +#define RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h #include +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" + namespace BrokenLine { - + using namespace Rfit; //!< Karimäki's parameters: (phi, d, k=1/R) @@ -16,22 +16,21 @@ namespace BrokenLine { |cov(phi, k )|cov( d , k )|cov( k , k )| */ using karimaki_circle_fit = Rfit::circle_fit; - - + /*! \brief data needed for the Broken Line fit procedure. 
*/ - template + template struct PreparedBrokenLineData { - int q; //!< particle charge - Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin - VectorNd s; //!< total distance traveled in the transverse plane starting from the pre-fitted closest approach - VectorNd S; //!< total distance traveled (three-dimensional) - VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane - VectorNd VarBeta; //!< kink angles in the SZ plane + int q; //!< particle charge + Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + VectorNd s; //!< total distance traveled in the transverse plane starting from the pre-fitted closest approach + VectorNd S; //!< total distance traveled (three-dimensional) + VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane + VectorNd VarBeta; //!< kink angles in the SZ plane }; - - /*! + + /*! \brief Computes the Coulomb multiple scattering variance of the planar angle. \param length length of the track in the material. @@ -45,20 +44,22 @@ namespace BrokenLine { \return the variance of the planar angle ((theta_0)^2 /3). */ - __host__ __device__ inline double MultScatt(const double& length, const double B, const double R, int Layer, double slope) { + __host__ __device__ inline double MultScatt( + const double& length, const double B, const double R, int Layer, double slope) { // limit R to 20GeV... - auto pt2 = std::min(20.,B*R); - pt2 *=pt2; - constexpr double XXI_0 = 0.06/16.; //!< inverse of radiation length of the material in cm + auto pt2 = std::min(20., B * R); + pt2 *= pt2; + constexpr double XXI_0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm //if(Layer==1) XXI_0=0.06/16.; // else XXI_0=0.06/16.; //XX_0*=1; - constexpr double geometry_factor=0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned - constexpr double fact = geometry_factor*sqr(13.6/1000.); - return fact/(pt2*(1.+sqr(slope))) - *(std::abs(length)*XXI_0)*sqr(1.+0.038*log(std::abs(length)*XXI_0)); + constexpr double geometry_factor = + 0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned + constexpr double fact = geometry_factor * sqr(13.6 / 1000.); + return fact / (pt2 * (1. + sqr(slope))) * (std::abs(length) * XXI_0) * + sqr(1. + 0.038 * log(std::abs(length) * XXI_0)); } - + /*! \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0. @@ -68,13 +69,13 @@ namespace BrokenLine { */ __host__ __device__ inline Matrix2d RotationMatrix(double slope) { Matrix2d Rot; - Rot(0,0)=1./sqrt(1.+sqr(slope)); - Rot(0,1)=slope*Rot(0,0); - Rot(1,0)=-Rot(0,1); - Rot(1,1)=Rot(0,0); + Rot(0, 0) = 1. / sqrt(1. + sqr(slope)); + Rot(0, 1) = slope * Rot(0, 0); + Rot(1, 0) = -Rot(0, 1); + Rot(1, 1) = Rot(0, 0); return Rot; } - + /*! \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a translation of the coordinate system, such that the old origin has coordinates (x0,y0) in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. @@ -84,32 +85,30 @@ namespace BrokenLine { \param Jacob passed by reference in order to save stack. 
*/ __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, double x0, double y0, Matrix3d& Jacob) { - double A,U,BB,C,DO,DP,uu,xi,v,mu,lambda,zeta; - DP=x0*cos(circle.par(0))+y0*sin(circle.par(0)); - DO=x0*sin(circle.par(0))-y0*cos(circle.par(0))+circle.par(1); - uu=1+circle.par(2)*circle.par(1); - C=-circle.par(2)*y0+uu*cos(circle.par(0)); - BB=circle.par(2)*x0+uu*sin(circle.par(0)); - A=2.*DO+circle.par(2)*(sqr(DO)+sqr(DP)); - U=sqrt(1.+circle.par(2)*A); - xi=1./(sqr(BB)+sqr(C)); - v=1.+circle.par(2)*DO; - lambda=(0.5*A)/(U*sqr(1.+U)); - mu=1./(U*(1.+U))+circle.par(2)*lambda; - zeta=sqr(DO)+sqr(DP); - - Jacob << xi*uu*v, -xi*sqr(circle.par(2))*DP, xi*DP, - 2.*mu*uu*DP, 2.*mu*v, mu*zeta-lambda*A, - 0, 0, 1.; - - circle.par(0)=atan2(BB,C); - circle.par(1)=A/(1+U); + double A, U, BB, C, DO, DP, uu, xi, v, mu, lambda, zeta; + DP = x0 * cos(circle.par(0)) + y0 * sin(circle.par(0)); + DO = x0 * sin(circle.par(0)) - y0 * cos(circle.par(0)) + circle.par(1); + uu = 1 + circle.par(2) * circle.par(1); + C = -circle.par(2) * y0 + uu * cos(circle.par(0)); + BB = circle.par(2) * x0 + uu * sin(circle.par(0)); + A = 2. * DO + circle.par(2) * (sqr(DO) + sqr(DP)); + U = sqrt(1. + circle.par(2) * A); + xi = 1. / (sqr(BB) + sqr(C)); + v = 1. + circle.par(2) * DO; + lambda = (0.5 * A) / (U * sqr(1. + U)); + mu = 1. / (U * (1. + U)) + circle.par(2) * lambda; + zeta = sqr(DO) + sqr(DP); + + Jacob << xi * uu * v, -xi * sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, + mu * zeta - lambda * A, 0, 0, 1.; + + circle.par(0) = atan2(BB, C); + circle.par(1) = A / (1 + U); // circle.par(2)=circle.par(2); - - circle.cov=Jacob*circle.cov*Jacob.transpose(); + + circle.cov = Jacob * circle.cov * Jacob.transpose(); } - - + /*! \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. @@ -119,50 +118,51 @@ namespace BrokenLine { \param B magnetic field in Gev/cm/c. \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). */ - template + template __host__ __device__ inline void prepareBrokenLineData(const M3xN& hits, - const V4& fast_fit, - const double B, - PreparedBrokenLineData & results) { + const V4& fast_fit, + const double B, + PreparedBrokenLineData& results) { constexpr auto n = N; u_int i; Vector2d d; Vector2d e; - - d=hits.block(0,1,2,1)-hits.block(0,0,2,1); - e=hits.block(0,n-1,2,1)-hits.block(0,n-2,2,1); - results.q = cross2D(d,e)>0 ? -1 : 1; - - const double slope=-results.q/fast_fit(3); - - Matrix2d R=RotationMatrix(slope); - + + d = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); + e = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); + results.q = cross2D(d, e) > 0 ? 
-1 : 1; + + const double slope = -results.q / fast_fit(3); + + Matrix2d R = RotationMatrix(slope); + // calculate radii and s - results.radii=hits.block(0,0,2,n)-fast_fit.head(2)*MatrixXd::Constant(1,n,1); - e=-fast_fit(2)*fast_fit.head(2)/fast_fit.head(2).norm(); - for(i=0;i z=hits.block(2,0,1,n).transpose(); - + VectorNd z = hits.block(2, 0, 1, n).transpose(); + //calculate S and Z - Matrix2xNd pointsSZ=Matrix2xNd::Zero(); - for(i=0;i pointsSZ = Matrix2xNd::Zero(); + for (i = 0; i < n; i++) { + pointsSZ(0, i) = results.s(i); + pointsSZ(1, i) = z(i); + pointsSZ.block(0, i, 2, 1) = R * pointsSZ.block(0, i, 2, 1); } - results.S=pointsSZ.block(0,0,1,n).transpose(); - results.Z=pointsSZ.block(1,0,1,n).transpose(); - + results.S = pointsSZ.block(0, 0, 1, n).transpose(); + results.Z = pointsSZ.block(1, 0, 1, n).transpose(); + //calculate VarBeta - results.VarBeta(0)=results.VarBeta(n-1)=0; - for(i=1;i - __host__ __device__ inline MatrixNd MatrixC_u(const VectorNd& w, const VectorNd& S, const VectorNd& VarBeta) { - constexpr u_int n=N; + template + __host__ __device__ inline MatrixNd MatrixC_u(const VectorNd& w, + const VectorNd& S, + const VectorNd& VarBeta) { + constexpr u_int n = N; u_int i; - - MatrixNd C_U=MatrixNd::Zero(); - for(i=0;i1) C_U(i,i)+=1./(VarBeta(i-1)*sqr(S(i)-S(i-1))); - if(i>0 && i0 && i C_U = MatrixNd::Zero(); + for (i = 0; i < n; i++) { + C_U(i, i) = w(i); + if (i > 1) + C_U(i, i) += 1. / (VarBeta(i - 1) * sqr(S(i) - S(i - 1))); + if (i > 0 && i < n - 1) + C_U(i, i) += (1. / VarBeta(i)) * sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + if (i < n - 2) + C_U(i, i) += 1. / (VarBeta(i + 1) * sqr(S(i + 1) - S(i))); + + if (i > 0 && i < n - 1) + C_U(i, i + 1) = + 1. / (VarBeta(i) * (S(i + 1) - S(i))) * (-(S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + if (i < n - 2) + C_U(i, i + 1) += 1. / (VarBeta(i + 1) * (S(i + 1) - S(i))) * + (-(S(i + 2) - S(i)) / ((S(i + 2) - S(i + 1)) * (S(i + 1) - S(i)))); + + if (i < n - 2) + C_U(i, i + 2) = 1. / (VarBeta(i + 1) * (S(i + 2) - S(i + 1)) * (S(i + 1) - S(i))); + + C_U(i, i) *= 0.5; } - return C_U+C_U.transpose(); + return C_U + C_U.transpose(); } - + /*! \brief A very fast helix fit. @@ -203,34 +213,31 @@ namespace BrokenLine { \warning sign of theta is (intentionally, for now) mistaken for negative charges. 
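BL_Fast_fit, documented above, takes the circle through the first, middle and last hit, using a perpendicular construction for the center and the R = abc/(4A) identity for the radius. A self-contained Eigen sketch of the same construction, with arbitrary test points, is:

#include <cmath>
#include <cstdio>
#include <Eigen/Dense>

int main() {
  Eigen::Vector2d p1(0., 0.), p2(1., 1.), p3(2., 0.);  // made-up hit positions

  auto cross2D = [](const Eigen::Vector2d& a, const Eigen::Vector2d& b) {
    return a.x() * b.y() - a.y() * b.x();
  };

  Eigen::Vector2d a = p2 - p1;
  Eigen::Vector2d b = p3 - p2;
  Eigen::Vector2d c = p1 - p3;

  // circle center, as in the fast fit
  double tmp = 0.5 / cross2D(c, a);
  double x0 = p1.x() - (a.y() * c.squaredNorm() + c.y() * a.squaredNorm()) * tmp;
  double y0 = p1.y() + (a.x() * c.squaredNorm() + c.x() * a.squaredNorm()) * tmp;

  // radius from R = abc/(4A), with 2A = |cross2D(b, a)|
  double R = std::sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(cross2D(b, a)));

  std::printf("center=(%f,%f) R=%f\n", x0, y0, R);  // expect center (1, 0) and R = 1 here
  return 0;
}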
*/ - - template - __host__ __device__ inline void BL_Fast_fit(const M3xN& hits, V4 & result) - { - + + template + __host__ __device__ inline void BL_Fast_fit(const M3xN& hits, V4& result) { constexpr uint32_t N = M3xN::ColsAtCompileTime; - constexpr auto n = N; // get the number of hits - - const Vector2d a=hits.block(0,n/2,2,1)-hits.block(0,0,2,1); - const Vector2d b=hits.block(0,n-1,2,1)-hits.block(0,n/2,2,1); - const Vector2d c=hits.block(0,0,2,1)-hits.block(0,n-1,2,1); + constexpr auto n = N; // get the number of hits + + const Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); + const Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); - auto tmp = 0.5/cross2D(c,a); - result(0)=hits(0,0)-(a(1)*c.squaredNorm()+c(1)*a.squaredNorm())*tmp; - result(1)=hits(1,0)+(a(0)*c.squaredNorm()+c(0)*a.squaredNorm())*tmp; + auto tmp = 0.5 / cross2D(c, a); + result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; + result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; // check Wikipedia for these formulas - - result(2)=sqrt(a.squaredNorm()*b.squaredNorm()*c.squaredNorm())/(2.*std::abs(cross2D(b,a))); + + result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(cross2D(b, a))); // Using Math Olympiad's formula R=abc/(4A) - - const Vector2d d=hits.block(0,0,2,1)-result.head(2); - const Vector2d e=hits.block(0,n-1,2,1)-result.head(2); - - result(3)=result(2)*atan2(cross2D(d, e), d.dot(e))/(hits(2,n-1)-hits(2,0)); + + const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + + result(3) = result(2) * atan2(cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); // ds/dz slope between last and first point - } - + /*! \brief Performs the Broken Line fit in the curved track case (that is, the fit parameters are the interceptions u and the curvature correction \Delta\kappa). @@ -248,129 +255,134 @@ namespace BrokenLine { The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and \Delta\kappa and their covariance matrix. The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. */ - template + template __host__ __device__ inline void BL_Circle_fit(const M3xN& hits, - const M6xN & hits_ge, - const V4& fast_fit, - const double B, - PreparedBrokenLineData& data, - karimaki_circle_fit & circle_results - ) { - + const M6xN& hits_ge, + const V4& fast_fit, + const double B, + PreparedBrokenLineData& data, + karimaki_circle_fit& circle_results) { constexpr u_int n = N; u_int i; - - circle_results.q=data.q; - auto & radii=data.radii; - const auto & s=data.s; - const auto & S=data.S; - auto & Z=data.Z; - auto & VarBeta=data.VarBeta; - const double slope=-circle_results.q/fast_fit(3); - VarBeta*=1.+sqr(slope); // the kink angles are projected! 
- - for(i=0;i w; // weights - Matrix2d RR; // rotation matrix point by point + + Matrix2d V; // covariance matrix + VectorNd w; // weights + Matrix2d RR; // rotation matrix point by point //double Slope; // slope of the circle point by point - for(i=0;i r_u; - r_u(n)=0; - for(i=0;i C_U; - C_U.block(0,0,n,n)=MatrixC_u(w,s,VarBeta); - C_U(n,n) =0; + C_U.block(0, 0, n, n) = MatrixC_u(w, s, VarBeta); + C_U(n, n) = 0; //add the border to the C_u matrix - for(i=0;i0 && i 0 && i < n - 1) { + C_U(i, n) += + -(s(i + 1) - s(i - 1)) * (s(i + 1) - s(i - 1)) / (2. * VarBeta(i) * (s(i + 1) - s(i)) * (s(i) - s(i - 1))); } - if(i>1) { - C_U(i,n)+=(s(i)-s(i-2))/(2.*VarBeta(i-1)*(s(i)-s(i-1))); + if (i > 1) { + C_U(i, n) += (s(i) - s(i - 2)) / (2. * VarBeta(i - 1) * (s(i) - s(i - 1))); } - if(i0 && i 0 && i < n - 1) + C_U(n, n) += sqr(s(i + 1) - s(i - 1)) / (4. * VarBeta(i)); } #ifdef CPP_DUMP std::cout << "CU5\n" << C_U << std::endl; #endif MatrixNplusONEd I; - choleskyInversion::invert(C_U,I); + choleskyInversion::invert(C_U, I); // MatrixNplusONEd I = C_U.inverse(); #ifdef CPP_DUMP std::cout << "I5\n" << I << std::endl; #endif - - VectorNplusONEd u = I*r_u; // obtain the fitted parameters by solving the linear system - + VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system + // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... - - radii.block(0,0,2,1)/=radii.block(0,0,2,1).norm(); - radii.block(0,1,2,1)/=radii.block(0,1,2,1).norm(); - - Vector2d d=hits.block(0,0,2,1)+(-Z(0)+u(0))*radii.block(0,0,2,1); - Vector2d e=hits.block(0,1,2,1)+(-Z(1)+u(1))*radii.block(0,1,2,1); - - circle_results.par << atan2((e-d)(1),(e-d)(0)), - -circle_results.q*(fast_fit(2)-sqrt(sqr(fast_fit(2))- 0.25*(e-d).squaredNorm())), - circle_results.q*(1./fast_fit(2)+u(n)); - - assert(circle_results.q*circle_results.par(1)<=0); - - Vector2d eMinusd=e-d; - double tmp1=eMinusd.squaredNorm(); - + + radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); + radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); + + Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); + Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); + + circle_results.par << atan2((e - d)(1), (e - d)(0)), + -circle_results.q * (fast_fit(2) - sqrt(sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), + circle_results.q * (1. 
/ fast_fit(2) + u(n)); + + assert(circle_results.q * circle_results.par(1) <= 0); + + Vector2d eMinusd = e - d; + double tmp1 = eMinusd.squaredNorm(); + Matrix3d Jacob; - Jacob << (radii(1,0)*eMinusd(0)-eMinusd(1)*radii(0,0))/tmp1,(radii(1,1)*eMinusd(0)-eMinusd(1)*radii(0,1))/tmp1,0, - (circle_results.q/2)*(eMinusd(0)*radii(0,0)+eMinusd(1)*radii(1,0))/sqrt(sqr(2*fast_fit(2))-tmp1),(circle_results.q/2)*(eMinusd(0)*radii(0,1)+eMinusd(1)*radii(1,1))/sqrt(sqr(2*fast_fit(2))-tmp1),0, - 0,0,circle_results.q; - - circle_results.cov << I(0,0), I(0,1), I(0,n), - I(1,0), I(1,1), I(1,n), - I(n,0), I(n,1), I(n,n); - - circle_results.cov=Jacob*circle_results.cov*Jacob.transpose(); - + Jacob << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, + (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) / tmp1, 0, + (circle_results.q / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / + sqrt(sqr(2 * fast_fit(2)) - tmp1), + (circle_results.q / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / + sqrt(sqr(2 * fast_fit(2)) - tmp1), + 0, 0, 0, circle_results.q; + + circle_results.cov << I(0, 0), I(0, 1), I(0, n), I(1, 0), I(1, 1), I(1, n), I(n, 0), I(n, 1), I(n, n); + + circle_results.cov = Jacob * circle_results.cov * Jacob.transpose(); + //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... - - TranslateKarimaki(circle_results,0.5*(e-d)(0),0.5*(e-d)(1),Jacob); - circle_results.cov(0,0)+=(1+sqr(slope))*MultScatt(S(1)-S(0),B,fast_fit(2),2,slope); - + + TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), Jacob); + circle_results.cov(0, 0) += (1 + sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); + //...And translate back to the original system - - TranslateKarimaki(circle_results,d(0),d(1),Jacob); - + + TranslateKarimaki(circle_results, d(0), d(1), Jacob); + // compute chi2 - circle_results.chi2=0; - for(i=0;i0 && i 0 && i < n - 1) + circle_results.chi2 += + sqr(u(i - 1) / (s(i) - s(i - 1)) - u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + + u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / + VarBeta(i); } - + // assert(circle_results.chi2>=0); } - + /*! \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u). @@ -388,93 +400,97 @@ namespace BrokenLine { The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and their covariance matrix. The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. 
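Both broken-line fits above reduce to building a symmetric positive-definite matrix C_U and a right-hand side r_u, then obtaining the interceptions as u = C_U^{-1} r_u via a Cholesky inversion. A minimal Eigen sketch of that last step, with an arbitrary 4x4 SPD matrix standing in for the real C_U, is:

#include <cstdio>
#include <Eigen/Dense>

int main() {
  Eigen::Matrix4d C_U;
  C_U << 4, 1, 0, 0,
         1, 3, 1, 0,
         0, 1, 3, 1,
         0, 0, 1, 2;
  Eigen::Vector4d r_u(1., 2., 3., 4.);

  // LLT (Cholesky) solve plays the role of choleskyInversion::invert in the patch
  Eigen::Vector4d u = C_U.llt().solve(r_u);

  std::printf("u = %f %f %f %f\n", u(0), u(1), u(2), u(3));
  std::printf("residual = %g\n", (C_U * u - r_u).norm());  // ~0 if the solve succeeded
  return 0;
}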
*/ - template - __host__ __device__ inline void BL_Line_fit(const M6xN & hits_ge, - const V4& fast_fit, - const double B, - const PreparedBrokenLineData& data, - line_fit & line_results) { + template + __host__ __device__ inline void BL_Line_fit(const M6xN& hits_ge, + const V4& fast_fit, + const double B, + const PreparedBrokenLineData& data, + line_fit& line_results) { constexpr u_int n = N; u_int i; - - const auto & radii=data.radii; - const auto & S=data.S; - const auto & Z=data.Z; - const auto& VarBeta=data.VarBeta; - - const double slope=-data.q/fast_fit(3); - Matrix2d R=RotationMatrix(slope); - - Matrix3d V=Matrix3d::Zero(); // covariance matrix XYZ - Matrix2x3d JacobXYZtosZ=Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) - VectorNd w=VectorNd::Zero(); - for(i=0;i sz) + VectorNd w = VectorNd::Zero(); + for (i = 0; i < n; i++) { + V(0, 0) = hits_ge.col(i)[0]; // x errors + V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy + V(0, 2) = V(2, 0) = hits_ge.col(i)[3]; // cov_xz + V(1, 1) = hits_ge.col(i)[2]; // y errors + V(2, 1) = V(1, 2) = hits_ge.col(i)[4]; // cov_yz + V(2, 2) = hits_ge.col(i)[5]; // z errors + auto tmp = 1. / radii.block(0, i, 2, 1).norm(); + JacobXYZtosZ(0, 0) = radii(1, i) * tmp; + JacobXYZtosZ(0, 1) = -radii(0, i) * tmp; + JacobXYZtosZ(1, 2) = 1.; + w(i) = 1. / ((R * JacobXYZtosZ * V * JacobXYZtosZ.transpose() * R.transpose())( + 1, 1)); // compute the orthogonal weight point by point } - + VectorNd r_u; - for(i=0;i I; choleskyInversion::invert(MatrixC_u(w,S,VarBeta),I); + MatrixNd I; + choleskyInversion::invert(MatrixC_u(w, S, VarBeta), I); // MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); #ifdef CPP_DUMP std::cout << "I4\n" << I << std::endl; #endif - VectorNd u=I*r_u; // obtain the fitted parameters by solving the linear system - + VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system + // line parameters in the system in which the first hit is the origin and with axis along SZ - line_results.par << (u(1)-u(0))/(S(1)-S(0)), u(0); - auto idiff = 1./(S(1)-S(0)); - line_results.cov << (I(0,0)-2*I(0,1)+I(1,1))*sqr(idiff)+MultScatt(S(1)-S(0),B,fast_fit(2),2,slope), - (I(0,1)-I(0,0))*idiff, - (I(0,1)-I(0,0))*idiff, I(0,0); - + line_results.par << (u(1) - u(0)) / (S(1) - S(0)), u(0); + auto idiff = 1. / (S(1) - S(0)); + line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * sqr(idiff) + + MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope), + (I(0, 1) - I(0, 0)) * idiff, (I(0, 1) - I(0, 0)) * idiff, I(0, 0); + // translate to the original SZ system Matrix2d Jacob; - Jacob(0,0)=1.; - Jacob(0,1)=0; - Jacob(1,0)=-S(0); - Jacob(1,1)=1.; - line_results.par(1)+=-line_results.par(0)*S(0); - line_results.cov=Jacob*line_results.cov*Jacob.transpose(); - + Jacob(0, 0) = 1.; + Jacob(0, 1) = 0; + Jacob(1, 0) = -S(0); + Jacob(1, 1) = 1.; + line_results.par(1) += -line_results.par(0) * S(0); + line_results.cov = Jacob * line_results.cov * Jacob.transpose(); + // rotate to the original sz system - auto tmp=R(0,0)-line_results.par(0)*R(0,1); - Jacob(1,1)=1./tmp; - Jacob(0,0)=Jacob(1,1)*Jacob(1,1); - Jacob(0,1)=0; - Jacob(1,0)=line_results.par(1)*R(0,1)*Jacob(0,0); - line_results.par(1)=line_results.par(1)*Jacob(1,1); - line_results.par(0)=(R(0,1)+line_results.par(0)*R(0,0))*Jacob(1,1); - line_results.cov=Jacob*line_results.cov*Jacob.transpose(); - + auto tmp = R(0, 0) - line_results.par(0) * R(0, 1); + Jacob(1, 1) = 1. 
/ tmp; + Jacob(0, 0) = Jacob(1, 1) * Jacob(1, 1); + Jacob(0, 1) = 0; + Jacob(1, 0) = line_results.par(1) * R(0, 1) * Jacob(0, 0); + line_results.par(1) = line_results.par(1) * Jacob(1, 1); + line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * Jacob(1, 1); + line_results.cov = Jacob * line_results.cov * Jacob.transpose(); + // compute chi2 - line_results.chi2=0; - for(i=0;i0 && i 0 && i < n - 1) + line_results.chi2 += + sqr(u(i - 1) / (S(i) - S(i - 1)) - u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + + u(i + 1) / (S(i + 1) - S(i))) / + VarBeta(i); } - + // assert(line_results.chi2>=0); } - + /*! \brief Helix fit by three step: -fast pre-fit (see Fast_fit() for further info); \n @@ -509,40 +525,37 @@ namespace BrokenLine { \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. */ - template - inline helix_fit BL_Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B) -{ + template + inline helix_fit BL_Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B) { helix_fit helix; Vector4d fast_fit; - BL_Fast_fit(hits,fast_fit); + BL_Fast_fit(hits, fast_fit); PreparedBrokenLineData data; karimaki_circle_fit circle; line_fit line; Matrix3d Jacob; - - prepareBrokenLineData(hits,fast_fit,B,data); - BL_Line_fit(hits_ge,fast_fit,B,data,line); - BL_Circle_fit(hits,hits_ge,fast_fit,B,data,circle); - + + prepareBrokenLineData(hits, fast_fit, B, data); + BL_Line_fit(hits_ge, fast_fit, B, data, line); + BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix - Jacob << 1.,0,0, - 0,1.,0, - 0,0,-std::abs(circle.par(2))*B/(sqr(circle.par(2))*circle.par(2)); - circle.par(2)=B/std::abs(circle.par(2)); - circle.cov=Jacob*circle.cov*Jacob.transpose(); - + Jacob << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (sqr(circle.par(2)) * circle.par(2)); + circle.par(2) = B / std::abs(circle.par(2)); + circle.cov = Jacob * circle.cov * Jacob.transpose(); + helix.par << circle.par, line.par; - helix.cov=MatrixXd::Zero(5, 5); - helix.cov.block(0,0,3,3)=circle.cov; - helix.cov.block(3,3,2,2)=line.cov; - helix.q=circle.q; - helix.chi2_circle=circle.chi2; - helix.chi2_line=line.chi2; - + helix.cov = MatrixXd::Zero(5, 5); + helix.cov.block(0, 0, 3, 3) = circle.cov; + helix.cov.block(3, 3, 2, 2) = line.cov; + helix.q = circle.q; + helix.chi2_circle = circle.chi2; + helix.chi2_line = line.chi2; + return helix; } - + } // namespace BrokenLine -#endif +#endif // RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h index e6ab9f93ca306..b97dda4e65919 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h @@ -8,10 +8,7 @@ #include #include - -namespace Rfit -{ - +namespace Rfit { using Vector2d = Eigen::Vector2d; using Vector3d = Eigen::Vector3d; @@ -23,13 +20,10 @@ namespace Rfit using Matrix5d = Eigen::Matrix; using Matrix6d = Eigen::Matrix; + template + using Matrix3xNd = Eigen::Matrix; // used for inputs hits - template - using Matrix3xNd = Eigen::Matrix; // used for inputs hits - - - struct circle_fit - { + struct circle_fit { Vector3d par; //!< parameter: (X0,Y0,R) Matrix3d cov; /*!< covariance matrix: \n @@ -40,9 +34,8 @@ namespace Rfit int32_t q; //!< 
particle charge float chi2; }; - - struct line_fit - { + + struct line_fit { Vector2d par; //!<(cotan(theta),Zip) Matrix2d cov; /*!< @@ -51,9 +44,8 @@ namespace Rfit */ double chi2; }; - - struct helix_fit - { + + struct helix_fit { Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) Matrix5d cov; /*!< ()->cov() \n @@ -67,7 +59,7 @@ namespace Rfit float chi2_line; // Vector4d fast_fit; int32_t q; //!< particle charge - }; // __attribute__((aligned(16))); + }; // __attribute__((aligned(16))); -} // namespace RFit +} // namespace Rfit #endif diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h index 03ccb011645ec..7a78cf4cff47c 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h @@ -1,95 +1,75 @@ #ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h #define RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h - -#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" - -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" - #include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" +namespace Rfit { -namespace Rfit -{ - - - constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) - - + constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) using VectorXd = Eigen::VectorXd; using MatrixXd = Eigen::MatrixXd; - template + template using MatrixNd = Eigen::Matrix; - template - using MatrixNplusONEd = Eigen::Matrix; - template + template + using MatrixNplusONEd = Eigen::Matrix; + template using ArrayNd = Eigen::Array; - template + template using Matrix2Nd = Eigen::Matrix; - template + template using Matrix3Nd = Eigen::Matrix; - template + template using Matrix2xNd = Eigen::Matrix; - template + template using Array2xNd = Eigen::Array; - template + template using MatrixNx3d = Eigen::Matrix; - template + template using MatrixNx5d = Eigen::Matrix; - template + template using VectorNd = Eigen::Matrix; - template - using VectorNplusONEd = Eigen::Matrix; - template + template + using VectorNplusONEd = Eigen::Matrix; + template using Vector2Nd = Eigen::Matrix; - template + template using Vector3Nd = Eigen::Matrix; - template + template using RowVectorNd = Eigen::Matrix; - template + template using RowVector2Nd = Eigen::Matrix; - using Matrix2x3d = Eigen::Matrix; - using Matrix3f = Eigen::Matrix3f; using Vector3f = Eigen::Vector3f; using Vector4f = Eigen::Vector4f; using Vector6f = Eigen::Matrix; - - - using u_int = unsigned int; - - - template - __host__ __device__ void printIt(C* m, const char* prefix = "") - { + __host__ __device__ void printIt(C* m, const char* prefix = "") { #ifdef RFIT_DEBUG - for (u_int r = 0; r < m->rows(); ++r) - { - for (u_int c = 0; c < m->cols(); ++c) - { - printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r, c)); - } + for (u_int r = 0; r < m->rows(); ++r) { + for (u_int c = 0; c < m->cols(); ++c) { + printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r, c)); } + } #endif } - + /*! \brief raise to square. */ template - constexpr T sqr(const T a) - { + constexpr T sqr(const T a) { return a * a; } - + /*! \brief Compute cross product of two 2D vector (assuming z component 0), returning z component of the result. 
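The sign of cross2D is what decides the turning direction: prepareBrokenLineData assigns the charge hypothesis as q = cross2D(d, e) > 0 ? -1 : 1. A short Eigen sketch of that use, with arbitrary direction vectors, is:

#include <cstdio>
#include <Eigen/Dense>

int main() {
  auto cross2D = [](const Eigen::Vector2d& a, const Eigen::Vector2d& b) {
    return a.x() * b.y() - a.y() * b.x();
  };

  Eigen::Vector2d d(1., 0.);    // direction between the first two hits (made up)
  Eigen::Vector2d e(0.9, 0.1);  // direction between the last two hits (made up)

  int q = cross2D(d, e) > 0 ? -1 : 1;
  std::printf("cross2D = %f -> q = %d\n", cross2D(d, e), q);  // 0.1 > 0, so q = -1
  return 0;
}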
@@ -97,18 +77,17 @@ namespace Rfit \param b second 2D vector in the product. \return z component of the cross product. */ - - __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) - { + + __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) { return a.x() * b.y() - a.y() * b.x(); } - + /*! * load error in CMSSW format to our formalism * */ - template - __host__ __device__ void loadCovariance2D(M6xNf const & ge, M2Nd & hits_cov) { + template + __host__ __device__ void loadCovariance2D(M6xNf const& ge, M2Nd& hits_cov) { // Index numerology: // i: index of the hits/point (0,..,3) // j: index of space component (x,y,z) @@ -122,20 +101,25 @@ namespace Rfit // | 1 2 4 | // | 3 4 5 | constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; - for (uint32_t i=0; i< hits_in_fit; ++i) { - auto ge_idx = 0; auto j=0; auto l=0; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 2; j=1; l=1; + for (uint32_t i = 0; i < hits_in_fit; ++i) { + auto ge_idx = 0; + auto j = 0; + auto l = 0; hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 1; j=1; l=0; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = + ge_idx = 2; + j = 1; + l = 1; hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + ge_idx = 1; + j = 1; + l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; } } - - template - __host__ __device__ void loadCovariance(M6xNf const & ge, M3xNd & hits_cov) { - + + template + __host__ __device__ void loadCovariance(M6xNf const& ge, M3xNd& hits_cov) { // Index numerology: // i: index of the hits/point (0,..,3) // j: index of space component (x,y,z) @@ -149,22 +133,34 @@ namespace Rfit // | 1 2 4 | // | 3 4 5 | constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; - for (uint32_t i=0; i @@ -12,16 +12,16 @@ class PixelNtupletsFitter final : public PixelFitterBase { public: - explicit PixelNtupletsFitter(float nominalB, const MagneticField *field, bool useRiemannFit); + explicit PixelNtupletsFitter(float nominalB, const MagneticField* field, bool useRiemannFit); ~PixelNtupletsFitter() override = default; - std::unique_ptr run(const std::vector& hits, + std::unique_ptr run(const std::vector& hits, const TrackingRegion& region, const edm::EventSetup& setup) const override; private: float nominalB_; - const MagneticField *field_; + const MagneticField* field_; bool useRiemannFit_; }; -#endif // RecoPixelVertexing_PixelTrackFitting_PixelNtupletsFitter_H +#endif // RecoPixelVertexing_PixelTrackFitting_interface_PixelNtupletsFitter_h diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 656047aababf9..f69b425ef884a 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -3,11 +3,9 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" -namespace Rfit -{ +namespace Rfit { - -/*! Compute the Radiation length in the uniform hypothesis + /*! Compute the Radiation length in the uniform hypothesis * * The Pixel detector, barrel and forward, is considered as an omogeneous * cilinder of material, whose radiation lengths has been derived from the TDR @@ -29,22 +27,19 @@ namespace Rfit * \return incremental radiation lengths that correspond to each segment. 
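 *
 * As a worked example (the numbers here are illustrative only): with the uniform budget
 * quoted above, 0.06 radiation lengths over 16 cm, i.e. 0.06/16 per cm, four hits at
 * path lengths of roughly 3, 7, 11 and 16 cm give
 * rad_lengths = (3, 4, 4, 5) * 0.06/16 ~ (0.011, 0.015, 0.015, 0.019):
 * the first entry uses the full distance from the origin, each later one the distance
 * from the previous hit.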
*/ - - template -__host__ __device__ inline -void computeRadLenUniformMaterial(const VNd1 &length_values, - VNd2 & rad_lengths) { - // Radiation length of the pixel detector in the uniform assumption, with - // 0.06 rad_len at 16 cm - constexpr double XX_0_inv = 0.06/16.; - u_int n = length_values.rows(); - rad_lengths(0) = length_values(0)*XX_0_inv; - for (u_int j = 1; j < n; ++j) { - rad_lengths(j) = std::abs(length_values(j)-length_values(j-1))*XX_0_inv; + template + __host__ __device__ inline void computeRadLenUniformMaterial(const VNd1& length_values, VNd2& rad_lengths) { + // Radiation length of the pixel detector in the uniform assumption, with + // 0.06 rad_len at 16 cm + constexpr double XX_0_inv = 0.06 / 16.; + u_int n = length_values.rows(); + rad_lengths(0) = length_values(0) * XX_0_inv; + for (u_int j = 1; j < n; ++j) { + rad_lengths(j) = std::abs(length_values(j) - length_values(j - 1)) * XX_0_inv; + } } -} -/*! + /*! \brief Compute the covariance matrix along cartesian S-Z of points due to multiple Coulomb scattering to be used in the line_fit, for the barrel and forward cases. @@ -63,20 +58,19 @@ void computeRadLenUniformMaterial(const VNd1 &length_values, correspond to the case at eta = 0. */ - template -__host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, - const V4& fast_fit, - VNd1 const& s_arcs, - VNd2 const& z_values, - const double theta, - const double B, - MatrixNd& ret) -{ + template + __host__ __device__ inline auto Scatter_cov_line(Matrix2d const* cov_sz, + const V4& fast_fit, + VNd1 const& s_arcs, + VNd2 const& z_values, + const double theta, + const double B, + MatrixNd& ret) { #ifdef RFIT_DEBUG Rfit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); #endif constexpr u_int n = N; - double p_t = std::min(20.,fast_fit(2) * B); // limit pt to avoid too small error!!! + double p_t = std::min(20., fast_fit(2) * B); // limit pt to avoid too small error!!! double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); VectorNd rad_lengths_S; // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html @@ -92,16 +86,13 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, #endif Matrix2Nd tmp = Matrix2Nd::Zero(); for (u_int k = 0; k < n; ++k) { - tmp(k, k) = cov_sz[k](0, 0); - tmp(k + n, k + n) = cov_sz[k](1, 1); - tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1); - } - for (u_int k = 0; k < n; ++k) - { - for (u_int l = k; l < n; ++l) - { - for (u_int i = 0; i < std::min(k, l); ++i) - { + tmp(k, k) = cov_sz[k](0, 0); + tmp(k + n, k + n) = cov_sz[k](1, 1); + tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1); + } + for (u_int k = 0; k < n; ++k) { + for (u_int l = k; l < n; ++l) { + for (u_int i = 0; i < std::min(k, l); ++i) { tmp(k + n, l + n) += std::abs(S_values(k) - S_values(i)) * std::abs(S_values(l) - S_values(i)) * sig2_S(i); } tmp(l + n, k + n) = tmp(k + n, l + n); @@ -113,9 +104,9 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, Rfit::printIt(&tmp, "Scatter_cov_line - tmp: "); #endif ret = tmp.block(n, n, n, n); -} + } -/*! + /*! \brief Compute the covariance matrix (in radial coordinates) of points in the transverse plane due to multiple Coulomb scattering. \param p2D 2D points in the transverse plane. @@ -128,52 +119,47 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, \details Only the tangential component is computed (the radial one is negligible). 
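
  \note The variance assigned to each hit below is a Highland-like multiple scattering term,
  sigma_i^2 = 0.000225 / (p^2 sin^2(theta)) * (x_i/X_0) * (1 + 0.038 ln(x_i/X_0))^2,
  with 0.000225 = (0.015 GeV)^2; the off-diagonal element (k,l) then accumulates the
  contribution of every layer i crossed before both hits, weighted by (r_k - r_i)(r_l - r_i).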
*/ - template + template __host__ __device__ inline MatrixNd Scatter_cov_rad(const M2xN& p2D, - const V4& fast_fit, - VectorNd const& rad, - double B) -{ + const V4& fast_fit, + VectorNd const& rad, + double B) { constexpr u_int n = N; - double p_t = std::min(20.,fast_fit(2) * B); // limit pt to avoid too small error!!! + double p_t = std::min(20., fast_fit(2) * B); // limit pt to avoid too small error!!! double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); double theta = atan(fast_fit(3)); - theta = theta < 0. ? theta + M_PI : theta; + theta = theta < 0. ? theta + M_PI : theta; VectorNd s_values; VectorNd rad_lengths; const Vector2d o(fast_fit(0), fast_fit(1)); // associated Jacobian, used in weights and errors computation - for (u_int i = 0; i < n; ++i) - { // x - Vector2d p = p2D.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); - const double atan2_ = atan2(cross, dot); - s_values(i) = std::abs(atan2_ * fast_fit(2)); + for (u_int i = 0; i < n; ++i) { // x + Vector2d p = p2D.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + const double atan2_ = atan2(cross, dot); + s_values(i) = std::abs(atan2_ * fast_fit(2)); } - computeRadLenUniformMaterial(s_values*sqrt(1. + 1./(fast_fit(3)*fast_fit(3))), rad_lengths); + computeRadLenUniformMaterial(s_values * sqrt(1. + 1. / (fast_fit(3) * fast_fit(3))), rad_lengths); MatrixNd scatter_cov_rad = MatrixNd::Zero(); - VectorNd sig2 = (1. + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); - sig2 *= 0.000225 / ( p_2 * sqr(sin(theta)) ); - for (u_int k = 0; k < n; ++k) - { - for (u_int l = k; l < n; ++l) - { - for (u_int i = 0; i < std::min(k, l); ++i) - { - scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i); - } - scatter_cov_rad(l, k) = scatter_cov_rad(k, l); + VectorNd sig2 = (1. + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); + sig2 *= 0.000225 / (p_2 * sqr(sin(theta))); + for (u_int k = 0; k < n; ++k) { + for (u_int l = k; l < n; ++l) { + for (u_int i = 0; i < std::min(k, l); ++i) { + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i); } + scatter_cov_rad(l, k) = scatter_cov_rad(k, l); + } } #ifdef RFIT_DEBUG Rfit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); #endif return scatter_cov_rad; -} + } -/*! + /*! \brief Transform covariance matrix from radial (only tangential component) to Cartesian coordinates (only transverse plane component). \param p2D 2D points in the transverse plane. @@ -181,11 +167,10 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, \return cov_cart covariance matrix in Cartesian coordinates. 
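
  \note Writing the unit tangent at hit i as t_i = (-y_i/r_i, x_i/r_i), the loop below fills
  every 2x2 (i,j) block of the output with cov_rad(i,j) * t_i * t_j^T: only the tangential
  error is propagated to the Cartesian components.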
*/ - template + template __host__ __device__ inline Matrix2Nd cov_radtocart(const M2xN& p2D, - const MatrixNd& cov_rad, - const VectorNd& rad) -{ + const MatrixNd& cov_rad, + const VectorNd& rad) { #ifdef RFIT_DEBUG printf("Address of p2D: %p\n", &p2D); #endif @@ -194,24 +179,22 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, Matrix2Nd cov_cart = Matrix2Nd::Zero(); VectorNd rad_inv = rad.cwiseInverse(); printIt(&rad_inv, "cov_radtocart - rad_inv:"); - for (u_int i = 0; i < n; ++i) - { - for (u_int j = i; j < n; ++j) - { - cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); - cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); - cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); - cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); - cov_cart(j, i) = cov_cart(i, j); - cov_cart(j + n, i + n) = cov_cart(i + n, j + n); - cov_cart(j + n, i) = cov_cart(i, j + n); - cov_cart(j, i + n) = cov_cart(i + n, j); - } + for (u_int i = 0; i < n; ++i) { + for (u_int j = i; j < n; ++j) { + cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(j, i) = cov_cart(i, j); + cov_cart(j + n, i + n) = cov_cart(i + n, j + n); + cov_cart(j + n, i) = cov_cart(i, j + n); + cov_cart(j, i + n) = cov_cart(i + n, j); + } } return cov_cart; -} + } -/*! + /*! \brief Transform covariance matrix from Cartesian coordinates (only transverse plane component) to radial coordinates (both radial and tangential component but only diagonal terms, correlation between different @@ -221,28 +204,26 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, \return cov_rad covariance matrix in raidal coordinate. \warning correlation between different point are not computed. */ - template - __host__ __device__ inline VectorNd cov_carttorad(const M2xN& p2D, - const Matrix2Nd& cov_cart, - const VectorNd& rad) -{ + template + __host__ __device__ inline VectorNd cov_carttorad(const M2xN& p2D, + const Matrix2Nd& cov_cart, + const VectorNd& rad) { constexpr u_int n = N; VectorNd cov_rad; const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); - for (u_int i = 0; i < n; ++i) - { - //!< in case you have (0,0) to avoid dividing by 0 radius - if (rad(i) < 1.e-4) - cov_rad(i) = cov_cart(i, i); - else - { - cov_rad(i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); - } + for (u_int i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); + else { + cov_rad(i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - + 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); + } } return cov_rad; -} + } -/*! + /*! \brief Transform covariance matrix from Cartesian coordinates (only transverse plane component) to coordinates system orthogonal to the pre-fitted circle in each point. @@ -254,33 +235,32 @@ __host__ __device__ inline auto Scatter_cov_line(Matrix2d const * cov_sz, \return cov_rad covariance matrix in the pre-fitted circle's orthogonal system. 
*/ -template - __host__ __device__ inline VectorNd cov_carttorad_prefit(const M2xN& p2D, const Matrix2Nd& cov_cart, - V4& fast_fit, - const VectorNd& rad) -{ + template + __host__ __device__ inline VectorNd cov_carttorad_prefit(const M2xN& p2D, + const Matrix2Nd& cov_cart, + V4& fast_fit, + const VectorNd& rad) { constexpr u_int n = N; VectorNd cov_rad; - for (u_int i = 0; i < n; ++i) - { - //!< in case you have (0,0) to avoid dividing by 0 radius - if (rad(i) < 1.e-4) - cov_rad(i) = cov_cart(i, i); // TO FIX - else - { - Vector2d a = p2D.col(i); - Vector2d b = p2D.col(i) - fast_fit.head(2); - const double x2 = a.dot(b); - const double y2 = cross2D(a, b); - const double tan_c = -y2 / x2; - const double tan_c2 = sqr(tan_c); - cov_rad(i) = 1. / (1. + tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); - } + for (u_int i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); // TO FIX + else { + Vector2d a = p2D.col(i); + Vector2d b = p2D.col(i) - fast_fit.head(2); + const double x2 = a.dot(b); + const double y2 = cross2D(a, b); + const double tan_c = -y2 / x2; + const double tan_c2 = sqr(tan_c); + cov_rad(i) = + 1. / (1. + tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); + } } return cov_rad; -} + } -/*! + /*! \brief Compute the points' weights' vector for the circle fit when multiple scattering is managed. Further information in attached documentation. @@ -291,13 +271,12 @@ template diagonal cov matrix. Further investigation needed. */ - template - __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) -{ + template + __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { return cov_rad_inv.colwise().sum().transpose(); -} + } -/*! + /*! \brief Find particle q considering the sign of cross product between particles velocity (estimated by the first 2 hits) and the vector radius between the first hit and the center of the fitted circle. @@ -305,14 +284,15 @@ template \param par_uvr result of the circle fit in this form: (X0,Y0,R). \return q int 1 or -1. */ -template - __host__ __device__ inline int32_t Charge(const M2xN& p2D, const Vector3d& par_uvr) -{ - return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > 0)? -1 : 1; -} - + template + __host__ __device__ inline int32_t Charge(const M2xN& p2D, const Vector3d& par_uvr) { + return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > + 0) + ? -1 + : 1; + } -/*! + /*! \brief Compute the eigenvector associated to the minimum eigenvalue. \param A the Matrix you want to know eigenvector and eigenvalue. \param chi2 the double were the chi2-related quantity will be stored. @@ -327,8 +307,7 @@ template For this optimization the matrix type must be known at compiling time. */ -__host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) -{ + __host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { #ifdef RFIT_DEBUG printf("min_eigen3D - enter\n"); #endif @@ -340,9 +319,9 @@ __host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) printf("min_eigen3D - exit\n"); #endif return solver.eigenvectors().col(min_index); -} + } -/*! + /*! \brief A faster version of min_eigen3D() where double precision is not needed. 
\param A the Matrix you want to know eigenvector and eigenvalue. @@ -353,16 +332,15 @@ __host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) speed up in single precision. */ -__host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) -{ + __host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) { Eigen::SelfAdjointEigenSolver solver(3); solver.computeDirect(A.cast()); int min_index; solver.eigenvalues().minCoeff(&min_index); return solver.eigenvectors().col(min_index).cast(); -} + } -/*! + /*! \brief 2D version of min_eigen3D(). \param A the Matrix you want to know eigenvector and eigenvalue. \param chi2 the double were the chi2-related quantity will be stored @@ -372,16 +350,15 @@ __host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) significantly in single precision. */ -__host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) -{ + __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { Eigen::SelfAdjointEigenSolver solver(2); solver.computeDirect(A); int min_index; chi2 = solver.eigenvalues().minCoeff(&min_index); return solver.eigenvectors().col(min_index); -} + } -/*! + /*! \brief A very fast helix fit: it fits a circle by three points (first, middle and last point) and a line by two points (first and last). \param hits points to be fitted @@ -394,11 +371,10 @@ __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) - computation of error due to multiple scattering. */ -template -__host__ __device__ inline void Fast_fit(const M3xN& hits, V4 & result) -{ + template + __host__ __device__ inline void Fast_fit(const M3xN& hits, V4& result) { constexpr uint32_t N = M3xN::ColsAtCompileTime; - constexpr auto n = N; // get the number of hits + constexpr auto n = N; // get the number of hits printIt(&hits, "Fast_fit - hits: "); // CIRCLE FIT @@ -416,18 +392,18 @@ __host__ __device__ inline void Fast_fit(const M3xN& hits, V4 & result) // * build orthogonal lines through mid points // * make a system and solve for X0 and Y0. // * add the initial point - bool flip = abs(b.x()) < abs(b.y()); + bool flip = abs(b.x()) < abs(b.y()); auto bx = flip ? b.y() : b.x(); auto by = flip ? b.x() : b.y(); auto cx = flip ? c.y() : c.x(); auto cy = flip ? c.x() : c.y(); //!< in case b.x is 0 (2 hits with same x) - auto div = 2. * (cx * by - bx*cy); + auto div = 2. * (cx * by - bx * cy); // if aligned TO FIX - auto Y0 = (cx*b2 - bx*c2) / div; - auto X0 = (0.5*b2 - Y0*by) / bx; - result(0) = hits(0, 0) + ( flip ? Y0 : X0); - result(1) = hits(1, 0) + ( flip ? X0 : Y0); + auto Y0 = (cx * b2 - bx * c2) / div; + auto X0 = (0.5 * b2 - Y0 * by) / bx; + result(0) = hits(0, 0) + (flip ? Y0 : X0); + result(1) = hits(1, 0) + (flip ? X0 : Y0); result(2) = sqrt(sqr(X0) + sqr(Y0)); printIt(&result, "Fast_fit - result: "); @@ -446,9 +422,9 @@ __host__ __device__ inline void Fast_fit(const M3xN& hits, V4 & result) #ifdef RFIT_DEBUG printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); #endif -} + } -/*! + /*! \brief Fit a generic number of 2D points with a circle using Riemann-Chernov algorithm. Covariance matrix of fitted parameter is optionally computed. Multiple scattering (currently only in barrel layer) is optionally handled. @@ -475,14 +451,13 @@ __host__ __device__ inline void Fast_fit(const M3xN& hits, V4 & result) \bug further investigation needed for error propagation with multiple scattering. 
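
  \note In outline, the steps below are: map the hits onto the Riemann paraboloid by adding
  z = x^2 + y^2 as a third coordinate (after the centering and scaling applied in the code),
  find the best-fit plane n.p + c = 0 as the eigenvector of A = X * G * X^T with the smallest
  eigenvalue, with G the weight matrix built from the hit and multiple-scattering covariances,
  and map the plane parameters back to (X0,Y0,R), propagating their covariance when error is true.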
*/ -template -__host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, - const Matrix2Nd& hits_cov2D, - const V4& fast_fit, - const VectorNd& rad, - const double B, - const bool error) -{ + template + __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, + const Matrix2Nd& hits_cov2D, + const V4& fast_fit, + const VectorNd& rad, + const double B, + const bool error) { #ifdef RFIT_DEBUG printf("circle_fit - enter\n"); #endif @@ -500,22 +475,22 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, MatrixNd G; double renorm; { - MatrixNd cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad).asDiagonal(); - MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); - printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); - printIt(&hits2D, "circle_fit - hits2D bis:"); + MatrixNd cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad).asDiagonal(); + MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); + printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); + printIt(&hits2D, "circle_fit - hits2D bis:"); #ifdef RFIT_DEBUG - printf("Address of hits2D: a) %p\n", &hits2D); -#endif - V += cov_radtocart(hits2D, scatter_cov_rad, rad); - printIt(&V, "circle_fit - V:"); - cov_rad += scatter_cov_rad; - printIt(&cov_rad, "circle_fit - cov_rad:"); - choleskyInversion::invert(cov_rad,G); - // G = cov_rad.inverse(); - renorm = G.sum(); - G *= 1. / renorm; - weight = Weight_circle(G); + printf("Address of hits2D: a) %p\n", &hits2D); +#endif + V += cov_radtocart(hits2D, scatter_cov_rad, rad); + printIt(&V, "circle_fit - V:"); + cov_rad += scatter_cov_rad; + printIt(&cov_rad, "circle_fit - cov_rad:"); + choleskyInversion::invert(cov_rad, G); + // G = cov_rad.inverse(); + renorm = G.sum(); + G *= 1. / renorm; + weight = Weight_circle(G); } printIt(&weight, "circle_fit - weight:"); @@ -552,7 +527,8 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, // COST FUNCTION // compute - Vector3d r0; r0.noalias() = p3D * weight; // center of gravity + Vector3d r0; + r0.noalias() = p3D * weight; // center of gravity const Matrix3xNd X = p3D.colwise() - r0; Matrix3d A = X * G * X.transpose(); printIt(&A, "circle_fit - A:"); @@ -611,183 +587,174 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, printf("circle_fit - ERROR PROPAGATION\n"); #endif // ERROR PROPAGATION - if (error) - { + if (error) { #ifdef RFIT_DEBUG - printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); + printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); #endif - ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points - MatrixNd C[3][3]; // cov matrix of 3D transformed points + ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points + MatrixNd C[3][3]; // cov matrix of 3D transformed points #ifdef RFIT_DEBUG - printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); + printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); #endif - { + { + Eigen::Matrix cm; + Eigen::Matrix cm2; + cm = mc.transpose() * V * mc; + const double c = cm(0, 0); + Matrix2Nd Vcs; + Vcs.template triangularView() = + (sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * + (2. * V.squaredNorm() + 4. 
* c) * // mc.transpose() * V * mc) * + (mc * mc.transpose())); + + printIt(&Vcs, "circle_fit - Vcs:"); + C[0][0] = Vcs.block(0, 0, n, n).template selfadjointView(); + Vcs_[0][1] = Vcs.block(0, n, n, n); + C[1][1] = Vcs.block(n, n, n, n).template selfadjointView(); + Vcs_[1][0] = Vcs_[0][1].transpose(); + printIt(&Vcs, "circle_fit - Vcs:"); + } + + { + const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); + const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); + const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); + const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); + const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); + const ArrayNd t10 = t01.transpose(); + Vcs_[0][0] = C[0][0]; + ; + C[0][1] = Vcs_[0][1]; + C[0][2] = 2. * (Vcs_[0][0] * t0 + Vcs_[0][1] * t1); + Vcs_[1][1] = C[1][1]; + C[1][2] = 2. * (Vcs_[1][0] * t0 + Vcs_[1][1] * t1); + MatrixNd tmp; + tmp.template triangularView() = + (2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + + Vcs_[1][1] * Vcs_[1][1]) + + 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11)) + .matrix(); + C[2][2] = tmp.template selfadjointView(); + } + printIt(&C[0][0], "circle_fit - C[0][0]:"); + + Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) + for (u_int i = 0; i < 3; ++i) { + for (u_int j = i; j < 3; ++j) { + Eigen::Matrix tmp; + tmp = weight.transpose() * C[i][j] * weight; + const double c = tmp(0, 0); + C0(i, j) = c; //weight.transpose() * C[i][j] * weight; + C0(j, i) = C0(i, j); + } + } + printIt(&C0, "circle_fit - C0:"); + + const MatrixNd W = weight * weight.transpose(); + const MatrixNd H = MatrixNd::Identity().rowwise() - weight.transpose(); + const MatrixNx3d s_v = H * p3D.transpose(); + printIt(&W, "circle_fit - W:"); + printIt(&H, "circle_fit - H:"); + printIt(&s_v, "circle_fit - s_v:"); + + MatrixNd D_[3][3]; // cov(s_v) + { + D_[0][0] = (H * C[0][0] * H.transpose()).cwiseProduct(W); + D_[0][1] = (H * C[0][1] * H.transpose()).cwiseProduct(W); + D_[0][2] = (H * C[0][2] * H.transpose()).cwiseProduct(W); + D_[1][1] = (H * C[1][1] * H.transpose()).cwiseProduct(W); + D_[1][2] = (H * C[1][2] * H.transpose()).cwiseProduct(W); + D_[2][2] = (H * C[2][2] * H.transpose()).cwiseProduct(W); + D_[1][0] = D_[0][1].transpose(); + D_[2][0] = D_[0][2].transpose(); + D_[2][1] = D_[1][2].transpose(); + } + printIt(&D_[0][0], "circle_fit - D_[0][0]:"); + + constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; + + Matrix6d E; // cov matrix of the 6 independent elements of A + for (u_int a = 0; a < 6; ++a) { + const u_int i = nu[a][0], j = nu[a][1]; + for (u_int b = a; b < 6; ++b) { + const u_int k = nu[b][0], l = nu[b][1]; + VectorNd t0(n); + VectorNd t1(n); + if (l == k) { + t0 = 2. * D_[j][l] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = 2. * D_[i][l] * s_v.col(l); + } else { + t0 = D_[j][l] * s_v.col(k) + D_[j][k] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = D_[i][l] * s_v.col(k) + D_[i][k] * s_v.col(l); + } + + if (i == j) { Eigen::Matrix cm; - Eigen::Matrix cm2; - cm = mc.transpose() * V * mc; + cm = s_v.col(i).transpose() * (t0 + t1); const double c = cm(0, 0); - Matrix2Nd Vcs; Vcs. template triangularView() = (sqr(s) * V - + sqr(sqr(s)) * 1. / (4. * q * n) * - (2. * V.squaredNorm() + 4. * c) * // mc.transpose() * V * mc) * - (mc * mc.transpose())); - - printIt(&Vcs, "circle_fit - Vcs:"); - C[0][0] = Vcs.block(0, 0, n, n). 
template selfadjointView(); - Vcs_[0][1] = Vcs.block(0, n, n, n); - C[1][1] = Vcs.block(n, n, n, n). template selfadjointView(); - Vcs_[1][0] = Vcs_[0][1].transpose(); - printIt(&Vcs, "circle_fit - Vcs:"); + E(a, b) = 0. + c; + } else { + Eigen::Matrix cm; + cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + const double c = cm(0, 0); + E(a, b) = 0. + c; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + } + if (b != a) + E(b, a) = E(a, b); } + } + printIt(&E, "circle_fit - E:"); + + Eigen::Matrix J2; // Jacobian of min_eigen() (numerically computed) + for (u_int a = 0; a < 6; ++a) { + const u_int i = nu[a][0], j = nu[a][1]; + Matrix3d Delta = Matrix3d::Zero(); + Delta(i, j) = Delta(j, i) = abs(A(i, j) * d); + J2.col(a) = min_eigen3D_fast(A + Delta); + const int sign = (J2.col(a)(2) > 0) ? 1 : -1; + J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); + } + printIt(&J2, "circle_fit - J2:"); - { - const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); - const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); - const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); - const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); - const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); - const ArrayNd t10 = t01.transpose(); - Vcs_[0][0] = C[0][0];; - C[0][1] = Vcs_[0][1]; - C[0][2] = 2. * (Vcs_[0][0] * t0 + Vcs_[0][1] * t1); - Vcs_[1][1] = C[1][1]; - C[1][2] = 2. * (Vcs_[1][0] * t0 + Vcs_[1][1] * t1); - MatrixNd tmp; - tmp. template triangularView() - = ( 2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + - Vcs_[1][1] * Vcs_[1][1]) + - 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11) ).matrix(); - C[2][2] = tmp. template selfadjointView(); - } - printIt(&C[0][0], "circle_fit - C[0][0]:"); - - Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) - for (u_int i = 0; i < 3; ++i) - { - for (u_int j = i; j < 3; ++j) - { - Eigen::Matrix tmp; - tmp = weight.transpose() * C[i][j] * weight; - const double c = tmp(0, 0); - C0(i, j) = c; //weight.transpose() * C[i][j] * weight; - C0(j, i) = C0(i, j); - } - } - printIt(&C0, "circle_fit - C0:"); - - const MatrixNd W = weight * weight.transpose(); - const MatrixNd H = MatrixNd::Identity().rowwise() - weight.transpose(); - const MatrixNx3d s_v = H * p3D.transpose(); - printIt(&W, "circle_fit - W:"); - printIt(&H, "circle_fit - H:"); - printIt(&s_v, "circle_fit - s_v:"); - - MatrixNd D_[3][3]; // cov(s_v) - { - D_[0][0] = (H * C[0][0] * H.transpose()).cwiseProduct(W); - D_[0][1] = (H * C[0][1] * H.transpose()).cwiseProduct(W); - D_[0][2] = (H * C[0][2] * H.transpose()).cwiseProduct(W); - D_[1][1] = (H * C[1][1] * H.transpose()).cwiseProduct(W); - D_[1][2] = (H * C[1][2] * H.transpose()).cwiseProduct(W); - D_[2][2] = (H * C[2][2] * H.transpose()).cwiseProduct(W); - D_[1][0] = D_[0][1].transpose(); - D_[2][0] = D_[0][2].transpose(); - D_[2][1] = D_[1][2].transpose(); - } - printIt(&D_[0][0], "circle_fit - D_[0][0]:"); - - constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; - - Matrix6d E; // cov matrix of the 6 independent elements of A - for (u_int a = 0; a < 6; ++a) - { - const u_int i = nu[a][0], j = nu[a][1]; - for (u_int b = a; b < 6; ++b) - { - const u_int k = nu[b][0], l = nu[b][1]; - VectorNd t0(n); - VectorNd t1(n); - if (l == k) - { - t0 = 2. * D_[j][l] * s_v.col(l); - if (i == j) - t1 = t0; - else - t1 = 2. 
* D_[i][l] * s_v.col(l); - } - else - { - t0 = D_[j][l] * s_v.col(k) + D_[j][k] * s_v.col(l); - if (i == j) - t1 = t0; - else - t1 = D_[i][l] * s_v.col(k) + D_[i][k] * s_v.col(l); - } - - if (i == j) - { - Eigen::Matrix cm; - cm = s_v.col(i).transpose() * (t0 + t1); - const double c = cm(0, 0); - E(a, b) = 0. + c; - } - else - { - Eigen::Matrix cm; - cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); - const double c = cm(0, 0); - E(a, b) = 0. + c; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); - } - if (b != a) - E(b, a) = E(a, b); - } - } - printIt(&E, "circle_fit - E:"); - - Eigen::Matrix J2; // Jacobian of min_eigen() (numerically computed) - for (u_int a = 0; a < 6; ++a) - { - const u_int i = nu[a][0], j = nu[a][1]; - Matrix3d Delta = Matrix3d::Zero(); - Delta(i, j) = Delta(j, i) = abs(A(i, j) * d); - J2.col(a) = min_eigen3D_fast(A + Delta); - const int sign = (J2.col(a)(2) > 0) ? 1 : -1; - J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); - } - printIt(&J2, "circle_fit - J2:"); - - Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) - { - Matrix3d t0 = J2 * E * J2.transpose(); - Vector3d t1 = -t0 * r0; - Cvc.block(0, 0, 3, 3) = t0; - Cvc.block(0, 3, 3, 1) = t1; - Cvc.block(3, 0, 1, 3) = t1.transpose(); - Eigen::Matrix cm1; - Eigen::Matrix cm3; - cm1 = (v.transpose() * C0 * v); - // cm2 = (C0.cwiseProduct(t0)).sum(); - cm3 = (r0.transpose() * t0 * r0); - const double c = cm1(0, 0) + (C0.cwiseProduct(t0)).sum() + cm3(0, 0); - Cvc(3, 3) = c; - // (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); - } - printIt(&Cvc, "circle_fit - Cvc:"); + Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) + { + Matrix3d t0 = J2 * E * J2.transpose(); + Vector3d t1 = -t0 * r0; + Cvc.block(0, 0, 3, 3) = t0; + Cvc.block(0, 3, 3, 1) = t1; + Cvc.block(3, 0, 1, 3) = t1.transpose(); + Eigen::Matrix cm1; + Eigen::Matrix cm3; + cm1 = (v.transpose() * C0 * v); + // cm2 = (C0.cwiseProduct(t0)).sum(); + cm3 = (r0.transpose() * t0 * r0); + const double c = cm1(0, 0) + (C0.cwiseProduct(t0)).sum() + cm3(0, 0); + Cvc(3, 3) = c; + // (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + } + printIt(&Cvc, "circle_fit - Cvc:"); - Eigen::Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) - { - const double t = 1. / h; - J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, - v(0)*v2x2_inv*t, v(1)*v2x2_inv*t, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; - } - printIt(&J3, "circle_fit - J3:"); + Eigen::Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + { + const double t = 1. / h; + J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, + v(0) * v2x2_inv * t, v(1) * v2x2_inv * t, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; + } + printIt(&J3, "circle_fit - J3:"); - const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) - printIt(&Jq, "circle_fit - Jq:"); + const RowVector2Nd Jq = mc.transpose() * s * 1. 
/ n; // var(q) + printIt(&Jq, "circle_fit - Jq:"); - Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) - + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); + Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) + + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); - circle.cov = cov_uvr; + circle.cov = cov_uvr; } printIt(&circle.cov, "Circle cov:"); @@ -795,10 +762,9 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, printf("circle_fit - exit\n"); #endif return circle; -} - + } -/*! \brief Perform an ordinary least square fit in the s-z plane to compute + /*! \brief Perform an ordinary least square fit in the s-z plane to compute * the parameters cotTheta and Zip. * * The fit is performed in the rotated S3D-Z' plane, following the formalism of @@ -814,170 +780,162 @@ __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, * what is done in the same fit in the Broken Line approach. */ - template -__host__ __device__ -inline line_fit Line_fit(const M3xN& hits, - const M6xN & hits_ge, - const circle_fit& circle, - const V4& fast_fit, - const double B, - const bool error) { - - constexpr uint32_t N = M3xN::ColsAtCompileTime; - constexpr auto n = N; - double theta = -circle.q*atan(fast_fit(3)); - theta = theta < 0. ? theta + M_PI : theta; - - // Prepare the Rotation Matrix to rotate the points - Eigen::Matrix rot; - rot << sin(theta), cos(theta), -cos(theta), sin(theta); - - - // PROJECTION ON THE CILINDER - // - // p2D will be: - // [s1, s2, s3, ..., sn] - // [z1, z2, z3, ..., zn] - // s values will be ordinary x-values - // z values will be ordinary y-values - - Matrix2xNd p2D = Matrix2xNd::Zero(); - Eigen::Matrix Jx; + template + __host__ __device__ inline line_fit Line_fit(const M3xN& hits, + const M6xN& hits_ge, + const circle_fit& circle, + const V4& fast_fit, + const double B, + const bool error) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; + double theta = -circle.q * atan(fast_fit(3)); + theta = theta < 0. ? theta + M_PI : theta; + + // Prepare the Rotation Matrix to rotate the points + Eigen::Matrix rot; + rot << sin(theta), cos(theta), -cos(theta), sin(theta); + + // PROJECTION ON THE CILINDER + // + // p2D will be: + // [s1, s2, s3, ..., sn] + // [z1, z2, z3, ..., zn] + // s values will be ordinary x-values + // z values will be ordinary y-values + + Matrix2xNd p2D = Matrix2xNd::Zero(); + Eigen::Matrix Jx; #ifdef RFIT_DEBUG - printf("Line_fit - B: %g\n", B); - printIt(&hits, "Line_fit points: "); - printIt(&hits_ge, "Line_fit covs: "); - printIt(&rot, "Line_fit rot: "); -#endif - // x & associated Jacobian - // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf - // Slide 11 - // a ==> -o i.e. the origin of the circle in XY plane, negative - // b ==> p i.e. distances of the points wrt the origin of the circle. 
- const Vector2d o(circle.par(0), circle.par(1)); - - // associated Jacobian, used in weights and errors computation - Matrix6d Cov = Matrix6d::Zero(); - Matrix2d cov_sz[N]; - for (u_int i = 0; i < n; ++i) - { - Vector2d p = hits.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); - // atan2(cross, dot) give back the angle in the transverse plane so tha the - // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) - const double atan2_ = -circle.q * atan2(cross, dot); -// p2D.coeffRef(1, i) = atan2_ * circle.par(2); - p2D(0, i) = atan2_ * circle.par(2); - - // associated Jacobian, used in weights and errors- computation - const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); - double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta - if (error) - { - d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); - d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); - d_R = atan2_; + printf("Line_fit - B: %g\n", B); + printIt(&hits, "Line_fit points: "); + printIt(&hits_ge, "Line_fit covs: "); + printIt(&rot, "Line_fit rot: "); +#endif + // x & associated Jacobian + // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf + // Slide 11 + // a ==> -o i.e. the origin of the circle in XY plane, negative + // b ==> p i.e. distances of the points wrt the origin of the circle. + const Vector2d o(circle.par(0), circle.par(1)); + + // associated Jacobian, used in weights and errors computation + Matrix6d Cov = Matrix6d::Zero(); + Matrix2d cov_sz[N]; + for (u_int i = 0; i < n; ++i) { + Vector2d p = hits.block(0, i, 2, 1) - o; + const double cross = cross2D(-o, p); + const double dot = (-o).dot(p); + // atan2(cross, dot) give back the angle in the transverse plane so tha the + // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) + const double atan2_ = -circle.q * atan2(cross, dot); + // p2D.coeffRef(1, i) = atan2_ * circle.par(2); + p2D(0, i) = atan2_ * circle.par(2); + + // associated Jacobian, used in weights and errors- computation + const double temp0 = -circle.q * circle.par(2) * 1. 
/ (sqr(dot) + sqr(cross)); + double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta + if (error) { + d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); + d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); + d_R = atan2_; + } + const double d_x = temp0 * (o(1) * dot + o(0) * cross); + const double d_y = temp0 * (-o(0) * dot + o(1) * cross); + Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; + + Cov.block(0, 0, 3, 3) = circle.cov; + Cov(3, 3) = hits_ge.col(i)[0]; // x errors + Cov(4, 4) = hits_ge.col(i)[2]; // y errors + Cov(5, 5) = hits_ge.col(i)[5]; // z errors + Cov(3, 4) = Cov(4, 3) = hits_ge.col(i)[1]; // cov_xy + Cov(3, 5) = Cov(5, 3) = hits_ge.col(i)[3]; // cov_xz + Cov(4, 5) = Cov(5, 4) = hits_ge.col(i)[4]; // cov_yz + Matrix2d tmp = Jx * Cov * Jx.transpose(); + cov_sz[i].noalias() = rot * tmp * rot.transpose(); } - const double d_x = temp0 * (o(1) * dot + o(0) * cross); - const double d_y = temp0 * (-o(0) * dot + o(1) * cross); - Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; - - - - Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = hits_ge.col(i)[0]; // x errors - Cov(4, 4) = hits_ge.col(i)[2]; // y errors - Cov(5, 5) = hits_ge.col(i)[5]; // z errors - Cov(3, 4) = Cov(4, 3) = hits_ge.col(i)[1]; // cov_xy - Cov(3, 5) = Cov(5, 3) = hits_ge.col(i)[3]; // cov_xz - Cov(4, 5) = Cov(5, 4) = hits_ge.col(i)[4]; // cov_yz - Matrix2d tmp = Jx * Cov * Jx.transpose(); - cov_sz[i].noalias() = rot*tmp*rot.transpose(); - } - // Math of d_{X0,Y0,R,x,y} all verified by hand - p2D.row(1) = hits.row(2); + // Math of d_{X0,Y0,R,x,y} all verified by hand + p2D.row(1) = hits.row(2); - // The following matrix will contain errors orthogonal to the rotated S - // component only, with the Multiple Scattering properly treated!! - MatrixNd cov_with_ms; - Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B,cov_with_ms); + // The following matrix will contain errors orthogonal to the rotated S + // component only, with the Multiple Scattering properly treated!! + MatrixNd cov_with_ms; + Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B, cov_with_ms); #ifdef RFIT_DEBUG - printIt(cov_sz, "line_fit - cov_sz:"); - printIt(&cov_with_ms, "line_fit - cov_with_ms: "); + printIt(cov_sz, "line_fit - cov_sz:"); + printIt(&cov_with_ms, "line_fit - cov_with_ms: "); #endif - // Rotate Points with the shape [2, n] - Matrix2xNd p2D_rot = rot*p2D; + // Rotate Points with the shape [2, n] + Matrix2xNd p2D_rot = rot * p2D; #ifdef RFIT_DEBUG - printf("Fast fit Tan(theta): %g\n", fast_fit(3)); - printf("Rotation angle: %g\n", theta); - printIt(&rot, "Rotation Matrix:"); - printIt(&p2D, "Original Hits(s,z):"); - printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); - printIt(&rot, "Rotation Matrix:"); + printf("Fast fit Tan(theta): %g\n", fast_fit(3)); + printf("Rotation angle: %g\n", theta); + printIt(&rot, "Rotation Matrix:"); + printIt(&p2D, "Original Hits(s,z):"); + printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); + printIt(&rot, "Rotation Matrix:"); #endif - // Build the A Matrix - Matrix2xNd A; - A << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values + // Build the A Matrix + Matrix2xNd A; + A << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values #ifdef RFIT_DEBUG - printIt(&A, "A Matrix:"); + printIt(&A, "A Matrix:"); #endif - // Build A^T V-1 A, where V-1 is the covariance of only the Y components. 
- MatrixNd Vy_inv; choleskyInversion::invert(cov_with_ms,Vy_inv); - // MatrixNd Vy_inv = cov_with_ms.inverse(); - Eigen::Matrix Cov_params = A*Vy_inv*A.transpose(); - // Compute the Covariance Matrix of the fit parameters - choleskyInversion::invert(Cov_params,Cov_params); - - - // Now Compute the Parameters in the form [2,1] - // The first component is q. - // The second component is m. - Eigen::Matrix sol = Cov_params*A*Vy_inv*p2D_rot.row(1).transpose(); + // Build A^T V-1 A, where V-1 is the covariance of only the Y components. + MatrixNd Vy_inv; + choleskyInversion::invert(cov_with_ms, Vy_inv); + // MatrixNd Vy_inv = cov_with_ms.inverse(); + Eigen::Matrix Cov_params = A * Vy_inv * A.transpose(); + // Compute the Covariance Matrix of the fit parameters + choleskyInversion::invert(Cov_params, Cov_params); + // Now Compute the Parameters in the form [2,1] + // The first component is q. + // The second component is m. + Eigen::Matrix sol = Cov_params * A * Vy_inv * p2D_rot.row(1).transpose(); #ifdef RFIT_DEBUG - printIt(&sol, "Rotated solutions:"); + printIt(&sol, "Rotated solutions:"); #endif - // We need now to transfer back the results in the original s-z plane - auto common_factor = 1./(sin(theta)-sol(1,0)*cos(theta)); - Eigen::Matrix J; - J << 0., common_factor*common_factor, common_factor, sol(0,0)*cos(theta)*common_factor*common_factor; + // We need now to transfer back the results in the original s-z plane + auto common_factor = 1. / (sin(theta) - sol(1, 0) * cos(theta)); + Eigen::Matrix J; + J << 0., common_factor * common_factor, common_factor, sol(0, 0) * cos(theta) * common_factor * common_factor; - double m = common_factor*(sol(1,0)*sin(theta)+cos(theta)); - double q = common_factor*sol(0,0); - auto cov_mq = J * Cov_params * J.transpose(); + double m = common_factor * (sol(1, 0) * sin(theta) + cos(theta)); + double q = common_factor * sol(0, 0); + auto cov_mq = J * Cov_params * J.transpose(); - VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; - double chi2 = res.transpose()*Vy_inv*res; + VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; + double chi2 = res.transpose() * Vy_inv * res; - line_fit line; - line.par << m, q; - line.cov << cov_mq; - line.chi2 = chi2; + line_fit line; + line.par << m, q; + line.cov << cov_mq; + line.chi2 = chi2; #ifdef RFIT_DEBUG - printf("Common_factor: %g\n", common_factor); - printIt(&J, "Jacobian:"); - printIt(&sol, "Rotated solutions:"); - printIt(&Cov_params, "Cov_params:"); - printIt(&cov_mq, "Rotated Covariance Matrix:"); - printIt(&(line.par), "Real Parameters:"); - printIt(&(line.cov), "Real Covariance Matrix:"); - printf("Chi2: %g\n", chi2); + printf("Common_factor: %g\n", common_factor); + printIt(&J, "Jacobian:"); + printIt(&sol, "Rotated solutions:"); + printIt(&Cov_params, "Cov_params:"); + printIt(&cov_mq, "Rotated Covariance Matrix:"); + printIt(&(line.par), "Real Parameters:"); + printIt(&(line.cov), "Real Covariance Matrix:"); + printf("Chi2: %g\n", chi2); #endif - return line; -} + return line; + } -/*! + /*! \brief Helix fit by three step: -fast pre-fit (see Fast_fit() for further info); \n -circle fit of hits projected in the transverse plane by Riemann-Chernov @@ -1010,40 +968,38 @@ inline line_fit Line_fit(const M3xN& hits, \bug see Circle_fit(), Line_fit() and Fast_fit() bugs. 
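
  A minimal call sketch (the explicit template arguments, the numerical value of B and the
  example variable names are assumptions added here, chosen to match the quadruplet use
  elsewhere in this package):

  \code
  // one column per hit; rows are x, y, z
  Rfit::Matrix3xNd<4> hits;
  // packed per-hit covariance, column order (xx, xy, yy, xz, yz, zz) as in loadCovariance2D
  Eigen::Matrix<float, 6, 4> hits_ge = Eigen::Matrix<float, 6, 4>::Zero();
  // ... fill hits and hits_ge from the reconstructed hits ...
  const double B = 0.0114;  // ~3.8 T in the GeV/cm units expected here; in production it comes
                            // from 1 / PixelRecoUtilities::fieldInInvGev(...), cf. PixelNtupletsFitterProducer
  Rfit::helix_fit hf = Rfit::Helix_fit(hits, hits_ge, B, true);
  // hf.par = (phi, Tip, p_t, cot(theta), Zip); hf.cov is the 5x5 covariance matrix;
  // hf.q is the charge; hf.chi2_circle and hf.chi2_line are the two separate chi2 values.
  \endcode

  The BrokenLine::BL_Helix_fit variant (see BrokenLine.h) takes the same hits and hits_ge
  inputs, without the error flag.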
*/ -template -inline helix_fit Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B, - const bool error) -{ + template + inline helix_fit Helix_fit(const Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double B, + const bool error) { constexpr u_int n = N; VectorNd<4> rad = (hits.block(0, 0, 2, n).colwise().norm()); // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. - Vector4d fast_fit; - Fast_fit(hits,fast_fit); - Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); - Rfit::loadCovariance2D(hits_ge,hits_cov); - circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), - hits_cov, - fast_fit, rad, B, error); + Vector4d fast_fit; + Fast_fit(hits, fast_fit); + Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::loadCovariance2D(hits_ge, hits_cov); + circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), hits_cov, fast_fit, rad, B, error); line_fit line = Line_fit(hits, hits_ge, circle, fast_fit, B, error); par_uvrtopak(circle, B, error); helix_fit helix; helix.par << circle.par, line.par; - if (error) - { - helix.cov = MatrixXd::Zero(5, 5); - helix.cov.block(0, 0, 3, 3) = circle.cov; - helix.cov.block(3, 3, 2, 2) = line.cov; + if (error) { + helix.cov = MatrixXd::Zero(5, 5); + helix.cov.block(0, 0, 3, 3) = circle.cov; + helix.cov.block(3, 3, 2, 2) = line.cov; } helix.q = circle.q; helix.chi2_circle = circle.chi2; helix.chi2_line = line.chi2; return helix; -} + } } // namespace Rfit -#endif +#endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc index eeca145ab93e3..67ebe16e86840 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc @@ -1,27 +1,22 @@ -#include "FWCore/Framework/interface/Frameworkfwd.h" -#include "FWCore/Framework/interface/global/EDProducer.h" - +#include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/MakerMacros.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/ESHandle.h" - -#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" - -#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelFitter.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h" - #include "MagneticField/Engine/interface/MagneticField.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelFitter.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" -class PixelNtupletsFitterProducer: public edm::global::EDProducer<> { +class PixelNtupletsFitterProducer : public edm::global::EDProducer<> { public: explicit PixelNtupletsFitterProducer(const edm::ParameterSet& iConfig) - : useRiemannFit_(iConfig.getParameter("useRiemannFit")) - { + : 
useRiemannFit_(iConfig.getParameter("useRiemannFit")) { produces(); } ~PixelNtupletsFitterProducer() override {} @@ -37,13 +32,11 @@ class PixelNtupletsFitterProducer: public edm::global::EDProducer<> { void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; }; - void PixelNtupletsFitterProducer::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { edm::ESHandle fieldESH; iSetup.get().get(fieldESH); float bField = 1 / PixelRecoUtilities::fieldInInvGev(iSetup); - auto impl = std::make_unique(bField, - fieldESH.product(), useRiemannFit_); + auto impl = std::make_unique(bField, fieldESH.product(), useRiemannFit_); auto prod = std::make_unique(std::move(impl)); iEvent.put(std::move(prod)); } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc index 57c6d6ec0e806..91c3a44cc8643 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc @@ -1,58 +1,52 @@ -#include "storeTracks.h" -#include "PixelTrackProducer.h" +#include +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" - -#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" -#include "DataFormats/TrackReco/interface/Track.h" -#include "DataFormats/TrackReco/interface/TrackFwd.h" -#include "DataFormats/TrackReco/interface/TrackExtra.h" -#include "DataFormats/Common/interface/OrphanHandle.h" - -#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" #include "Geometry/Records/interface/TrackerTopologyRcd.h" -#include +#include "PixelTrackProducer.h" +#include "storeTracks.h" using namespace pixeltrackfitting; using edm::ParameterSet; -PixelTrackProducer::PixelTrackProducer(const ParameterSet& cfg) - : theReconstruction(cfg, consumesCollector()) -{ - edm::LogInfo("PixelTrackProducer")<<" construction..."; +PixelTrackProducer::PixelTrackProducer(const ParameterSet& cfg) : theReconstruction(cfg, consumesCollector()) { + edm::LogInfo("PixelTrackProducer") << " construction..."; produces(); produces(); produces(); } -PixelTrackProducer::~PixelTrackProducer() { } +PixelTrackProducer::~PixelTrackProducer() {} void PixelTrackProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("passLabel", "pixelTracks"); // What is this? It is not used anywhere in this code. + desc.add("passLabel", "pixelTracks"); // What is this? It is not used anywhere in this code. 
PixelTrackReconstruction::fillDescriptions(desc); descriptions.add("pixelTracks", desc); } -void PixelTrackProducer::produce(edm::Event& ev, const edm::EventSetup& es) -{ - LogDebug("PixelTrackProducer, produce")<<"event# :"< httopo; es.get().get(httopo); // store tracks storeTracks(ev, tracks, *httopo); } - diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h index 8852c884c7cc5..c38fd44c0d7f5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h @@ -1,14 +1,18 @@ -#ifndef PixelTrackProducer_h -#define PixelTrackProducer_h +#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackProducer_h +#define RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackProducer_h #include "FWCore/Framework/interface/stream/EDProducer.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackReconstruction.h" -namespace edm { class Event; class EventSetup; class ParameterSet; class ConfigurationDescriptions; } +namespace edm { + class Event; + class EventSetup; + class ParameterSet; + class ConfigurationDescriptions; +} // namespace edm class TrackerTopology; -class PixelTrackProducer : public edm::stream::EDProducer<> { - +class PixelTrackProducer : public edm::stream::EDProducer<> { public: explicit PixelTrackProducer(const edm::ParameterSet& conf); @@ -22,4 +26,4 @@ class PixelTrackProducer : public edm::stream::EDProducer<> { PixelTrackReconstruction theReconstruction; }; -#endif // PixelTrackProducer_h +#endif // RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackProducer_h diff --git a/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc index 92b8cac8f8fe9..32a9aeb982094 100644 --- a/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc +++ b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc @@ -1,51 +1,39 @@ -#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" - -#include "FWCore/ParameterSet/interface/ParameterSet.h" - -#include "FWCore/Framework/interface/EventSetup.h" - +#include "CommonTools/Utils/interface/DynArray.h" #include "DataFormats/GeometryCommonDetAlgo/interface/GlobalError.h" +#include "DataFormats/GeometryCommonDetAlgo/interface/Measurement1D.h" #include "DataFormats/GeometryVector/interface/GlobalPoint.h" #include "DataFormats/GeometryVector/interface/LocalPoint.h" - +#include "DataFormats/GeometryVector/interface/Pi.h" #include "DataFormats/TrackingRecHit/interface/TrackingRecHit.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" #include "Geometry/CommonDetUnit/interface/GeomDet.h" -#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" - -#include "DataFormats/GeometryCommonDetAlgo/interface/Measurement1D.h" - #include "Geometry/CommonDetUnit/interface/GeomDetType.h" - #include "MagneticField/Engine/interface/MagneticField.h" - -#include "FWCore/MessageLogger/interface/MessageLogger.h" - -#include "DataFormats/GeometryVector/interface/Pi.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" +#include 
"RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackBuilder.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackErrorParam.h" - -#include "CommonTools/Utils/interface/DynArray.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" using namespace std; +PixelNtupletsFitter::PixelNtupletsFitter(float nominalB, const MagneticField* field, bool useRiemannFit) + : nominalB_(nominalB), field_(field), useRiemannFit_(useRiemannFit) {} -PixelNtupletsFitter::PixelNtupletsFitter(float nominalB, const MagneticField* field, - bool useRiemannFit) - : nominalB_(nominalB), field_(field), - useRiemannFit_(useRiemannFit) {} - -std::unique_ptr PixelNtupletsFitter::run( - const std::vector& hits, const TrackingRegion& region, const edm::EventSetup& ) const { - +std::unique_ptr PixelNtupletsFitter::run(const std::vector& hits, + const TrackingRegion& region, + const edm::EventSetup&) const { using namespace Rfit; std::unique_ptr ret; unsigned int nhits = hits.size(); - if (nhits < 2) return ret; + if (nhits < 2) + return ret; declareDynArray(GlobalPoint, nhits, points); declareDynArray(GlobalError, nhits, errors); @@ -58,22 +46,20 @@ std::unique_ptr PixelNtupletsFitter::run( isBarrel[i] = recHit->detUnit()->type().isBarrel(); } - assert(nhits==4); - Rfit::Matrix3xNd<4> hits_gp; + assert(nhits == 4); + Rfit::Matrix3xNd<4> hits_gp; - Eigen::Matrix hits_ge = Eigen::Matrix::Zero(); + Eigen::Matrix hits_ge = Eigen::Matrix::Zero(); for (unsigned int i = 0; i < nhits; ++i) { hits_gp.col(i) << points[i].x(), points[i].y(), points[i].z(); - hits_ge.col(i) << errors[i].cxx(), errors[i].cyx(), errors[i].cyy(), - errors[i].czx(), errors[i].czy(), errors[i].czz(); + hits_ge.col(i) << errors[i].cxx(), errors[i].cyx(), errors[i].cyy(), errors[i].czx(), errors[i].czy(), + errors[i].czz(); } - - helix_fit fittedTrack = useRiemannFit_ ? - Rfit::Helix_fit(hits_gp, hits_ge, nominalB_, true) - : BrokenLine::BL_Helix_fit(hits_gp, hits_ge, nominalB_); + helix_fit fittedTrack = useRiemannFit_ ? 
Rfit::Helix_fit(hits_gp, hits_ge, nominalB_, true) + : BrokenLine::BL_Helix_fit(hits_gp, hits_ge, nominalB_); int iCharge = fittedTrack.q; @@ -111,7 +97,6 @@ std::unique_ptr PixelNtupletsFitter::run( Measurement1D cotTheta(valCotTheta, errValCotTheta); Measurement1D zip(valZip, errValZip); - ret.reset( - builder.build(pt, phi, cotTheta, tip, zip, chi2, iCharge, hits, field_, region.origin())); + ret.reset(builder.build(pt, phi, cotTheta, tip, zip, chi2, iCharge, hits, field_, region.origin())); return ret; } diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc index f19618e23d252..5395b93629f49 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc @@ -1,11 +1,11 @@ #define _USE_MATH_DEFINES +#include #include #include #include +#include #include -#include // unique_ptr -#include #include #include @@ -22,16 +22,16 @@ using namespace Rfit; using std::unique_ptr; namespace Rfit { -using Vector3i = Eigen::Matrix; -using Vector4i = Eigen::Matrix; -using Vector6d = Eigen::Matrix; -using Vector8d = Eigen::Matrix; + using Vector3i = Eigen::Matrix; + using Vector4i = Eigen::Matrix; + using Vector6d = Eigen::Matrix; + using Vector8d = Eigen::Matrix; }; // namespace Rfit // quadruplets... struct hits_gen { Matrix3xNd<4> hits; - Eigen::Matrix hits_ge; + Eigen::Matrix hits_ge; Vector5d true_par; }; @@ -75,19 +75,18 @@ void smearing(const Vector5d& err, const bool& isbarrel, double& x, double& y, d } } -template -void Hits_cov(Eigen::Matrix & V, const unsigned int& i, const unsigned int& n, const Matrix3xNd& hits, - const Vector5d& err, bool isbarrel) { +template +void Hits_cov(Eigen::Matrix& V, + const unsigned int& i, + const unsigned int& n, + const Matrix3xNd& hits, + const Vector5d& err, + bool isbarrel) { if (isbarrel) { double R2 = Rfit::sqr(hits(0, i)) + Rfit::sqr(hits(1, i)); - V.col(i)[0] = - (Rfit::sqr(err[1]) * Rfit::sqr(hits(1, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(0, i))) / - R2; - V.col(i)[2] = - (Rfit::sqr(err[1]) * Rfit::sqr(hits(0, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(1, i))) / - R2; - V.col(i)[1] = - (Rfit::sqr(err[0]) - Rfit::sqr(err[1])) * hits(1, i) * hits(0, i) / R2; + V.col(i)[0] = (Rfit::sqr(err[1]) * Rfit::sqr(hits(1, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(0, i))) / R2; + V.col(i)[2] = (Rfit::sqr(err[1]) * Rfit::sqr(hits(0, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(1, i))) / R2; + V.col(i)[1] = (Rfit::sqr(err[0]) - Rfit::sqr(err[1])) * hits(1, i) * hits(0, i) / R2; V.col(i)[5] = Rfit::sqr(err[2]); } else { V.col(i)[0] = Rfit::sqr(err[3]); @@ -99,19 +98,19 @@ void Hits_cov(Eigen::Matrix & V, const unsigned int& i, const unsigne hits_gen Hits_gen(const unsigned int& n, const Matrix& gen_par) { hits_gen gen; gen.hits = MatrixXd::Zero(3, n); - gen.hits_ge = Eigen::Matrix::Zero(); + gen.hits_ge = Eigen::Matrix::Zero(); // err /= 10000.; constexpr double rad[8] = {2.95, 6.8, 10.9, 16., 3.1, 7., 11., 16.2}; // constexpr double R_err[8] = {5./10000, 5./10000, 5./10000, 5./10000, 5./10000, // 5./10000, 5./10000, 5./10000}; constexpr double Rp_err[8] = {35./10000, 18./10000, // 15./10000, 34./10000, 35./10000, 18./10000, 15./10000, 34./10000}; constexpr double z_err[8] = // {72./10000, 38./10000, 25./10000, 56./10000, 72./10000, 38./10000, 25./10000, 56./10000}; - constexpr double R_err[8] = {10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, - 10. / 10000, 10. / 10000, 10. / 10000, 10. 
/ 10000}; - constexpr double Rp_err[8] = {35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000, - 35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000}; - constexpr double z_err[8] = {72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000, - 72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000}; + constexpr double R_err[8] = { + 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000}; + constexpr double Rp_err[8] = { + 35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000, 35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000}; + constexpr double z_err[8] = { + 72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000, 72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000}; const double x2 = gen_par(0) + gen_par(4) * cos(gen_par(3) * pi / 180); const double y2 = gen_par(1) + gen_par(4) * sin(gen_par(3) * pi / 180); const double alpha = atan2(y2, x2); @@ -124,11 +123,12 @@ hits_gen Hits_gen(const unsigned int& n, const Matrix& gen_par) { const double gamma = alpha + beta; gen.hits(0, i) = rad[i] * cos(gamma); gen.hits(1, i) = rad[i] * sin(gamma); - gen.hits(2, i) = gen_par(2) + 1 / tan(gen_par(5) * pi / 180) * 2. * - asin(sqrt(Rfit::sqr((gen_par(0) - gen.hits(0, i))) + - Rfit::sqr((gen_par(1) - gen.hits(1, i)))) / - (2. * gen_par(4))) * - gen_par(4); + gen.hits(2, i) = + gen_par(2) + + 1 / tan(gen_par(5) * pi / 180) * 2. * + asin(sqrt(Rfit::sqr((gen_par(0) - gen.hits(0, i))) + Rfit::sqr((gen_par(1) - gen.hits(1, i)))) / + (2. * gen_par(4))) * + gen_par(4); // isbarrel(i) = ?? Vector5d err; err << R_err[i], Rp_err[i], z_err[i], 0, 0; @@ -154,29 +154,26 @@ Vector5d True_par(const Matrix& gen_par, const int& charge, const 0) ? -1 : 1; - true_par(4) = gen_par(2) + - 1 / tan(gen_par(5) * pi / 180) * dir * 2.f * - asin(sqrt(Rfit::sqr((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1))) + - Rfit::sqr((gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)))) / - (2.f * gen_par(4))) * - gen_par(4); + true_par(4) = gen_par(2) + 1 / tan(gen_par(5) * pi / 180) * dir * 2.f * + asin(sqrt(Rfit::sqr((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1))) + + Rfit::sqr((gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)))) / + (2.f * gen_par(4))) * + gen_par(4); return true_par; } -Matrix New_par(const Matrix& gen_par, const int& charge, - const double& B_field) { +Matrix New_par(const Matrix& gen_par, const int& charge, const double& B_field) { Matrix new_par; new_par.block(0, 0, 3, 1) = gen_par.block(0, 0, 3, 1); new_par(3) = gen_par(3) - charge * 90; new_par(4) = gen_par(4) / B_field; -// new_par(5) = atan(sinh(gen_par(5))) * 180 / pi; - new_par(5) = 2.*atan(exp(-gen_par(5))) * 180 / pi; + // new_par(5) = atan(sinh(gen_par(5))) * 180 / pi; + new_par(5) = 2. 
* atan(exp(-gen_par(5))) * 180 / pi; return new_par; } -template -void computePull(std::array & fit, const char * label, - int n_, int iteration, const Vector5d & true_par) { +template +void computePull(std::array& fit, const char* label, int n_, int iteration, const Vector5d& true_par) { Eigen::Matrix score(41, iteration); std::string histo_name("Phi Pull"); @@ -226,50 +223,45 @@ void computePull(std::array & fit, const char * label, pt_error.Fill(sqrt(fit[x].cov(2, 2))); theta_error.Fill(sqrt(fit[x].cov(3, 3))); dz_error.Fill(sqrt(fit[x].cov(4, 4))); - score(5, x) = - (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / (fit[x].cov(0, 1)); - score(6, x) = - (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(0, 2)); - score(7, x) = - (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(1, 2)); - score(8, x) = - (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / (fit[x].cov(3, 4)); + score(5, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / (fit[x].cov(0, 1)); + score(6, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(0, 2)); + score(7, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(1, 2)); + score(8, x) = (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / (fit[x].cov(3, 4)); score(9, x) = fit[x].chi2_circle; score(25, x) = fit[x].chi2_line; score(10, x) = sqrt(fit[x].cov(0, 0)) / fit[x].par(0) * 100; score(13, x) = sqrt(fit[x].cov(3, 3)) / fit[x].par(3) * 100; score(14, x) = sqrt(fit[x].cov(4, 4)) / fit[x].par(4) * 100; - score(15, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(3) - true_par(3)) / - sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(3, 3)); - score(16, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(3) - true_par(3)) / - sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(3, 3)); - score(17, x) = (fit[x].par(2) - true_par(2)) * (fit[x].par(3) - true_par(3)) / - sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(3, 3)); - score(18, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(4) - true_par(4)) / - sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(4, 4)); - score(19, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(4) - true_par(4)) / - sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(4, 4)); - score(20, x) = (fit[x].par(2) - true_par(2)) * (fit[x].par(4) - true_par(4)) / - sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(4, 4)); - score(21, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / - sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(1, 1)); - score(22, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / - sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(2, 2)); - score(23, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / - sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(2, 2)); - score(24, x) = (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / - sqrt(fit[x].cov(3, 3)) / sqrt(fit[x].cov(4, 4)); + score(15, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(3, 3)); + score(16, x) = + (fit[x].par(1) - true_par(1)) * (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(3, 3)); + score(17, x) = + (fit[x].par(2) - true_par(2)) * (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(3, 3)); + score(18, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(4, 4)); + score(19, x) = + (fit[x].par(1) - true_par(1)) * (fit[x].par(4) - true_par(4)) / 
sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(4, 4)); + score(20, x) = + (fit[x].par(2) - true_par(2)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(4, 4)); + score(21, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(1, 1)); + score(22, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(2, 2)); + score(23, x) = + (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(2, 2)); + score(24, x) = + (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(3, 3)) / sqrt(fit[x].cov(4, 4)); score(30, x) = fit[x].par(0); score(31, x) = fit[x].par(1); score(32, x) = fit[x].par(2); score(33, x) = fit[x].par(3); score(34, x) = fit[x].par(4); - score(35, x) = sqrt(fit[x].cov(0,0)); - score(36, x) = sqrt(fit[x].cov(1,1)); - score(37, x) = sqrt(fit[x].cov(2,2)); - score(38, x) = sqrt(fit[x].cov(3,3)); - score(39, x) = sqrt(fit[x].cov(4,4)); - + score(35, x) = sqrt(fit[x].cov(0, 0)); + score(36, x) = sqrt(fit[x].cov(1, 1)); + score(37, x) = sqrt(fit[x].cov(2, 2)); + score(38, x) = sqrt(fit[x].cov(3, 3)); + score(39, x) = sqrt(fit[x].cov(4, 4)); } double phi_ = score.row(0).mean(); @@ -278,45 +270,45 @@ void computePull(std::array & fit, const char * label, double coT_ = score.row(3).mean(); double Zip_ = score.row(4).mean(); std::cout << std::setprecision(5) << std::scientific << label << " AVERAGE FITTED VALUES: \n" - << "phi: " << score.row(30).mean() << " +/- " << score.row(35).mean() << " [+/-] " << sqrt(score.row(35).array().abs2().mean() - score.row(35).mean()*score.row(35).mean()) << std::endl - << "d0: " << score.row(31).mean() << " +/- " << score.row(36).mean() << " [+/-] " << sqrt(score.row(36).array().abs2().mean() - score.row(36).mean()*score.row(36).mean()) << std::endl - << "pt: " << score.row(32).mean() << " +/- " << score.row(37).mean() << " [+/-] " << sqrt(score.row(37).array().abs2().mean() - score.row(37).mean()*score.row(37).mean()) << std::endl - << "coT: " << score.row(33).mean() << " +/- " << score.row(38).mean() << " [+/-] " << sqrt(score.row(38).array().abs2().mean() - score.row(38).mean()*score.row(38).mean()) << std::endl - << "Zip: " << score.row(34).mean() << " +/- " << score.row(39).mean() << " [+/-] " << sqrt(score.row(39).array().abs2().mean() - score.row(39).mean()*score.row(39).mean()) << std::endl; + << "phi: " << score.row(30).mean() << " +/- " << score.row(35).mean() << " [+/-] " + << sqrt(score.row(35).array().abs2().mean() - score.row(35).mean() * score.row(35).mean()) << std::endl + << "d0: " << score.row(31).mean() << " +/- " << score.row(36).mean() << " [+/-] " + << sqrt(score.row(36).array().abs2().mean() - score.row(36).mean() * score.row(36).mean()) << std::endl + << "pt: " << score.row(32).mean() << " +/- " << score.row(37).mean() << " [+/-] " + << sqrt(score.row(37).array().abs2().mean() - score.row(37).mean() * score.row(37).mean()) << std::endl + << "coT: " << score.row(33).mean() << " +/- " << score.row(38).mean() << " [+/-] " + << sqrt(score.row(38).array().abs2().mean() - score.row(38).mean() * score.row(38).mean()) << std::endl + << "Zip: " << score.row(34).mean() << " +/- " << score.row(39).mean() << " [+/-] " + << sqrt(score.row(39).array().abs2().mean() - score.row(39).mean() * score.row(39).mean()) << std::endl; Matrix5d correlation; - correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), - 
score.row(20).mean(), score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), - score.row(19).mean(), score.row(22).mean(), score.row(23).mean(), 1., score.row(17).mean(), - score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), score.row(17).mean(), 1., - score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), - score.row(24).mean(), 1.; + correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), score.row(20).mean(), + score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), score.row(19).mean(), score.row(22).mean(), + score.row(23).mean(), 1., score.row(17).mean(), score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), + score.row(17).mean(), 1., score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), + score.row(24).mean(), 1.; - cout << "\n" << label << " PULLS (mean, sigma, relative_error):\n" - << "phi: " << phi_ << " " - << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " - << abs(score.row(10).mean()) << "%\n" - << "a0 : " << a_ << " " - << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " - << abs(score.row(11).mean()) << "%\n" - << "pt : " << pt_ << " " - << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " - << abs(score.row(12).mean()) << "%\n" - << "coT: " << coT_ << " " - << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " - << abs(score.row(13).mean()) << "%\n" - << "Zip: " << Zip_ << " " - << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << " " - << abs(score.row(14).mean()) << "%\n\n" - << "cov(phi,a0)_: " << score.row(5).mean() << "\n" - << "cov(phi,pt)_: " << score.row(6).mean() << "\n" - << "cov(a0,pt)_: " << score.row(7).mean() << "\n" - << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" - << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" - << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" - << "correlation matrix:\n" - << correlation << "\n\n" - << endl; + cout << "\n" + << label << " PULLS (mean, sigma, relative_error):\n" + << "phi: " << phi_ << " " << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " + << abs(score.row(10).mean()) << "%\n" + << "a0 : " << a_ << " " << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " + << abs(score.row(11).mean()) << "%\n" + << "pt : " << pt_ << " " << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " + << abs(score.row(12).mean()) << "%\n" + << "coT: " << coT_ << " " << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " + << abs(score.row(13).mean()) << "%\n" + << "Zip: " << Zip_ << " " << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << " " + << abs(score.row(14).mean()) << "%\n\n" + << "cov(phi,a0)_: " << score.row(5).mean() << "\n" + << "cov(phi,pt)_: " << score.row(6).mean() << "\n" + << "cov(a0,pt)_: " << score.row(7).mean() << "\n" + << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" + << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" + << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" + << "correlation matrix:\n" + << correlation << "\n\n" + << endl; phi_pull.Fit("gaus", "Q"); dxy_pull.Fit("gaus", "Q"); @@ -335,7 +327,6 @@ void computePull(std::array & fit, const char * label, pt_error.Write(); } - void test_helix_fit(bool getcin) { int n_; const double 
B_field = 3.8 * c_speed / pow(10, 9) / 100; @@ -348,27 +339,27 @@ void test_helix_fit(bool getcin) { cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration debug" << endl; if (getcin) { cout << "hits: "; - cin >> n_; + cin >> n_; cout << "x: "; - cin >> gen_par(0); + cin >> gen_par(0); cout << "y: "; - cin >> gen_par(1); + cin >> gen_par(1); cout << "z: "; - cin >> gen_par(2); + cin >> gen_par(2); cout << "phi: "; - cin >> gen_par(3); + cin >> gen_par(3); cout << "p_t: "; - cin >> gen_par(4); + cin >> gen_par(4); cout << "eta: "; - cin >> gen_par(5); + cin >> gen_par(5); } else { - n_ = 4; - gen_par(0) = -0.1; // x - gen_par(1) = 0.1; // y - gen_par(2) = -1.; // z - gen_par(3) = 45.; // phi - gen_par(4) = 10.; // R (p_t) - gen_par(5) = 1.; // eta + n_ = 4; + gen_par(0) = -0.1; // x + gen_par(1) = 0.1; // y + gen_par(2) = -1.; // z + gen_par(3) = 45.; // phi + gen_par(4) = 10.; // R (p_t) + gen_par(5) = 1.; // eta } const int iteration = 5000; @@ -377,15 +368,14 @@ void test_helix_fit(bool getcin) { std::array helixRiemann_fit; std::cout << "\nTrue parameters: " - << "phi: " << true_par(0) << " " - << "dxy: " << true_par(1) << " " - << "pt: " << true_par(2) << " " - << "CotT: " << true_par(3) << " " - << "Zip: " << true_par(4) << " " - << std::endl; + << "phi: " << true_par(0) << " " + << "dxy: " << true_par(1) << " " + << "pt: " << true_par(2) << " " + << "CotT: " << true_par(3) << " " + << "Zip: " << true_par(4) << " " << std::endl; auto start = std::chrono::high_resolution_clock::now(); - auto delta = start-start; - for (int i = 0; i < 100*iteration; i++) { + auto delta = start - start; + for (int i = 0; i < 100 * iteration; i++) { hits_gen gen; gen = Hits_gen(n_, gen_par); // gen.hits = MatrixXd::Zero(3, 4); @@ -394,43 +384,44 @@ void test_helix_fit(bool getcin) { // gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; // gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; // gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; - delta -= std::chrono::high_resolution_clock::now()-start; - helixRiemann_fit[i%iteration] = + delta -= std::chrono::high_resolution_clock::now() - start; + helixRiemann_fit[i % iteration] = #ifdef USE_BL - BrokenLine::BL_Helix_fit(gen.hits, gen.hits_ge, B_field); + BrokenLine::BL_Helix_fit(gen.hits, gen.hits_ge, B_field); #else - Rfit::Helix_fit(gen.hits, gen.hits_ge, B_field, true); + Rfit::Helix_fit(gen.hits, gen.hits_ge, B_field, true); #endif - delta += std::chrono::high_resolution_clock::now()-start; + delta += std::chrono::high_resolution_clock::now() - start; - if (helixRiemann_fit[i%iteration].par(0)>10.) 
std::cout << "error" << std::endl; - if (0==i) - cout << std::setprecision(6) - << "phi: " << helixRiemann_fit[i].par(0) << " +/- " << sqrt(helixRiemann_fit[i].cov(0, 0)) << " vs " - << true_par(0) << endl - << "Tip: " << helixRiemann_fit[i].par(1) << " +/- " << sqrt(helixRiemann_fit[i].cov(1, 1)) << " vs " - << true_par(1) << endl - << "p_t: " << helixRiemann_fit[i].par(2) << " +/- " << sqrt(helixRiemann_fit[i].cov(2, 2)) << " vs " - << true_par(2) << endl - << "theta:" << helixRiemann_fit[i].par(3) << " +/- " << sqrt(helixRiemann_fit[i].cov(3, 3)) << " vs " - << true_par(3) << endl - << "Zip: " << helixRiemann_fit[i].par(4) << " +/- " << sqrt(helixRiemann_fit[i].cov(4, 4)) << " vs " - << true_par(4) << endl - << "charge:" << helixRiemann_fit[i].q << " vs 1" << endl - << "covariance matrix:" << endl - << helixRiemann_fit[i].cov << endl - << "Initial hits:\n" << gen.hits << endl - << "Initial Covariance:\n" << gen.hits_ge << endl; - + if (helixRiemann_fit[i % iteration].par(0) > 10.) + std::cout << "error" << std::endl; + if (0 == i) + cout << std::setprecision(6) << "phi: " << helixRiemann_fit[i].par(0) << " +/- " + << sqrt(helixRiemann_fit[i].cov(0, 0)) << " vs " << true_par(0) << endl + << "Tip: " << helixRiemann_fit[i].par(1) << " +/- " << sqrt(helixRiemann_fit[i].cov(1, 1)) << " vs " + << true_par(1) << endl + << "p_t: " << helixRiemann_fit[i].par(2) << " +/- " << sqrt(helixRiemann_fit[i].cov(2, 2)) << " vs " + << true_par(2) << endl + << "theta:" << helixRiemann_fit[i].par(3) << " +/- " << sqrt(helixRiemann_fit[i].cov(3, 3)) << " vs " + << true_par(3) << endl + << "Zip: " << helixRiemann_fit[i].par(4) << " +/- " << sqrt(helixRiemann_fit[i].cov(4, 4)) << " vs " + << true_par(4) << endl + << "charge:" << helixRiemann_fit[i].q << " vs 1" << endl + << "covariance matrix:" << endl + << helixRiemann_fit[i].cov << endl + << "Initial hits:\n" + << gen.hits << endl + << "Initial Covariance:\n" + << gen.hits_ge << endl; } - std::cout << "elapsted time " << double(std::chrono::duration_cast(delta).count())/1.e6 << std::endl; + std::cout << "elapsted time " << double(std::chrono::duration_cast(delta).count()) / 1.e6 + << std::endl; computePull(helixRiemann_fit, "Riemann", n_, iteration, true_par); } int main(int nargs, char**) { TFile f("TestFitResults.root", "RECREATE"); - test_helix_fit(nargs>1); + test_helix_fit(nargs > 1); f.Close(); return 0; } - diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index 88ba8139f01ae..a206feca83b52 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -3,359 +3,331 @@ #include #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + #ifdef USE_BL #include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" #else #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" #endif - #include "test_common.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" - using namespace Eigen; namespace Rfit { - constexpr uint32_t maxNumberOfTracks() { return 5*1024; } - constexpr uint32_t stride() { return maxNumberOfTracks();} + constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; } + constexpr uint32_t stride() { return maxNumberOfTracks(); } // hits - template - using Matrix3xNd = Eigen::Matrix; - 
template - using Map3xNd = Eigen::Map,0,Eigen::Stride<3*stride(),stride()> >; + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()>>; // errors - template - using Matrix6xNf = Eigen::Matrix; - template - using Map6xNf = Eigen::Map,0,Eigen::Stride<6*stride(),stride()> >; + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()>>; // fast fit - using Map4d = Eigen::Map >; - -} + using Map4d = Eigen::Map>; +} // namespace Rfit - -template -__global__ -void kernelPrintSizes(double * __restrict__ phits, - float * __restrict__ phits_ge - ) { - auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits+i,3,4); - Rfit::Map6xNf hits_ge(phits_ge+i,6,4); - if (i!=0) return; - printf("GPU sizes %lu %lu %lu %lu %lu\n",sizeof(hits[i]),sizeof(hits_ge[i]), - sizeof(Vector4d),sizeof(Rfit::line_fit),sizeof(Rfit::circle_fit)); +template +__global__ void kernelPrintSizes(double* __restrict__ phits, float* __restrict__ phits_ge) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, 4); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, 4); + if (i != 0) + return; + printf("GPU sizes %lu %lu %lu %lu %lu\n", + sizeof(hits[i]), + sizeof(hits_ge[i]), + sizeof(Vector4d), + sizeof(Rfit::line_fit), + sizeof(Rfit::circle_fit)); } - -template -__global__ -void kernelFastFit(double * __restrict__ phits, double * __restrict__ presults) { - auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits+i,3,N); - Rfit::Map4d result(presults+i,4); +template +__global__ void kernelFastFit(double* __restrict__ phits, double* __restrict__ presults) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map4d result(presults + i, 4); #ifdef USE_BL BrokenLine::BL_Fast_fit(hits, result); #else - Rfit::Fast_fit(hits, result); + Rfit::Fast_fit(hits, result); #endif } #ifdef USE_BL -template -__global__ -void kernelBrokenLineFit(double * __restrict__ phits, - float * __restrict__ phits_ge, - double * __restrict__ pfast_fit_input, - double B, - Rfit::circle_fit * circle_fit, - Rfit::line_fit * line_fit - ) { - auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits+i,3,N); - Rfit::Map4d fast_fit_input(pfast_fit_input+i,4); - Rfit::Map6xNf hits_ge(phits_ge+i,6,N); - +template +__global__ void kernelBrokenLineFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double* __restrict__ pfast_fit_input, + double B, + Rfit::circle_fit* circle_fit, + Rfit::line_fit* line_fit) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); + BrokenLine::PreparedBrokenLineData data; Rfit::Matrix3d Jacob; - - auto & line_fit_results = line_fit[i]; - auto & circle_fit_results = circle_fit[i]; - - BrokenLine::prepareBrokenLineData(hits,fast_fit_input,B,data); - BrokenLine::BL_Line_fit(hits_ge,fast_fit_input,B,data,line_fit_results); - BrokenLine::BL_Circle_fit(hits,hits_ge,fast_fit_input,B,data,circle_fit_results); - Jacob << 1.,0,0, - 0,1.,0, - 0,0,-B/std::copysign(Rfit::sqr(circle_fit_results.par(2)),circle_fit_results.par(2)); - circle_fit_results.par(2)=B/std::abs(circle_fit_results.par(2)); - circle_fit_results.cov=Jacob*circle_fit_results.cov*Jacob.transpose(); + + auto& line_fit_results = line_fit[i]; + auto& circle_fit_results = circle_fit[i]; + + 
BrokenLine::prepareBrokenLineData(hits, fast_fit_input, B, data); + BrokenLine::BL_Line_fit(hits_ge, fast_fit_input, B, data, line_fit_results); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_input, B, data, circle_fit_results); + Jacob << 1., 0, 0, 0, 1., 0, 0, 0, + -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); + circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); #ifdef TEST_DEBUG -if (0==i) { - printf("Circle param %f,%f,%f\n",circle_fit[i].par(0),circle_fit[i].par(1),circle_fit[i].par(2)); - } + if (0 == i) { + printf("Circle param %f,%f,%f\n", circle_fit[i].par(0), circle_fit[i].par(1), circle_fit[i].par(2)); + } #endif } #else -template -__global__ -void kernelCircleFit(double * __restrict__ phits, - float * __restrict__ phits_ge, - double * __restrict__ pfast_fit_input, - double B, - Rfit::circle_fit * circle_fit_resultsGPU) { - - auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits+i,3,N); - Rfit::Map4d fast_fit_input(pfast_fit_input+i,4); - Rfit::Map6xNf hits_ge(phits_ge+i,6,N); +template +__global__ void kernelCircleFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double* __restrict__ pfast_fit_input, + double B, + Rfit::circle_fit* circle_fit_resultsGPU) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); constexpr auto n = N; Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); - Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); - Rfit::loadCovariance2D(hits_ge,hits_cov); - + Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + Rfit::loadCovariance2D(hits_ge, hits_cov); + #ifdef TEST_DEBUG -if (0==i) { - printf("hits %f, %f\n", hits.block(0,0,2,n)(0,0), hits.block(0,0,2,n)(0,1)); - printf("hits %f, %f\n", hits.block(0,0,2,n)(1,0), hits.block(0,0,2,n)(1,1)); - printf("fast_fit_input(0): %f\n", fast_fit_input(0)); - printf("fast_fit_input(1): %f\n", fast_fit_input(1)); - printf("fast_fit_input(2): %f\n", fast_fit_input(2)); - printf("fast_fit_input(3): %f\n", fast_fit_input(3)); - printf("rad(0,0): %f\n", rad(0,0)); - printf("rad(1,1): %f\n", rad(1,1)); - printf("rad(2,2): %f\n", rad(2,2)); - printf("hits_cov(0,0): %f\n", (*hits_cov)(0,0)); - printf("hits_cov(1,1): %f\n", (*hits_cov)(1,1)); - printf("hits_cov(2,2): %f\n", (*hits_cov)(2,2)); - printf("hits_cov(11,11): %f\n", (*hits_cov)(11,11)); - printf("B: %f\n", B); -} + if (0 == i) { + printf("hits %f, %f\n", hits.block(0, 0, 2, n)(0, 0), hits.block(0, 0, 2, n)(0, 1)); + printf("hits %f, %f\n", hits.block(0, 0, 2, n)(1, 0), hits.block(0, 0, 2, n)(1, 1)); + printf("fast_fit_input(0): %f\n", fast_fit_input(0)); + printf("fast_fit_input(1): %f\n", fast_fit_input(1)); + printf("fast_fit_input(2): %f\n", fast_fit_input(2)); + printf("fast_fit_input(3): %f\n", fast_fit_input(3)); + printf("rad(0,0): %f\n", rad(0, 0)); + printf("rad(1,1): %f\n", rad(1, 1)); + printf("rad(2,2): %f\n", rad(2, 2)); + printf("hits_cov(0,0): %f\n", (*hits_cov)(0, 0)); + printf("hits_cov(1,1): %f\n", (*hits_cov)(1, 1)); + printf("hits_cov(2,2): %f\n", (*hits_cov)(2, 2)); + printf("hits_cov(11,11): %f\n", (*hits_cov)(11, 11)); + printf("B: %f\n", B); + } #endif - circle_fit_resultsGPU[i] = - Rfit::Circle_fit(hits.block(0,0,2,n), hits_cov, - fast_fit_input, rad, B, true); + circle_fit_resultsGPU[i] = 
Rfit::Circle_fit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true); #ifdef TEST_DEBUG -if (0==i) { - printf("Circle param %f,%f,%f\n",circle_fit_resultsGPU[i].par(0),circle_fit_resultsGPU[i].par(1),circle_fit_resultsGPU[i].par(2)); -} + if (0 == i) { + printf("Circle param %f,%f,%f\n", + circle_fit_resultsGPU[i].par(0), + circle_fit_resultsGPU[i].par(1), + circle_fit_resultsGPU[i].par(2)); + } #endif } -template -__global__ -void kernelLineFit(double * __restrict__ phits, - float * __restrict__ phits_ge, - double B, - Rfit::circle_fit * circle_fit, - double * __restrict__ pfast_fit_input, - Rfit::line_fit * line_fit) -{ - auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits+i,3,N); - Rfit::Map4d fast_fit_input(pfast_fit_input+i,4); - Rfit::Map6xNf hits_ge(phits_ge+i,6,N); +template +__global__ void kernelLineFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double B, + Rfit::circle_fit* circle_fit, + double* __restrict__ pfast_fit_input, + Rfit::line_fit* line_fit) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); line_fit[i] = Rfit::Line_fit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true); } #endif -template -__device__ __host__ -void fillHitsAndHitsCov(M3xN & hits, M6xN & hits_ge) { - +template +__device__ __host__ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) { constexpr uint32_t N = M3xN::ColsAtCompileTime; - if (N==5) { - hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, - 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, - -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; - hits_ge.col(0) << 1.424715e-07,-4.996975e-07,1.752614e-06,3.660689e-11,1.644638e-09,7.346080e-05; - hits_ge.col(1) << 6.899177e-08,-1.873414e-07,5.087101e-07,-2.078806e-10,-2.210498e-11,4.346079e-06; - hits_ge.col(2) << 1.406273e-06,4.042467e-07,6.391180e-07,-3.141497e-07,6.513821e-08,1.163863e-07; - hits_ge.col(3) << 1.176358e-06,2.154100e-07,5.072816e-07,-8.161219e-08,1.437878e-07,5.951832e-08; - hits_ge.col(4) << 2.852843e-05,7.956492e-06,3.117701e-06,-1.060541e-06,8.777413e-09,1.426417e-07; + if (N == 5) { + hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, + -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; + hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05; + hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06; + hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07; + hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08; + hits_ge.col(4) << 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07; return; } - - - if (N>3) - hits << 1.98645, 4.72598, 7.65632, 11.3151, - 2.18002, 4.88864, 7.75845, 11.3134, - 2.46338, 6.99838, 11.808, 17.793; + if (N > 3) + hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 17.793; else - hits << 1.98645, 4.72598, 7.65632, - 2.18002, 4.88864, 7.75845, - 2.46338, 6.99838, 11.808; - + hits << 1.98645, 4.72598, 7.65632, 2.18002, 4.88864, 7.75845, 2.46338, 6.99838, 11.808; + hits_ge.col(0)[0] = 7.14652e-06; hits_ge.col(1)[0] = 2.15789e-06; hits_ge.col(2)[0] = 1.63328e-06; - if (N>3) hits_ge.col(3)[0] = 6.27919e-06; 
+ if (N > 3) + hits_ge.col(3)[0] = 6.27919e-06; hits_ge.col(0)[2] = 6.10348e-06; hits_ge.col(1)[2] = 2.08211e-06; hits_ge.col(2)[2] = 1.61672e-06; - if (N>3) hits_ge.col(3)[2] = 6.28081e-06; + if (N > 3) + hits_ge.col(3)[2] = 6.28081e-06; hits_ge.col(0)[5] = 5.184e-05; hits_ge.col(1)[5] = 1.444e-05; hits_ge.col(2)[5] = 6.25e-06; - if (N>3) hits_ge.col(3)[5] = 3.136e-05; + if (N > 3) + hits_ge.col(3)[5] = 3.136e-05; hits_ge.col(0)[1] = -5.60077e-06; hits_ge.col(1)[1] = -1.11936e-06; hits_ge.col(2)[1] = -6.24945e-07; - if (N>3) hits_ge.col(3)[1] = -5.28e-06; + if (N > 3) + hits_ge.col(3)[1] = -5.28e-06; } - -template -__global__ -void kernelFillHitsAndHitsCov(double * __restrict__ phits, - float * phits_ge) { - auto i = blockIdx.x*blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits+i,3,N); - Rfit::Map6xNf hits_ge(phits_ge+i,6,N); - hits_ge = MatrixXf::Zero(6,N); - fillHitsAndHitsCov(hits,hits_ge); +template +__global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phits_ge) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + Rfit::Map3xNd hits(phits + i, 3, N); + Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); + hits_ge = MatrixXf::Zero(6, N); + fillHitsAndHitsCov(hits, hits_ge); } -template +template void testFit() { constexpr double B = 0.0113921; Rfit::Matrix3xNd hits; - Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6,N); - double * hitsGPU = nullptr;; - float * hits_geGPU = nullptr; - double * fast_fit_resultsGPU = nullptr; - double * fast_fit_resultsGPUret = new double[Rfit::maxNumberOfTracks()*sizeof(Vector4d)]; - Rfit::circle_fit * circle_fit_resultsGPU = nullptr; - Rfit::circle_fit * circle_fit_resultsGPUret = new Rfit::circle_fit(); - Rfit::line_fit * line_fit_resultsGPU = nullptr; - Rfit::line_fit * line_fit_resultsGPUret = new Rfit::line_fit(); + Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); + double* hitsGPU = nullptr; + ; + float* hits_geGPU = nullptr; + double* fast_fit_resultsGPU = nullptr; + double* fast_fit_resultsGPUret = new double[Rfit::maxNumberOfTracks() * sizeof(Vector4d)]; + Rfit::circle_fit* circle_fit_resultsGPU = nullptr; + Rfit::circle_fit* circle_fit_resultsGPUret = new Rfit::circle_fit(); + Rfit::line_fit* line_fit_resultsGPU = nullptr; + Rfit::line_fit* line_fit_resultsGPUret = new Rfit::line_fit(); fillHitsAndHitsCov(hits, hits_ge); - std::cout << "sizes " << N << ' ' - << sizeof(hits) << ' ' << sizeof(hits_ge) - << ' ' << sizeof(Vector4d) - << ' ' << sizeof(Rfit::line_fit) - << ' ' << sizeof(Rfit::circle_fit) - << std::endl; - + std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << ' ' + << sizeof(Rfit::line_fit) << ' ' << sizeof(Rfit::circle_fit) << std::endl; + std::cout << "Generated hits:\n" << hits << std::endl; std::cout << "Generated cov:\n" << hits_ge << std::endl; // FAST_FIT_CPU #ifdef USE_BL - Vector4d fast_fit_results; BrokenLine::BL_Fast_fit(hits, fast_fit_results); + Vector4d fast_fit_results; + BrokenLine::BL_Fast_fit(hits, fast_fit_results); #else - Vector4d fast_fit_results; Rfit::Fast_fit(hits, fast_fit_results); + Vector4d fast_fit_results; + Rfit::Fast_fit(hits, fast_fit_results); #endif std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; // for timing purposes we fit 4096 tracks constexpr uint32_t Ntracks = 4096; - cudaCheck(cudaMalloc(&hitsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::Matrix3xNd))); - cudaCheck(cudaMalloc(&hits_geGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::Matrix6xNf))); - 
cudaCheck(cudaMalloc(&fast_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Vector4d))); - cudaCheck(cudaMalloc(&line_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::line_fit))); - cudaCheck(cudaMalloc(&circle_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Rfit::circle_fit))); + cudaCheck(cudaMalloc(&hitsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::Matrix3xNd))); + cudaCheck(cudaMalloc(&hits_geGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::Matrix6xNf))); + cudaCheck(cudaMalloc(&fast_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Vector4d))); + cudaCheck(cudaMalloc(&line_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::line_fit))); + cudaCheck(cudaMalloc(&circle_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::circle_fit))); - cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, Rfit::maxNumberOfTracks()*sizeof(Vector4d))); - cudaCheck(cudaMemset(line_fit_resultsGPU, 0, Rfit::maxNumberOfTracks()*sizeof(Rfit::line_fit))); + cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, Rfit::maxNumberOfTracks() * sizeof(Vector4d))); + cudaCheck(cudaMemset(line_fit_resultsGPU, 0, Rfit::maxNumberOfTracks() * sizeof(Rfit::line_fit))); - - kernelPrintSizes<<>>(hitsGPU,hits_geGPU); - kernelFillHitsAndHitsCov<<>>(hitsGPU,hits_geGPU); + kernelPrintSizes<<>>(hitsGPU, hits_geGPU); + kernelFillHitsAndHitsCov<<>>(hitsGPU, hits_geGPU); // FAST_FIT GPU - kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); + kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); cudaDeviceSynchronize(); - - cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, Rfit::maxNumberOfTracks()*sizeof(Vector4d), cudaMemcpyDeviceToHost); - Rfit::Map4d fast_fit(fast_fit_resultsGPUret+10,4); + + cudaMemcpy(fast_fit_resultsGPUret, + fast_fit_resultsGPU, + Rfit::maxNumberOfTracks() * sizeof(Vector4d), + cudaMemcpyDeviceToHost); + Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4); std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl; assert(isEqualFuzzy(fast_fit_results, fast_fit)); - #ifdef USE_BL // CIRCLE AND LINE FIT CPU BrokenLine::PreparedBrokenLineData data; BrokenLine::karimaki_circle_fit circle_fit_results; Rfit::line_fit line_fit_results; Rfit::Matrix3d Jacob; - BrokenLine::prepareBrokenLineData(hits,fast_fit_results,B,data); - BrokenLine::BL_Line_fit(hits_ge,fast_fit_results,B,data,line_fit_results); - BrokenLine::BL_Circle_fit(hits,hits_ge,fast_fit_results,B,data,circle_fit_results); - Jacob << 1.,0,0, - 0,1.,0, - 0,0,-B/std::copysign(Rfit::sqr(circle_fit_results.par(2)),circle_fit_results.par(2)); - circle_fit_results.par(2)=B/std::abs(circle_fit_results.par(2)); - circle_fit_results.cov=Jacob*circle_fit_results.cov*Jacob.transpose(); + BrokenLine::prepareBrokenLineData(hits, fast_fit_results, B, data); + BrokenLine::BL_Line_fit(hits_ge, fast_fit_results, B, data, line_fit_results); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); + Jacob << 1., 0, 0, 0, 1., 0, 0, 0, + -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); + circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); // fit on GPU - kernelBrokenLineFit<<>>(hitsGPU, hits_geGPU, - fast_fit_resultsGPU, B, - circle_fit_resultsGPU, - line_fit_resultsGPU); + kernelBrokenLineFit + <<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU, line_fit_resultsGPU); cudaDeviceSynchronize(); - #else // CIRCLE_FIT CPU Rfit::VectorNd rad = (hits.block(0, 0, 2, 
N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); - Rfit::loadCovariance2D(hits_ge,hits_cov); - Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, N), - hits_cov, - fast_fit_results, rad, B, true); + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); + Rfit::loadCovariance2D(hits_ge, hits_cov); + Rfit::circle_fit circle_fit_results = + Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); // CIRCLE_FIT GPU - kernelCircleFit<<>>(hitsGPU, hits_geGPU, - fast_fit_resultsGPU, B, circle_fit_resultsGPU); + kernelCircleFit<<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU); cudaDeviceSynchronize(); - + // LINE_FIT CPU Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); - - kernelLineFit<<>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); + kernelLineFit + <<>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); cudaDeviceSynchronize(); #endif std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; - - cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, - sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); - - + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; - // LINE_FIT GPU + // LINE_FIT GPU cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; - assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N==5 ? 1e-4 : 1e-6)); // requires fma on CPU + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 
1e-4 : 1e-6)); // requires fma on CPU - std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; std::cout << "Fitted cov (CircleFit) GPU:\n" << circle_fit_resultsGPUret->cov << std::endl; std::cout << "Fitted cov (LineFit): GPU\n" << line_fit_resultsGPUret->cov << std::endl; - } -int main (int argc, char * argv[]) { +int main(int argc, char* argv[]) { exitSansCUDADevices(); testFit<4>(); @@ -366,4 +338,3 @@ int main (int argc, char * argv[]) { return 0; } - diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp index a1e1049392ad0..370828c4fcef9 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp +++ b/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp @@ -14,22 +14,22 @@ using namespace Eigen; namespace Rfit { - constexpr uint32_t maxNumberOfTracks() { return 5*1024; } - constexpr uint32_t stride() { return maxNumberOfTracks();} + constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; } + constexpr uint32_t stride() { return maxNumberOfTracks(); } // hits - template - using Matrix3xNd = Eigen::Matrix; - template - using Map3xNd = Eigen::Map,0,Eigen::Stride<3*stride(),stride()> >; + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()> >; // errors - template - using Matrix6xNf = Eigen::Matrix; - template - using Map6xNf = Eigen::Map,0,Eigen::Stride<6*stride(),stride()> >; + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()> >; // fast fit - using Map4d = Eigen::Map >; + using Map4d = Eigen::Map >; -} +} // namespace Rfit /* Hit global: 641,0 2: 2.934787,0.773211,-10.980247 @@ -44,120 +44,110 @@ Hit global: 641,4 1824: 12.856387,4.422212,-47.518867 Error: 641,4 1824: 2.852843e-05,7.956492e-06,3.117701e-06,-1.060541e-06,8.777413e-09,1.426417e-07 */ -template -void fillHitsAndHitsCov(M3xN & hits, M6xN & hits_ge) { - +template +void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) { constexpr uint32_t N = M3xN::ColsAtCompileTime; - if (N==5) { - hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, - 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, - -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; - hits_ge.col(0) << 1.424715e-07,-4.996975e-07,1.752614e-06,3.660689e-11,1.644638e-09,7.346080e-05; - hits_ge.col(1) << 6.899177e-08,-1.873414e-07,5.087101e-07,-2.078806e-10,-2.210498e-11,4.346079e-06; - hits_ge.col(2) << 1.406273e-06,4.042467e-07,6.391180e-07,-3.141497e-07,6.513821e-08,1.163863e-07; - hits_ge.col(3) << 1.176358e-06,2.154100e-07,5.072816e-07,-8.161219e-08,1.437878e-07,5.951832e-08; - hits_ge.col(4) << 2.852843e-05,7.956492e-06,3.117701e-06,-1.060541e-06,8.777413e-09,1.426417e-07; + if (N == 5) { + hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, + -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; + hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05; + hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06; + hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07; + hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08; + hits_ge.col(4) 
<< 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07; return; } - - if (N>3) - hits << 1.98645, 4.72598, 7.65632, 11.3151, - 2.18002, 4.88864, 7.75845, 11.3134, - 2.46338, 6.99838, 11.808, 17.793; + if (N > 3) + hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 17.793; else - hits << 1.98645, 4.72598, 7.65632, - 2.18002, 4.88864, 7.75845, - 2.46338, 6.99838, 11.808; - + hits << 1.98645, 4.72598, 7.65632, 2.18002, 4.88864, 7.75845, 2.46338, 6.99838, 11.808; + hits_ge.col(0)[0] = 7.14652e-06; hits_ge.col(1)[0] = 2.15789e-06; hits_ge.col(2)[0] = 1.63328e-06; - if (N>3) hits_ge.col(3)[0] = 6.27919e-06; + if (N > 3) + hits_ge.col(3)[0] = 6.27919e-06; hits_ge.col(0)[2] = 6.10348e-06; hits_ge.col(1)[2] = 2.08211e-06; hits_ge.col(2)[2] = 1.61672e-06; - if (N>3) hits_ge.col(3)[2] = 6.28081e-06; + if (N > 3) + hits_ge.col(3)[2] = 6.28081e-06; hits_ge.col(0)[5] = 5.184e-05; hits_ge.col(1)[5] = 1.444e-05; hits_ge.col(2)[5] = 6.25e-06; - if (N>3) hits_ge.col(3)[5] = 3.136e-05; + if (N > 3) + hits_ge.col(3)[5] = 3.136e-05; hits_ge.col(0)[1] = -5.60077e-06; hits_ge.col(1)[1] = -1.11936e-06; hits_ge.col(2)[1] = -6.24945e-07; - if (N>3) hits_ge.col(3)[1] = -5.28e-06; + if (N > 3) + hits_ge.col(3)[1] = -5.28e-06; } - -template +template void testFit() { constexpr double B = 0.0113921; Rfit::Matrix3xNd hits; - Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6,N); + Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); fillHitsAndHitsCov(hits, hits_ge); - std::cout << "sizes " << N << ' ' - < data; BrokenLine::karimaki_circle_fit circle_fit_results; Rfit::Matrix3d Jacob; - - BrokenLine::prepareBrokenLineData(hits,fast_fit_results,B,data); + + BrokenLine::prepareBrokenLineData(hits, fast_fit_results, B, data); Rfit::line_fit line_fit_results; - BrokenLine::BL_Line_fit(hits_ge,fast_fit_results,B,data,line_fit_results); - BrokenLine::BL_Circle_fit(hits,hits_ge,fast_fit_results,B,data,circle_fit_results); - Jacob << 1.,0,0, - 0,1.,0, - 0,0,-B/std::copysign(Rfit::sqr(circle_fit_results.par(2)),circle_fit_results.par(2)); - circle_fit_results.par(2)=B/std::abs(circle_fit_results.par(2)); - circle_fit_results.cov=Jacob*circle_fit_results.cov*Jacob.transpose(); + BrokenLine::BL_Line_fit(hits_ge, fast_fit_results, B, data, line_fit_results); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); + Jacob << 1., 0, 0, 0, 1., 0, 0, 0, + -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); + circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); #else Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); - Rfit::loadCovariance2D(hits_ge,hits_cov); - Rfit::circle_fit circle_fit_results = Rfit::Circle_fit(hits.block(0, 0, 2, N), - hits_cov, - fast_fit_results, rad, B, true); + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); + Rfit::loadCovariance2D(hits_ge, hits_cov); + Rfit::circle_fit circle_fit_results = + Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); // LINE_FIT CPU Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); Rfit::par_uvrtopak(circle_fit_results, B, true); #endif - - std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par - << "\nchi2 " << circle_fit_results.chi2 << std::endl; - std::cout << "Fitted 
values (LineFit):\n" << line_fit_results.par - << "\nchi2 " << line_fit_results.chi2 << std::endl; + + std::cout << "Fitted values (CircleFit):\n" + << circle_fit_results.par << "\nchi2 " << circle_fit_results.chi2 << std::endl; + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << "\nchi2 " << line_fit_results.chi2 << std::endl; std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; } -int main (int argc, char * argv[]) { +int main(int argc, char* argv[]) { testFit<4>(); testFit<3>(); testFit<5>(); return 0; } - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 894a80616af02..12d1707fdd388 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -2,150 +2,155 @@ // Author: Felice Pantaleo, CERN // -#include "HelixFitOnGPU.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" - #include + #include +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "HelixFitOnGPU.h" using HitsOnGPU = TrackingRecHit2DSOAView; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using namespace Eigen; - // #define BL_DUMP_HITS -template -__global__ -void kernelBLFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, - CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, - HitsOnGPU const * __restrict__ hhp, - double * __restrict__ phits, - float * __restrict__ phits_ge, - double * __restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) -{ - +template +__global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ foundNtuplets, + CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { constexpr uint32_t hitsInFit = N; - assert(hitsInFit<=nHits); + assert(hitsInFit <= nHits); - assert(pfast_fit); assert(foundNtuplets); + assert(pfast_fit); + assert(foundNtuplets); // look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); #ifdef BROKENLINE_DEBUG - if (0==local_start) printf("%d Ntuple of size %d for %d hits to fit\n",tupleMultiplicity->size(nHits), nHits, hitsInFit); + if (0 == local_start) + printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); #endif auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) return; + if (tuple_start >= tupleMultiplicity->size(nHits)) + return; // get it from the ntuple container (one to one to helix) - auto helix_start = 
*(tupleMultiplicity->begin(nHits)+tuple_start); - assert (helix_start < foundNtuplets->nbins()); + auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); + assert(helix_start < foundNtuplets->nbins()); - assert (foundNtuplets->size(helix_start)==nHits); + assert(foundNtuplets->size(helix_start) == nHits); - Rfit::Map3xNd hits(phits+local_start); - Rfit::Map4d fast_fit(pfast_fit+local_start); - Rfit::Map6xNf hits_ge(phits_ge+local_start); + Rfit::Map3xNd hits(phits + local_start); + Rfit::Map4d fast_fit(pfast_fit + local_start); + Rfit::Map6xNf hits_ge(phits_ge + local_start); #ifdef BL_DUMP_HITS __shared__ int done; done = 0; - __syncthreads(); - bool dump = (foundNtuplets->size(helix_start)==5 && - 0 == atomicAdd(&done,1)); + __syncthreads(); + bool dump = (foundNtuplets->size(helix_start) == 5 && 0 == atomicAdd(&done, 1)); #endif // Prepare data structure - auto const * hitId = foundNtuplets->begin(helix_start); + auto const *hitId = foundNtuplets->begin(helix_start); for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; float ge[6]; hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); #ifdef BL_DUMP_HITS - if (dump){ - printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detectorIndex(hit),i,hhp->xGlobal(hit),hhp->yGlobal(hit),hhp->zGlobal(hit)); - printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detetectorIndex(hit),i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + if (dump) { + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", + helix_start, + hhp->detectorIndex(hit), + i, + hhp->xGlobal(hit), + hhp->yGlobal(hit), + hhp->zGlobal(hit)); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", + helix_start, + hhp->detetectorIndex(hit), + i, + ge[0], + ge[1], + ge[2], + ge[3], + ge[4], + ge[5]); } #endif hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); - hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } - BrokenLine::BL_Fast_fit(hits,fast_fit); + BrokenLine::BL_Fast_fit(hits, fast_fit); // no NaN here.... - assert(fast_fit(0)==fast_fit(0)); - assert(fast_fit(1)==fast_fit(1)); - assert(fast_fit(2)==fast_fit(2)); - assert(fast_fit(3)==fast_fit(3)); - -} - -template -__global__ -void kernelBLFit( - CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, - double B, - Rfit::helix_fit *results, - double * __restrict__ phits, - float * __restrict__ phits_ge, - double * __restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) -{ - - assert(N<=nHits); + assert(fast_fit(0) == fast_fit(0)); + assert(fast_fit(1) == fast_fit(1)); + assert(fast_fit(2) == fast_fit(2)); + assert(fast_fit(3) == fast_fit(3)); +} - assert(results); assert(pfast_fit); +template +__global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + double B, + Rfit::helix_fit *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { + assert(N <= nHits); + assert(results); + assert(pfast_fit); // same as above... 
// look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) return; + if (tuple_start >= tupleMultiplicity->size(nHits)) + return; // get it for the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits)+tuple_start); + auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); - - Rfit::Map3xNd hits(phits+local_start); - Rfit::Map4d fast_fit(pfast_fit+local_start); - Rfit::Map6xNf hits_ge(phits_ge+local_start); + Rfit::Map3xNd hits(phits + local_start); + Rfit::Map4d fast_fit(pfast_fit + local_start); + Rfit::Map6xNf hits_ge(phits_ge + local_start); BrokenLine::PreparedBrokenLineData data; Rfit::Matrix3d Jacob; BrokenLine::karimaki_circle_fit circle; Rfit::line_fit line; - - BrokenLine::prepareBrokenLineData(hits,fast_fit,B,data); - BrokenLine::BL_Line_fit(hits_ge,fast_fit,B,data,line); - BrokenLine::BL_Circle_fit(hits,hits_ge,fast_fit,B,data,circle); - Jacob << 1,0,0, - 0,1,0, - 0,0,-B/std::copysign(Rfit::sqr(circle.par(2)),circle.par(2)); - circle.par(2)=B/std::abs(circle.par(2)); - circle.cov=Jacob*circle.cov*Jacob.transpose(); + BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); + BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + Jacob << 1, 0, 0, 0, 1, 0, 0, 0, -B / std::copysign(Rfit::sqr(circle.par(2)), circle.par(2)); + circle.par(2) = B / std::abs(circle.par(2)); + circle.cov = Jacob * circle.cov * Jacob.transpose(); // Grab helix_fit from the proper location in the output vector - auto & helix = results[helix_start]; + auto &helix = results[helix_start]; helix.par << circle.par, line.par; helix.cov = Rfit::Matrix5d::Zero(); @@ -157,86 +162,131 @@ void kernelBLFit( helix.chi2_line = line.chi2; #ifdef BROKENLINE_DEBUG - if ( !(circle.chi2>=0) || !(line.chi2>=0) ) printf("kernelBLFit failed! %f/%f\n", helix.chi2_circle,helix.chi2_line); - printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N,nHits, helix_start, - circle.par(0), circle.par(1), circle.par(2)); - printf("kernelBLHits line.par(0,1): %d %f,%f\n", helix_start, line.par(0),line.par(1)); - printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n",helix.chi2_circle,helix.chi2_line, - helix.cov(0,0),helix.cov(1,1),helix.cov(2,2),helix.cov(3,3),helix.cov(4,4)); + if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) + printf("kernelBLFit failed! 
%f/%f\n", helix.chi2_circle, helix.chi2_line); + printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + helix_start, + circle.par(0), + circle.par(1), + circle.par(2)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", helix_start, line.par(0), line.par(1)); + printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", + helix.chi2_circle, + helix.chi2_line, + helix.cov(0, 0), + helix.cov(1, 1), + helix.cov(2, 2), + helix.cov(3, 3), + helix.cov(4, 4)); #endif } - -void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream) -{ - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - edm::Service cs; - auto hitsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream); - auto hits_geGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream); - auto fast_fit_resultsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream); - - for (uint32_t offset=0; offset<<>>( - tuples_d, tupleMultiplicity_d, hh.view(), - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), - 3, offset); +void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cuda::stream_t<> &stream) { + assert(tuples_d); + + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + edm::Service cs; + auto hitsGPU_ = cs->make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = + cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = + cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + kernelBLFastFit<3><<>>(tuples_d, + tupleMultiplicity_d, + hh.view(), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<3><<>>(tupleMultiplicity_d, + bField_, + helix_fit_results_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + cudaCheck(cudaGetLastError()); + + // fit quads + kernelBLFastFit<4><<>>(tuples_d, + tupleMultiplicity_d, + hh.view(), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<4><<>>(tupleMultiplicity_d, + bField_, + helix_fit_results_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // fit penta (only first 4) + kernelBLFastFit<4><<>>(tuples_d, + tupleMultiplicity_d, + hh.view(), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); - kernelBLFit<3><<>>( - tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), - 3, offset); + kernelBLFit<4><<>>(tupleMultiplicity_d, + bField_, + helix_fit_results_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); - - // fit quads - 
kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hh.view(), - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), - 4, offset); + } else { + // fit penta (all 5) + kernelBLFastFit<5><<>>(tuples_d, + tupleMultiplicity_d, + hh.view(), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>( - tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), - 4, offset); + kernelBLFit<5><<>>(tupleMultiplicity_d, + bField_, + helix_fit_results_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); + } - if (fit5as4_) { - // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hh.view(), - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), - 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>( - tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), - 5, offset); - cudaCheck(cudaGetLastError()); - } else { - // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hh.view(), - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), - 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<5><<>>( - tupleMultiplicity_d, bField_, helix_fit_results_d, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), - 5, offset); - cudaCheck(cudaGetLastError()); - } - - } // loop on concurrent fits - - + } // loop on concurrent fits } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index a33c613402dcf..aa9b4e188deb5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -1,62 +1,60 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h #define RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h -#include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include + #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" // #define ONLY_PHICUT namespace CAConstants { - // constants + // constants #ifdef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfTuples() { return 3*1024;} + constexpr uint32_t maxNumberOfTuples() { return 3 * 1024; } #else - constexpr uint32_t maxNumberOfTuples() { return 6*1024;} + constexpr uint32_t maxNumberOfTuples() { return 6 * 1024; } #endif - constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } + constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT #ifndef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfDoublets() { return 262144; } - constexpr uint32_t maxCellsPerHit() { return 128; } + constexpr uint32_t maxNumberOfDoublets() { return 262144; } + constexpr uint32_t maxCellsPerHit() { return 128; } #else - constexpr uint32_t maxNumberOfDoublets() { return 262144/2; } - constexpr uint32_t maxCellsPerHit() { return 128/2; } + constexpr uint32_t maxNumberOfDoublets() { return 262144 / 
2; } + constexpr uint32_t maxCellsPerHit() { return 128 / 2; } #endif #else - constexpr uint32_t maxNumberOfDoublets() { return 6*262144; } - constexpr uint32_t maxCellsPerHit() { return 4*128; } + constexpr uint32_t maxNumberOfDoublets() { return 6 * 262144; } + constexpr uint32_t maxCellsPerHit() { return 4 * 128; } #endif - constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets()/4;} - - - constexpr uint32_t maxNumberOfLayerPairs() { return 13; } - constexpr uint32_t maxNumberOfLayers() { return 10; } - constexpr uint32_t maxTuples() { return maxNumberOfTuples();} + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } - // types - using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct - using tindex_type = uint16_t; // for tuples + constexpr uint32_t maxNumberOfLayerPairs() { return 13; } + constexpr uint32_t maxNumberOfLayers() { return 10; } + constexpr uint32_t maxTuples() { return maxNumberOfTuples(); } - using CellNeighbors = GPU::VecArray< uint32_t, 36>; - using CellTracks = GPU::VecArray< tindex_type, 42>; + // types + using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct + using tindex_type = uint16_t; // for tuples - using CellNeighborsVector = GPU::SimpleVector; - using CellTracksVector = GPU::SimpleVector; + using CellNeighbors = GPU::VecArray; + using CellTracks = GPU::VecArray; - using OuterHitOfCell = GPU::VecArray< uint32_t, maxCellsPerHit()>; - using TuplesContainer = OneToManyAssoc; - using HitToTuple = OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = OneToManyAssoc; + using CellNeighborsVector = GPU::SimpleVector; + using CellTracksVector = GPU::SimpleVector; -} + using OuterHitOfCell = GPU::VecArray; + using TuplesContainer = OneToManyAssoc; + using HitToTuple = + OneToManyAssoc; // 3.5 should be enough + using TupleMultiplicity = OneToManyAssoc; +} // namespace CAConstants - -#endif - +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index f20a96dc79b73..91f2f903295ef 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -1,8 +1,9 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h +#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h + // // Author: Felice Pantaleo, CERN // -#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h -#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h #include @@ -11,12 +12,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" - #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" class GPUCACell { public: - using ptrAsInt = unsigned long long; static constexpr int maxCellsPerHit = CAConstants::maxCellsPerHit(); @@ -29,7 +28,7 @@ class GPUCACell { using Hits = TrackingRecHit2DSOAView; using hindex_type = Hits::hindex_type; - using TmpTuple = GPU::VecArray; + using TmpTuple = GPU::VecArray; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; @@ -38,12 +37,13 @@ class GPUCACell { GPUCACell() = default; #ifdef __CUDACC__ - __device__ __forceinline__ - void init(CellNeighborsVector & cellNeighbors, CellTracksVector & cellTracks, - Hits const & hh, - int layerPairId, int doubletId, - 
hindex_type innerHitId, hindex_type outerHitId) - { + __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, + CellTracksVector& cellTracks, + Hits const& hh, + int layerPairId, + int doubletId, + hindex_type innerHitId, + hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; theDoubletId = doubletId; @@ -56,62 +56,56 @@ class GPUCACell { tracks().reset(); assert(outerNeighbors().empty()); assert(tracks().empty()); - } - - __device__ __forceinline__ - int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector & cellNeighbors) { - return outerNeighbors().push_back(t); + __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + return outerNeighbors().push_back(t); } - __device__ __forceinline__ - int addTrack(CellTracks::value_t t, CellTracksVector & cellTracks) { - return tracks().push_back(t); - } - - __device__ __forceinline__ CellTracks & tracks() { return theTracks;} - __device__ __forceinline__ CellTracks const & tracks() const { return theTracks;} - __device__ __forceinline__ CellNeighbors & outerNeighbors() { return theOuterNeighbors;} - __device__ __forceinline__ CellNeighbors const & outerNeighbors() const { return theOuterNeighbors;} - __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return hh.xGlobal(theInnerHitId); } - __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return hh.xGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return hh.yGlobal(theInnerHitId); } - __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return hh.yGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return hh.zGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return hh.rGlobal(theOuterHitId); } - - __device__ __forceinline__ auto get_inner_iphi(Hits const & hh) const { return hh.iphi(theInnerHitId); } - __device__ __forceinline__ auto get_outer_iphi(Hits const & hh) const { return hh.iphi(theOuterHitId); } - - __device__ __forceinline__ float get_inner_detId(Hits const & hh) const { return hh.detectorIndex(theInnerHitId); } - __device__ __forceinline__ float get_outer_detId(Hits const & hh) const { return hh.detectorIndex(theOuterHitId); } - - constexpr unsigned int get_inner_hit_id() const { - return theInnerHitId; - } - constexpr unsigned int get_outer_hit_id() const { - return theOuterHitId; + __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + return tracks().push_back(t); } - - __device__ - void print_cell() const { - printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " - "%d, innerradius %f, outerRadius %f \n", - theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId - ); + __device__ __forceinline__ CellTracks& tracks() { return theTracks; } + __device__ __forceinline__ CellTracks const& tracks() const { return theTracks; } + __device__ __forceinline__ CellNeighbors& outerNeighbors() { return theOuterNeighbors; } + __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return 
theOuterNeighbors; } + __device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const& hh) const { + return theInnerZ; + } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const& hh) const { + return theInnerR; + } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float get_outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } + + __device__ __forceinline__ auto get_inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } + __device__ __forceinline__ auto get_outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } + + __device__ __forceinline__ float get_inner_detId(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } + __device__ __forceinline__ float get_outer_detId(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } + + constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } + constexpr unsigned int get_outer_hit_id() const { return theOuterHitId; } + + __device__ void print_cell() const { + printf( + "printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " + "%d, innerradius %f, outerRadius %f \n", + theDoubletId, + theLayerPairId, + theInnerHitId, + theOuterHitId); } - - __device__ - bool check_alignment(Hits const & hh, - GPUCACell const & otherCell, - const float ptmin, - const float hardCurvCut) const - { + __device__ bool check_alignment(Hits const& hh, + GPUCACell const& otherCell, + const float ptmin, + const float hardCurvCut) const { auto ri = get_inner_r(hh); auto zi = get_inner_z(hh); @@ -120,36 +114,30 @@ class GPUCACell { auto r1 = otherCell.get_inner_r(hh); auto z1 = otherCell.get_inner_z(hh); - auto isBarrel = otherCell.get_outer_detId(hh)<1184; - bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, isBarrel ? 0.002f : 0.003f); // 2.f*thetaCut); // FIXME tune cuts - return (aligned && dcaCut(hh, otherCell, otherCell.get_inner_detId(hh)<96 ? 0.15f : 0.25f, hardCurvCut)); // FIXME tune cuts - // region_origin_radius_plus_tolerance, hardCurvCut)); + auto isBarrel = otherCell.get_outer_detId(hh) < 1184; + bool aligned = + areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, isBarrel ? 0.002f : 0.003f); // 2.f*thetaCut); // FIXME tune cuts + return (aligned && + dcaCut(hh, otherCell, otherCell.get_inner_detId(hh) < 96 ? 
0.15f : 0.25f, hardCurvCut)); // FIXME tune cuts + // region_origin_radius_plus_tolerance, hardCurvCut)); } - __device__ __forceinline__ - static bool areAlignedRZ(float r1, float z1, float ri, float zi, float ro, float zo, - const float ptmin, - const float thetaCut) { + __device__ __forceinline__ static bool areAlignedRZ( + float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, const float thetaCut) { float radius_diff = std::abs(r1 - ro); - float distance_13_squared = - radius_diff * radius_diff + (z1 - zo) * (z1 - zo); + float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo); - float pMin = - ptmin * std::sqrt(distance_13_squared); // this needs to be divided by - // radius_diff later + float pMin = ptmin * std::sqrt(distance_13_squared); // this needs to be divided by + // radius_diff later - float tan_12_13_half_mul_distance_13_squared = - fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); + float tan_12_13_half_mul_distance_13_squared = fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - - __device__ - inline bool - dcaCut(Hits const & hh, GPUCACell const & otherCell, - const float region_origin_radius_plus_tolerance, - const float maxCurv) const { - + __device__ inline bool dcaCut(Hits const& hh, + GPUCACell const& otherCell, + const float region_origin_radius_plus_tolerance, + const float maxCurv) const { auto x1 = otherCell.get_inner_x(hh); auto y1 = otherCell.get_inner_y(hh); @@ -159,48 +147,43 @@ class GPUCACell { auto x3 = get_outer_x(hh); auto y3 = get_outer_y(hh); - CircleEq eq(x1,y1,x2,y2,x3,y3); - - if (eq.curvature() > maxCurv) return false; + CircleEq eq(x1, y1, x2, y2, x3, y3); - return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance*std::abs(eq.curvature()); + if (eq.curvature() > maxCurv) + return false; + return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - __device__ - inline bool - hole(Hits const & hh, GPUCACell const & innerCell) const { + __device__ inline bool hole(Hits const& hh, GPUCACell const& innerCell) const { int p = get_outer_iphi(hh); - if (p<0) p+=std::numeric_limits::max(); - p = (64*p)/std::numeric_limits::max(); - p %=2; - float r4 = p==0 ? 15.815 : 16.146; // later on from geom + if (p < 0) + p += std::numeric_limits::max(); + p = (64 * p) / std::numeric_limits::max(); + p %= 2; + float r4 = p == 0 ? 15.815 : 16.146; // later on from geom auto ri = innerCell.get_inner_r(hh); auto zi = innerCell.get_inner_z(hh); auto ro = get_outer_r(hh); auto zo = get_outer_z(hh); - auto z4 = std::abs(zi + (r4-ri)*(zo-zi)/(ro-ri)); - auto zm = z4-6.7*int(z4/6.7); - auto h = zm<0.2 || zm>6.5; - return h || ( z4>26 && z4<32.f); + auto z4 = std::abs(zi + (r4 - ri) * (zo - zi) / (ro - ri)); + auto zm = z4 - 6.7 * int(z4 / 6.7); + auto h = zm < 0.2 || zm > 6.5; + return h || (z4 > 26 && z4 < 32.f); } - // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. 
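  // A sketch of the recursion implemented by find_ntuplets (summarised from the body
  // below): the cell pushes its doublet id onto tmpNtuplet and recurses into every
  // outer neighbour; when a chain has no outer neighbour and holds at least
  // minHitsPerNtuplet - 1 doublets (triplets additionally required to point to the
  // hole unless ALL_TRIPLETS is defined), the inner hit of each doublet plus the last
  // outer hit are bulk-filled into foundNtuplets, each traversed cell registers the
  // new track index via addTrack(), and tupleMultiplicity counts the tuple size.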
- template - __device__ - inline void find_ntuplets( - Hits const & hh, - GPUCACell * __restrict__ cells, - CellTracksVector & cellTracks, - TuplesOnGPU::Container & foundNtuplets, - AtomicPairCounter & apc, - CM & tupleMultiplicity, - TmpTuple & tmpNtuplet, - const unsigned int minHitsPerNtuplet) const - { + template + __device__ inline void find_ntuplets(Hits const& hh, + GPUCACell* __restrict__ cells, + CellTracksVector& cellTracks, + TuplesOnGPU::Container& foundNtuplets, + AtomicPairCounter& apc, + CM& tupleMultiplicity, + TmpTuple& tmpNtuplet, + const unsigned int minHitsPerNtuplet) const { // the building process for a track ends if: // it has no right neighbor // it has no compatible neighbor @@ -208,28 +191,31 @@ class GPUCACell { // than a threshold tmpNtuplet.push_back_unsafe(theDoubletId); - assert(tmpNtuplet.size()<=4); + assert(tmpNtuplet.size() <= 4); - if(outerNeighbors().size()>0) { + if (outerNeighbors().size() > 0) { for (int j = 0; j < outerNeighbors().size(); ++j) { auto otherCell = outerNeighbors()[j]; - cells[otherCell].find_ntuplets(hh, cells, cellTracks, foundNtuplets, apc, tupleMultiplicity, - tmpNtuplet, minHitsPerNtuplet); + cells[otherCell].find_ntuplets( + hh, cells, cellTracks, foundNtuplets, apc, tupleMultiplicity, tmpNtuplet, minHitsPerNtuplet); } } else { // if long enough save... - if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet-1) { + if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { #ifndef ALL_TRIPLETS // triplets accepted only pointing to the hole - if (tmpNtuplet.size()>=3 || hole(hh, cells[tmpNtuplet[0]])) + if (tmpNtuplet.size() >= 3 || hole(hh, cells[tmpNtuplet[0]])) #endif - { - hindex_type hits[6]; auto nh=0U; - for (auto c : tmpNtuplet) hits[nh++] = cells[c].theInnerHitId; - hits[nh] = theOuterHitId; - auto it = foundNtuplets.bulkFill(apc,hits,tmpNtuplet.size()+1); - if (it>=0) { // if negative is overflow.... - for (auto c : tmpNtuplet) cells[c].addTrack(it,cellTracks); - tupleMultiplicity.countDirect(tmpNtuplet.size()+1); + { + hindex_type hits[6]; + auto nh = 0U; + for (auto c : tmpNtuplet) + hits[nh++] = cells[c].theInnerHitId; + hits[nh] = theOuterHitId; + auto it = foundNtuplets.bulkFill(apc, hits, tmpNtuplet.size() + 1); + if (it >= 0) { // if negative is overflow.... 
+ for (auto c : tmpNtuplet) + cells[c].addTrack(it, cellTracks); + tupleMultiplicity.countDirect(tmpNtuplet.size() + 1); } } } @@ -238,7 +224,7 @@ class GPUCACell { assert(tmpNtuplet.size() < 4); } -#endif // __CUDACC__ +#endif // __CUDACC__ private: CellNeighbors theOuterNeighbors; @@ -255,4 +241,4 @@ class GPUCACell { hindex_type theOuterHitId; }; -#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index b2eab7626279e..a374c975ef4b6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,19 +1,16 @@ -#include "HelixFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMultiplicity const * tupleMultiplicity, Rfit::helix_fit * helix_fit_results) { - +void HelixFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const* tuples, + TupleMultiplicity const* tupleMultiplicity, + Rfit::helix_fit* helix_fit_results) { tuples_d = tuples; tupleMultiplicity_d = tupleMultiplicity; helix_fit_results_d = helix_fit_results; - assert(tuples_d); assert(tupleMultiplicity_d); assert(helix_fit_results_d); - + assert(tuples_d); + assert(tupleMultiplicity_d); + assert(helix_fit_results_d); } -void HelixFitOnGPU::deallocateOnGPU() { - -} - - - +void HelixFitOnGPU::deallocateOnGPU() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index a50116c87321d..06a8e0b0b5505 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,72 +1,73 @@ #ifndef RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h #define RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h +#include + #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" -#include - class TrackingRecHit2DSOAView; class TrackingRecHit2DCUDA; - namespace Rfit { - constexpr uint32_t maxNumberOfConcurrentFits() { return 6*1024;} - constexpr uint32_t stride() { return maxNumberOfConcurrentFits();} - using Matrix3x4d = Eigen::Matrix; - using Map3x4d = Eigen::Map >; - using Matrix6x4f = Eigen::Matrix; - using Map6x4f = Eigen::Map >; + constexpr uint32_t maxNumberOfConcurrentFits() { return 6 * 1024; } + constexpr uint32_t stride() { return maxNumberOfConcurrentFits(); } + using Matrix3x4d = Eigen::Matrix; + using Map3x4d = Eigen::Map >; + using Matrix6x4f = Eigen::Matrix; + using Map6x4f = Eigen::Map >; // hits - template - using Matrix3xNd = Eigen::Matrix; - template - using Map3xNd = Eigen::Map,0,Eigen::Stride<3*stride(),stride()> >; + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()> >; // errors - template - using Matrix6xNf = Eigen::Matrix; - template - using Map6xNf = Eigen::Map,0,Eigen::Stride<6*stride(),stride()> >; + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()> >; // fast fit - using Map4d = Eigen::Map >; - -} + using Map4d = Eigen::Map >; +} // namespace Rfit class HelixFitOnGPU { public: - - using HitsOnGPU = 
TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DCUDA; - - using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; - using TupleMultiplicity = CAConstants::TupleMultiplicity; - - explicit HelixFitOnGPU(bool fit5as4) : fit5as4_(fit5as4) {} - ~HelixFitOnGPU() { deallocateOnGPU();} - - void setBField(double bField) { bField_ = bField;} - void launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); - void launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); - - void allocateOnGPU(TuplesOnGPU::Container const * tuples, TupleMultiplicity const * tupleMultiplicity, Rfit::helix_fit * helix_fit_results); - void deallocateOnGPU(); - + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + + using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; + using TupleMultiplicity = CAConstants::TupleMultiplicity; + + explicit HelixFitOnGPU(bool fit5as4) : fit5as4_(fit5as4) {} + ~HelixFitOnGPU() { deallocateOnGPU(); } + + void setBField(double bField) { bField_ = bField; } + void launchRiemannKernels(HitsOnCPU const &hh, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cuda::stream_t<> &cudaStream); + void launchBrokenLineKernels(HitsOnCPU const &hh, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cuda::stream_t<> &cudaStream); + + void allocateOnGPU(TuplesOnGPU::Container const *tuples, + TupleMultiplicity const *tupleMultiplicity, + Rfit::helix_fit *helix_fit_results); + void deallocateOnGPU(); private: + static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); - static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); - - // fowarded - TuplesOnGPU::Container const * tuples_d = nullptr; - TupleMultiplicity const * tupleMultiplicity_d = nullptr; - double bField_; - Rfit::helix_fit * helix_fit_results_d = nullptr; - - const bool fit5as4_; - + // fowarded + TuplesOnGPU::Container const *tuples_d = nullptr; + TupleMultiplicity const *tupleMultiplicity_d = nullptr; + double bField_; + Rfit::helix_fit *helix_fit_results_d = nullptr; + const bool fit5as4_; }; -#endif +#endif // RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h index f7538fc822011..3279587d2486e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h @@ -3,77 +3,82 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h #define RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h - #include #include -#include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" -#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" #include "DataFormats/SiStripCluster/interface/SiStripCluster.h" +#include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" +#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" // store T for each cluster -template +template class RecHitsMap { public: - explicit RecHitsMap(T const & d=T()) : dummy(d){} - - void clear() {m_map.clear();} - - void error(const GeomDetUnit& gd) const {edm::LogError("RecHitMap") << "hit not found in det " << gd.index(); } - void error(uint32_t ind) const 
{edm::LogError("RecHitMap") << "hit not found in det " << ind; } - - // does not work for matched hits... (easy to extend) - void add(TrackingRecHit const & hit, T const & v) { - auto const & thit = static_cast(hit); - auto const & clus = thit.firstClusterRef(); - - if (clus.isPixel()) - add(clus.pixelCluster(), *thit.detUnit(),v); - else - add(clus.stripCluster(), *thit.detUnit(),v); - } - - template - void add(const Cluster& cluster, const GeomDetUnit& gd, T const & v) { m_map[encode(cluster,gd)] = v; } - - template - T const & get(const Cluster& cluster, const GeomDetUnit& gd) const { - auto p = m_map.find(encode(cluster,gd)); - if (p!=m_map.end()) { return (*p).second; } - error(gd); - return dummy; - } - - T const & get(uint32_t ind, uint16_t mr, uint16_t mc) const { - auto p = m_map.find(encode(ind,mr,mc)); - if (p!=m_map.end()) { return (*p).second; } - error(ind); - return dummy; - } - - static uint64_t encode(uint32_t ind, uint16_t mr, uint16_t mc) { - uint64_t u1 = ind; - uint64_t u2 = mr; - uint64_t u3 = mc; - return (u1<<32) | (u2<<16) | u3; - } - - static uint64_t encode(const SiPixelCluster& cluster, const GeomDetUnit& det) { - uint64_t u1 = det.index(); - uint64_t u2 = cluster.minPixelRow(); - uint64_t u3 = cluster.minPixelCol(); - return (u1<<32) | (u2<<16) | u3; - } - static uint64_t encode(const SiStripCluster& cluster, const GeomDetUnit& det) { - uint64_t u1 = det.index(); - uint64_t u2 = cluster.firstStrip(); - return (u1<<32) | u2; - } - - std::unordered_map m_map; - T dummy; - }; - -#endif // RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h + explicit RecHitsMap(T const& d = T()) : dummy(d) {} + + void clear() { m_map.clear(); } + + void error(const GeomDetUnit& gd) const { edm::LogError("RecHitMap") << "hit not found in det " << gd.index(); } + void error(uint32_t ind) const { edm::LogError("RecHitMap") << "hit not found in det " << ind; } + + // does not work for matched hits... 
(easy to extend) + void add(TrackingRecHit const& hit, T const& v) { + auto const& thit = static_cast(hit); + auto const& clus = thit.firstClusterRef(); + + if (clus.isPixel()) + add(clus.pixelCluster(), *thit.detUnit(), v); + else + add(clus.stripCluster(), *thit.detUnit(), v); + } + + template + void add(const Cluster& cluster, const GeomDetUnit& gd, T const& v) { + m_map[encode(cluster, gd)] = v; + } + + template + T const& get(const Cluster& cluster, const GeomDetUnit& gd) const { + auto p = m_map.find(encode(cluster, gd)); + if (p != m_map.end()) { + return (*p).second; + } + error(gd); + return dummy; + } + + T const& get(uint32_t ind, uint16_t mr, uint16_t mc) const { + auto p = m_map.find(encode(ind, mr, mc)); + if (p != m_map.end()) { + return (*p).second; + } + error(ind); + return dummy; + } + + static uint64_t encode(uint32_t ind, uint16_t mr, uint16_t mc) { + uint64_t u1 = ind; + uint64_t u2 = mr; + uint64_t u3 = mc; + return (u1 << 32) | (u2 << 16) | u3; + } + + static uint64_t encode(const SiPixelCluster& cluster, const GeomDetUnit& det) { + uint64_t u1 = det.index(); + uint64_t u2 = cluster.minPixelRow(); + uint64_t u3 = cluster.minPixelCol(); + return (u1 << 32) | (u2 << 16) | u3; + } + static uint64_t encode(const SiStripCluster& cluster, const GeomDetUnit& det) { + uint64_t u1 = det.index(); + uint64_t u2 = cluster.firstStrip(); + return (u1 << 32) | u2; + } + + std::unordered_map m_map; + T dummy; +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index ea801b1b46389..4aea729e913a6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -2,68 +2,66 @@ // Author: Felice Pantaleo, CERN // -#include "HelixFitOnGPU.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" - #include + #include -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" - #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" - +#include "HelixFitOnGPU.h" using HitsOnGPU = TrackingRecHit2DSOAView; using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; using namespace Eigen; -template -__global__ -void kernelFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, - CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, - uint32_t nHits, - HitsOnGPU const * __restrict__ hhp, - double * __restrict__ phits, - float * __restrict__ phits_ge, - double * __restrict__ pfast_fit, - uint32_t offset) -{ - +template +__global__ void kernelFastFit(TuplesOnGPU::Container const *__restrict__ foundNtuplets, + CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t offset) { constexpr uint32_t 
hitsInFit = N; - assert(hitsInFit<=nHits); + assert(hitsInFit <= nHits); - assert(pfast_fit); assert(foundNtuplets); assert(tupleMultiplicity); + assert(pfast_fit); + assert(foundNtuplets); + assert(tupleMultiplicity); // look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); #ifdef RIEMANN_DEBUG - if (0==local_start) printf("%d Ntuple of size %d for %d hits to fit\n",tupleMultiplicity->size(nHits), nHits, hitsInFit); + if (0 == local_start) + printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); #endif auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) return; + if (tuple_start >= tupleMultiplicity->size(nHits)) + return; // get it from the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits)+tuple_start); - assert (helix_start < foundNtuplets->nbins()); - - assert (foundNtuplets->size(helix_start)==nHits); + auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); + assert(helix_start < foundNtuplets->nbins()); - Rfit::Map3xNd hits(phits+local_start); - Rfit::Map4d fast_fit(pfast_fit+local_start); - Rfit::Map6xNf hits_ge(phits_ge+local_start); + assert(foundNtuplets->size(helix_start) == nHits); + Rfit::Map3xNd hits(phits + local_start); + Rfit::Map4d fast_fit(pfast_fit + local_start); + Rfit::Map6xNf hits_ge(phits_ge + local_start); // Prepare data structure - auto const * hitId = foundNtuplets->begin(helix_start); + auto const *hitId = foundNtuplets->begin(helix_start); for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); @@ -72,104 +70,92 @@ void kernelFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets, // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); - hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]; + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } - Rfit::Fast_fit(hits,fast_fit); + Rfit::Fast_fit(hits, fast_fit); // no NaN here.... - assert(fast_fit(0)==fast_fit(0)); - assert(fast_fit(1)==fast_fit(1)); - assert(fast_fit(2)==fast_fit(2)); - assert(fast_fit(3)==fast_fit(3)); - + assert(fast_fit(0) == fast_fit(0)); + assert(fast_fit(1) == fast_fit(1)); + assert(fast_fit(2) == fast_fit(2)); + assert(fast_fit(3) == fast_fit(3)); } -template -__global__ -void kernelCircleFit( - CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, - uint32_t nHits, - double B, - double * __restrict__ phits, - float * __restrict__ phits_ge, - double * __restrict__ pfast_fit_input, - Rfit::circle_fit *circle_fit, - uint32_t offset) -{ - - assert(circle_fit); - assert(N<=nHits); +template +__global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double B, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + Rfit::circle_fit *circle_fit, + uint32_t offset) { + assert(circle_fit); + assert(N <= nHits); // same as above... 
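  // Layout note (an inference from the Map3xNd / Map6xNf definitions in
  // HelixFitOnGPU.h, not spelled out in this file): the scratch buffers are shared by
  // all maxNumberOfConcurrentFits() fits in flight, with an inner stride of stride()
  // and an outer stride of rows*stride(), so element (r, c) of the fit in slot
  // local_start lives at p[local_start + r*stride() + c*rows*stride()].  Consecutive
  // threads therefore touch consecutive addresses, which presumably is what keeps the
  // global-memory accesses coalesced.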
// look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) return; + if (tuple_start >= tupleMultiplicity->size(nHits)) + return; // get it for the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits)+tuple_start); - + auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); - Rfit::Map3xNd hits(phits+local_start); - Rfit::Map4d fast_fit(pfast_fit_input+local_start); - Rfit::Map6xNf hits_ge(phits_ge+local_start); + Rfit::Map3xNd hits(phits + local_start); + Rfit::Map4d fast_fit(pfast_fit_input + local_start); + Rfit::Map6xNf hits_ge(phits_ge + local_start); Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); - Rfit::loadCovariance2D(hits_ge,hits_cov); + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); + Rfit::loadCovariance2D(hits_ge, hits_cov); - circle_fit[local_start] = - Rfit::Circle_fit(hits.block(0, 0, 2, N), - hits_cov, - fast_fit, rad, B, true); + circle_fit[local_start] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); #ifdef RIEMANN_DEBUG -// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", helix_start, +// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", helix_start, // circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); #endif } - -template -__global__ -void kernelLineFit( - CAConstants::TupleMultiplicity const * __restrict__ tupleMultiplicity, - uint32_t nHits, - double B, - Rfit::helix_fit *results, - double * __restrict__ phits, - float * __restrict__ phits_ge, - double * __restrict__ pfast_fit_input, - Rfit::circle_fit * __restrict__ circle_fit, - uint32_t offset) -{ - - assert(results); assert(circle_fit); - assert(N<=nHits); +template +__global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double B, + Rfit::helix_fit *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + Rfit::circle_fit *__restrict__ circle_fit, + uint32_t offset) { + assert(results); + assert(circle_fit); + assert(N <= nHits); // same as above... 
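  // In outline (following the code below): the line fit reuses the circle fit that
  // kernelCircleFit stored in the same local_start slot, par_uvrtopak() converts the
  // circle parameters, and the combined circle + line parameters and the two chi2
  // values are then written to results[helix_start].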
// look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) return; + if (tuple_start >= tupleMultiplicity->size(nHits)) + return; // get it for the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits)+tuple_start); + auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); + Rfit::Map3xNd hits(phits + local_start); + Rfit::Map4d fast_fit(pfast_fit_input + local_start); + Rfit::Map6xNf hits_ge(phits_ge + local_start); - Rfit::Map3xNd hits(phits+local_start); - Rfit::Map4d fast_fit(pfast_fit_input+local_start); - Rfit::Map6xNf hits_ge(phits_ge+local_start); - - auto const & line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_start], fast_fit, B, true); + auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_start], fast_fit, B, true); par_uvrtopak(circle_fit[local_start], B, true); // Grab helix_fit from the proper location in the output vector - auto & helix = results[helix_start]; + auto &helix = results[helix_start]; helix.par << circle_fit[local_start].par, line_fit.par; // TODO: pass properly error booleans @@ -183,107 +169,175 @@ void kernelLineFit( helix.chi2_line = line_fit.chi2; #ifdef RIEMANN_DEBUG - printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N,nHits, helix_start, - circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); - printf("kernelLineFit line.par(0,1): %d %f,%f\n", helix_start, line_fit.par(0),line_fit.par(1)); - printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n",helix.chi2_circle,helix.chi2_line, - helix.cov(0,0),helix.cov(1,1),helix.cov(2,2),helix.cov(3,3),helix.cov(4,4)); + printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + helix_start, + circle_fit[local_start].par(0), + circle_fit[local_start].par(1), + circle_fit[local_start].par(2)); + printf("kernelLineFit line.par(0,1): %d %f,%f\n", helix_start, line_fit.par(0), line_fit.par(1)); + printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", + helix.chi2_circle, + helix.chi2_line, + helix.cov(0, 0), + helix.cov(1, 1), + helix.cov(2, 2), + helix.cov(3, 3), + helix.cov(4, 4)); #endif } - -void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const & hh, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream) -{ - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - edm::Service cs; - auto hitsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream); - auto hits_geGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream); - auto fast_fit_resultsGPU_ = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream); - auto circle_fit_resultsGPU_holder = cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit),stream); - Rfit::circle_fit * circle_fit_resultsGPU_ = (Rfit::circle_fit*)(circle_fit_resultsGPU_holder.get()); - - for (uint32_t offset=0; offset<<>>( - tuples_d, tupleMultiplicity_d, 3, - hh.view(), - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); +void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cuda::stream_t<> &stream) { + assert(tuples_d); + + 
auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + edm::Service cs; + auto hitsGPU_ = cs->make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = + cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = + cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto circle_fit_resultsGPU_holder = + cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); + Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets + kernelFastFit<3><<>>(tuples_d, + tupleMultiplicity_d, + 3, + hh.view(), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFit<3><<>>(tupleMultiplicity_d, + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernelLineFit<3><<>>(tupleMultiplicity_d, + 3, + bField_, + helix_fit_results_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + // quads + kernelFastFit<4><<>>(tuples_d, + tupleMultiplicity_d, + 4, + hh.view(), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + offset); + cudaCheck(cudaGetLastError()); + + kernelCircleFit<4><<>>(tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernelLineFit<4><<>>(tupleMultiplicity_d, + 4, + bField_, + helix_fit_results_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // penta + kernelFastFit<4><<>>(tuples_d, + tupleMultiplicity_d, + 5, + hh.view(), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<3><<>>( - tupleMultiplicity_d, 3, bField_, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); + kernelCircleFit<4><<>>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<3><<>>( - tupleMultiplicity_d, 3, bField_, helix_fit_results_d, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, - offset); + kernelLineFit<4><<>>(tupleMultiplicity_d, + 5, + bField_, + helix_fit_results_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - - // quads - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 4, - hh.view(), - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); + } else { + // penta all 5 + kernelFastFit<5><<>>(tuples_d, + tupleMultiplicity_d, + 5, + hh.view(), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>( - tupleMultiplicity_d, 4, bField_, - hitsGPU_.get(), 
hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); + kernelCircleFit<5><<>>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>( - tupleMultiplicity_d, 4, bField_, helix_fit_results_d, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, - offset); + kernelLineFit<5><<>>(tupleMultiplicity_d, + 5, + bField_, + helix_fit_results_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - - if (fit5as4_) { - // penta - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 5, - hh.view(), - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>( - tupleMultiplicity_d, 5, bField_, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<4><<>>( - tupleMultiplicity_d, 5, bField_, helix_fit_results_d, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - } else { - // penta all 5 - kernelFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, 5, - hh.view(), - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<5><<>>( - tupleMultiplicity_d, 5, bField_, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<5><<>>( - tupleMultiplicity_d, 5, bField_, helix_fit_results_d, - hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - } - } - + } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index e5f8406bd31b8..98722f4e5cf42 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -7,84 +7,84 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" - #include "DataFormats/Math/interface/approx_atan2.h" #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "GPUCACell.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" namespace gpuPixelDoublets { -// __device__ -// __forceinline__ - __global__ - void fishbone( - GPUCACell::Hits const * __restrict__ hhp, - GPUCACell * cells, uint32_t const * __restrict__ nCells, - GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell, - uint32_t nHits, - bool checkTrack) { - + // __device__ + // __forceinline__ + __global__ void fishbone(GPUCACell::Hits const* __restrict__ hhp, + GPUCACell* cells, + uint32_t const* __restrict__ nCells, + GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell, + uint32_t nHits, + bool checkTrack) { constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; - - auto const & hh = *hhp; - auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id);}; + auto const& hh = *hhp; + auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id); }; // x run faster... 
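    // What follows, read as a whole: the y thread index selects an outer hit and the
    // x index appears to stride over the cells (doublets) ending on that hit; every
    // pair of such cells on different detectors is tested for near-collinearity
    // (cos12*cos12 >= 0.99999f * n[ic]*n[jc], with n[] holding what appear to be the
    // squared doublet lengths), and the farther of two aligned doublets is dropped by
    // setting its theDoubletId to -1: the "fishbone" duplicate removal.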
auto idy = threadIdx.y + blockIdx.y * blockDim.y; auto first = threadIdx.x; - if (idy>=nHits) return; - auto const & vc = isOuterHitOfCell[idy]; + if (idy >= nHits) + return; + auto const& vc = isOuterHitOfCell[idy]; auto s = vc.size(); - if (s<2) return; + if (s < 2) + return; // if alligned kill one of the two. - auto const & c0 = cells[vc[0]]; + auto const& c0 = cells[vc[0]]; auto xo = c0.get_outer_x(hh); auto yo = c0.get_outer_y(hh); auto zo = c0.get_outer_z(hh); - float x[maxCellsPerHit], y[maxCellsPerHit],z[maxCellsPerHit], n[maxCellsPerHit]; - uint16_t d[maxCellsPerHit]; // uint8_t l[maxCellsPerHit]; + float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit]; + uint16_t d[maxCellsPerHit]; // uint8_t l[maxCellsPerHit]; uint32_t cc[maxCellsPerHit]; - auto sg=0; - for (uint32_t ic=0; ic= 0.99999f*n[ic]*n[jc]) { - // alligned: kill farthest (prefer consecutive layers) - if (n[ic]>n[jc]) { - ci.theDoubletId=-1; - break; - } else { - cj.theDoubletId=-1; - } + auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc]; + if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) { + // alligned: kill farthest (prefer consecutive layers) + if (n[ic] > n[jc]) { + ci.theDoubletId = -1; + break; + } else { + cj.theDoubletId = -1; + } } - } //cj - } // ci + } //cj + } // ci } -} +} // namespace gpuPixelDoublets -#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index b4c563e601008..6357c8a49e03a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -7,89 +7,111 @@ namespace gpuPixelDoublets { - using namespace gpuPixelDoubletsAlgos; - - constexpr int nPairs = 13; - CONSTANT_VAR const uint8_t layerPairs[2*nPairs] = { - 0, 1, 1, 2, 2, 3, + using namespace gpuPixelDoubletsAlgos; + + constexpr int nPairs = 13; + CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = { + 0, + 1, + 1, + 2, + 2, + 3, // 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, - 0, 7, 1, 7, 2, 7, 7, 8, 8, 9, // neg - 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, // pos - }; - - constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); - constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); - constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); - - CONSTANT_VAR const int16_t phicuts[nPairs] { - phi0p05, phi0p05, phi0p06, - phi0p07, phi0p06, phi0p06, phi0p05, phi0p05, - phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 - }; - - CONSTANT_VAR float const minz[nPairs] = { - -20., -22., -22., - -30., -30.,-30., -70., -70., - 0., 10., 15., -70., -70. - }; - - CONSTANT_VAR float const maxz[nPairs] = { - 20., 22., 22., - 0., -10., -15., 70., 70., - 30., 30., 30., 70., 70. - }; - - CONSTANT_VAR float const maxr[nPairs] = { - 20., 20., 20., - 9., 7., 6., 5., 5., - 9., 7., 6., 5., 5. - }; - + 0, + 7, + 1, + 7, + 2, + 7, + 7, + 8, + 8, + 9, // neg + 0, + 4, + 1, + 4, + 2, + 4, + 4, + 5, + 5, + 6, // pos + }; + + constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); + constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); + constexpr int16_t phi0p07 = 730; // round(730.12648...) 
= phi2short(0.07); + + CONSTANT_VAR const int16_t phicuts[nPairs]{phi0p05, + phi0p05, + phi0p06, + phi0p07, + phi0p06, + phi0p06, + phi0p05, + phi0p05, + phi0p07, + phi0p06, + phi0p06, + phi0p05, + phi0p05}; + + CONSTANT_VAR float const minz[nPairs] = {-20., -22., -22., -30., -30., -30., -70., -70., 0., 10., 15., -70., -70.}; + + CONSTANT_VAR float const maxz[nPairs] = {20., 22., 22., 0., -10., -15., 70., 70., 30., 30., 30., 70., 70.}; + + CONSTANT_VAR float const maxr[nPairs] = {20., 20., 20., 9., 7., 6., 5., 5., 9., 7., 6., 5., 5.}; constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant constexpr uint32_t MaxNumOfActiveDoublets = CAConstants::maxNumOfActiveDoublets(); - using CellNeighbors = CAConstants::CellNeighbors; using CellTracks = CAConstants::CellTracks; using CellNeighborsVector = CAConstants::CellNeighborsVector; using CellTracksVector = CAConstants::CellTracksVector; - __global__ - void initDoublets(GPUCACell::OuterHitOfCell * isOuterHitOfCell, int nHits, - CellNeighborsVector * cellNeighbors, CellNeighbors * cellNeighborsContainer, - CellTracksVector * cellTracks, CellTracks * cellTracksContainer - ) - { - int first = blockIdx.x * blockDim.x + threadIdx.x; - for (int i=first; i #include +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" -#include "GPUCACell.h" #include "CAConstants.h" - +#include "GPUCACell.h" // useful for benchmark // #define ONLY_PHICUT @@ -27,43 +26,39 @@ namespace gpuPixelDoubletsAlgos { constexpr uint32_t MaxNumOfActiveDoublets = CAConstants::maxNumOfActiveDoublets(); - using CellNeighbors = CAConstants::CellNeighbors; using CellTracks = CAConstants::CellTracks; using CellNeighborsVector = CAConstants::CellNeighborsVector; using CellTracksVector = CAConstants::CellTracksVector; - __device__ - __forceinline__ - void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, - uint32_t nPairs, - GPUCACell * cells, - uint32_t * nCells, - CellNeighborsVector * cellNeighbors, CellTracksVector * cellTracks, - TrackingRecHit2DSOAView const & __restrict__ hh, - GPUCACell::OuterHitOfCell * isOuterHitOfCell, - int16_t const * __restrict__ phicuts, - float const * __restrict__ minz, - float const * __restrict__ maxz, - float const * __restrict__ maxr, - bool ideal_cond) - { - -#ifndef NO_CLSCUT + __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs, + uint32_t nPairs, + GPUCACell* cells, + uint32_t* nCells, + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + TrackingRecHit2DSOAView const& __restrict__ hh, + GPUCACell::OuterHitOfCell* isOuterHitOfCell, + int16_t const* __restrict__ phicuts, + float const* __restrict__ minz, + float const* __restrict__ maxz, + float const* __restrict__ maxr, + bool ideal_cond) { +#ifndef NO_CLSCUT // ysize cuts (z in the barrel) times 8 - constexpr int minYsizeB1=36; - constexpr int minYsizeB2=28; - constexpr int maxDYsize12=28; - constexpr int maxDYsize=20; + constexpr int minYsizeB1 = 36; + constexpr int minYsizeB2 = 28; + constexpr int maxDYsize12 = 28; + constexpr int maxDYsize = 20; #endif using Hist = TrackingRecHit2DSOAView::Hist; - auto const & __restrict__ hist = hh.phiBinner(); - uint32_t const * __restrict__ offsets = hh.hitsLayerStart(); + 
auto const& __restrict__ hist = hh.phiBinner(); + uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); assert(offsets); - auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; + auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; }; // nPairsMax to be optimized later (originally was 64). // If it should be much bigger, consider using a block-wide parallel prefix scan, @@ -72,12 +67,12 @@ namespace gpuPixelDoubletsAlgos { assert(nPairs <= nPairsMax); __shared__ uint32_t innerLayerCumulativeSize[nPairsMax]; __shared__ uint32_t ntot; - if (threadIdx.y==0 && threadIdx.x==0) { + if (threadIdx.y == 0 && threadIdx.x == 0) { innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); for (uint32_t i = 1; i < nPairs; ++i) { - innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]); + innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(layerPairs[2 * i]); } - ntot = innerLayerCumulativeSize[nPairs-1]; + ntot = innerLayerCumulativeSize[nPairs - 1]; } __syncthreads(); @@ -85,35 +80,36 @@ namespace gpuPixelDoubletsAlgos { auto idy = blockIdx.y * blockDim.y + threadIdx.y; auto first = threadIdx.x; auto stride = blockDim.x; - for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) { - - uint32_t pairLayerId=0; - while (j >= innerLayerCumulativeSize[pairLayerId++]); - --pairLayerId; // move to lower_bound ?? + for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y) { + uint32_t pairLayerId = 0; + while (j >= innerLayerCumulativeSize[pairLayerId++]) + ; + --pairLayerId; // move to lower_bound ?? assert(pairLayerId < nPairs); assert(j < innerLayerCumulativeSize[pairLayerId]); - assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId-1]); + assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]); - uint8_t inner = layerPairs[2*pairLayerId]; - uint8_t outer = layerPairs[2*pairLayerId+1]; + uint8_t inner = layerPairs[2 * pairLayerId]; + uint8_t outer = layerPairs[2 * pairLayerId + 1]; assert(outer > inner); auto hoff = Hist::histOff(outer); - auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; + auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1]; i += offsets[inner]; // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); assert(i >= offsets[inner]); - assert(i < offsets[inner+1]); + assert(i < offsets[inner + 1]); // found hit corresponding to our cuda thread, now do the job auto mez = hh.zGlobal(i); #ifndef NO_ZCUT - if (mezmaxz[pairLayerId]) continue; + if (mez < minz[pairLayerId] || mez > maxz[pairLayerId]) + continue; #endif #ifndef NO_CLSCUT @@ -121,57 +117,61 @@ namespace gpuPixelDoubletsAlgos { // if ideal treat inner ladder as outer auto mi = hh.detectorIndex(i); - if (inner==0) assert(mi<96); - const bool isOuterLadder = ideal_cond ? true : 0 == (mi/8)%2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... 
- - - if (inner==0 && outer>3 && isOuterLadder) // B1 and F1 - if (mes>0 && mes3) // B2 and F1 - if (mes>0 && mes 3 && isOuterLadder) // B1 and F1 + if (mes > 0 && mes < minYsizeB1) + continue; // only long cluster (5*8) + if (inner == 1 && outer > 3) // B2 and F1 + if (mes > 0 && mes < minYsizeB2) + continue; +#endif // NO_CLSCUT auto mep = hh.iphi(i); auto mer = hh.rGlobal(i); - - constexpr float z0cut = 12.f; // cm - constexpr float hardPtCut = 0.5f; // GeV - constexpr float minRadius = hardPtCut * 87.78f; // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) - constexpr float minRadius2T4 = 4.f*minRadius*minRadius; + + constexpr float z0cut = 12.f; // cm + constexpr float hardPtCut = 0.5f; // GeV + constexpr float minRadius = + hardPtCut * 87.78f; // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) + constexpr float minRadius2T4 = 4.f * minRadius * minRadius; auto ptcut = [&](int j) { auto r2t4 = minRadius2T4; auto ri = mer; auto ro = hh.rGlobal(j); - auto dphi = short2phi( min( abs(int16_t(mep-hh.iphi(j))), abs(int16_t(hh.iphi(j)-mep)) ) ); - return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri); + auto dphi = short2phi(min(abs(int16_t(mep - hh.iphi(j))), abs(int16_t(hh.iphi(j) - mep)))); + return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zGlobal(j);; + auto zo = hh.zGlobal(j); + ; auto ro = hh.rGlobal(j); - auto dr = ro-mer; - return dr > maxr[pairLayerId] || - dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; + auto dr = ro - mer; + return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; }; #ifndef NO_CLSCUT auto zsizeCut = [&](int j) { - auto onlyBarrel = outer<4; + auto onlyBarrel = outer < 4; auto so = hh.clusterSizeY(j); - auto dy = inner==0 ? ( isOuterLadder ? maxDYsize12: 100 ) : maxDYsize; - return onlyBarrel && mes>0 && so>0 && std::abs(so-mes)>dy; + auto dy = inner == 0 ? (isOuterLadder ? 
maxDYsize12 : 100) : maxDYsize; + return onlyBarrel && mes > 0 && so > 0 && std::abs(so - mes) > dy; }; #endif auto iphicut = phicuts[pairLayerId]; - auto kl = Hist::bin(int16_t(mep-iphicut)); - auto kh = Hist::bin(int16_t(mep+iphicut)); - auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; + auto kl = Hist::bin(int16_t(mep - iphicut)); + auto kh = Hist::bin(int16_t(mep + iphicut)); + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; #ifdef GPU_DEBUG - int tot = 0; - int nmin = 0; - int tooMany=0; + int tot = 0; + int nmin = 0; + int tooMany = 0; #endif auto khh = kh; @@ -179,31 +179,37 @@ namespace gpuPixelDoubletsAlgos { for (auto kk = kl; kk != khh; incr(kk)) { #ifdef GPU_DEBUG if (kk != kl && kk != kh) - nmin += hist.size(kk+hoff); + nmin += hist.size(kk + hoff); #endif - auto const * __restrict__ p = hist.begin(kk+hoff); - auto const * __restrict__ e = hist.end(kk+hoff); - p+=first; - for (;p < e; p+=stride) { - auto oi=__ldg(p); - assert(oi>=offsets[outer]); - assert(oi iphicut) + auto const* __restrict__ p = hist.begin(kk + hoff); + auto const* __restrict__ e = hist.end(kk + hoff); + p += first; + for (; p < e; p += stride) { + auto oi = __ldg(p); + assert(oi >= offsets[outer]); + assert(oi < offsets[outer + 1]); + + if (std::min(std::abs(int16_t(hh.iphi(oi) - mep)), std::abs(int16_t(mep - hh.iphi(oi)))) > iphicut) continue; #ifndef ONLY_PHICUT #ifndef NO_CLSCUT - if (zsizeCut(oi)) continue; + if (zsizeCut(oi)) + continue; #endif - if (z0cutoff(oi) || ptcut(oi)) continue; + if (z0cutoff(oi) || ptcut(oi)) + continue; #endif - auto ind = atomicAdd(nCells, 1); - if (ind>=MaxNumOfDoublets) {atomicSub(nCells, 1); break; } // move to SimpleVector?? + auto ind = atomicAdd(nCells, 1); + if (ind >= MaxNumOfDoublets) { + atomicSub(nCells, 1); + break; + } // move to SimpleVector?? // int layerPairId, int doubletId, int innerHitId, int outerHitId) cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, ind, i, oi); isOuterHitOfCell[oi].push_back(ind); #ifdef GPU_DEBUG - if (isOuterHitOfCell[oi].full()) ++tooMany; + if (isOuterHitOfCell[oi].full()) + ++tooMany; ++tot; #endif } @@ -215,6 +221,6 @@ namespace gpuPixelDoubletsAlgos { } // loop in block... 
} -} // namespace end +} // namespace gpuPixelDoubletsAlgos -#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h From 47dc7f712463905110a0075abc8aca7f29c5a51e Mon Sep 17 00:00:00 2001 From: Marco Rovere Date: Wed, 8 May 2019 16:02:49 +0200 Subject: [PATCH 048/102] Make GPU CellularAutomaton configurable (cms-patatrack#347) --- .../PixelTriplets/plugins/GPUCACell.h | 51 ++++++++++++++----- .../PixelTriplets/plugins/gpuFishbone.h | 2 +- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 91f2f903295ef..d9dd7aada290b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -86,8 +86,8 @@ class GPUCACell { __device__ __forceinline__ auto get_inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } __device__ __forceinline__ auto get_outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } - __device__ __forceinline__ float get_inner_detId(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } - __device__ __forceinline__ float get_outer_detId(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } + __device__ __forceinline__ float get_inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } + __device__ __forceinline__ float get_outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } constexpr unsigned int get_outer_hit_id() const { return theOuterHitId; } @@ -105,7 +105,16 @@ class GPUCACell { __device__ bool check_alignment(Hits const& hh, GPUCACell const& otherCell, const float ptmin, - const float hardCurvCut) const { + const float hardCurvCut, + const float CAThetaCutBarrel, + const float CAThetaCutForward, + const float dcaCutInnerTriplet, + const float dcaCutOuterTriplet) const { + // detIndex of the layerStart for the Phase1 Pixel Detector: + // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] + // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; auto ri = get_inner_r(hh); auto zi = get_inner_z(hh); @@ -114,12 +123,21 @@ class GPUCACell { auto r1 = otherCell.get_inner_r(hh); auto z1 = otherCell.get_inner_z(hh); - auto isBarrel = otherCell.get_outer_detId(hh) < 1184; - bool aligned = - areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, isBarrel ? 0.002f : 0.003f); // 2.f*thetaCut); // FIXME tune cuts + auto isBarrel = otherCell.get_outer_detIndex(hh) < last_barrel_detIndex; + bool aligned = areAlignedRZ(r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts return (aligned && - dcaCut(hh, otherCell, otherCell.get_inner_detId(hh) < 96 ? 0.15f : 0.25f, hardCurvCut)); // FIXME tune cuts - // region_origin_radius_plus_tolerance, hardCurvCut)); + dcaCut(hh, + otherCell, + otherCell.get_inner_detIndex(hh) < last_bpix1_detIndex ? 
dcaCutInnerTriplet : dcaCutOuterTriplet, + hardCurvCut)); // FIXME tune cuts + // region_origin_radius_plus_tolerance, hardCurvCut)); } __device__ __forceinline__ static bool areAlignedRZ( @@ -156,20 +174,27 @@ class GPUCACell { } __device__ inline bool hole(Hits const& hh, GPUCACell const& innerCell) const { + constexpr uint32_t max_ladder_bpx4 = 64; + constexpr float radius_even_ladder = 15.815f; + constexpr float radius_odd_ladder = 16.146f; + constexpr float ladder_length = 6.7f; + constexpr float ladder_tolerance = 0.2f; + constexpr float barrel_z_length = 26.f; + constexpr float forward_z_begin = 32.f; int p = get_outer_iphi(hh); if (p < 0) p += std::numeric_limits::max(); - p = (64 * p) / std::numeric_limits::max(); + p = (max_ladder_bpx4 * p) / std::numeric_limits::max(); p %= 2; - float r4 = p == 0 ? 15.815 : 16.146; // later on from geom + float r4 = p == 0 ? radius_even_ladder : radius_odd_ladder; // later on from geom auto ri = innerCell.get_inner_r(hh); auto zi = innerCell.get_inner_z(hh); auto ro = get_outer_r(hh); auto zo = get_outer_z(hh); auto z4 = std::abs(zi + (r4 - ri) * (zo - zi) / (ro - ri)); - auto zm = z4 - 6.7 * int(z4 / 6.7); - auto h = zm < 0.2 || zm > 6.5; - return h || (z4 > 26 && z4 < 32.f); + auto z_in_ladder = z4 - ladder_length * int(z4 / ladder_length); + auto h = z_in_ladder < ladder_tolerance || z_in_ladder > (ladder_length - ladder_tolerance); + return h || (z4 > barrel_z_length && z4 < forward_z_begin); } // trying to free the track building process from hardcoded layers, leaving diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 98722f4e5cf42..1b6700ece509c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -53,7 +53,7 @@ namespace gpuPixelDoublets { if (checkTrack && ci.tracks().empty()) continue; cc[sg] = vc[ic]; - d[sg] = ci.get_inner_detId(hh); + d[sg] = ci.get_inner_detIndex(hh); // l[sg] = layer(d[sg]); x[sg] = ci.get_inner_x(hh) - xo; y[sg] = ci.get_inner_y(hh) - yo; From e5372ab1fb56a8d881cc42697df7bf2175e815e5 Mon Sep 17 00:00:00 2001 From: Felice Pantaleo Date: Fri, 10 May 2019 12:03:09 +0200 Subject: [PATCH 049/102] Make doClusterCut, doZCut, doPhiCut configurable (cms-patatrack#347) --- .../PixelTriplets/plugins/gpuPixelDoublets.h | 10 ++- .../plugins/gpuPixelDoubletsAlgos.h | 65 ++++++++----------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 6357c8a49e03a..1c039a1fe6e28 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -95,7 +95,10 @@ namespace gpuPixelDoublets { CellTracksVector* cellTracks, TrackingRecHit2DSOAView const* __restrict__ hhp, GPUCACell::OuterHitOfCell* isOuterHitOfCell, - bool ideal_cond) { + bool ideal_cond, + bool doClusterCut, + bool doZCut, + bool doPhiCut) { auto const& __restrict__ hh = *hhp; doubletsFromHisto(layerPairs, nPairs, @@ -109,7 +112,10 @@ namespace gpuPixelDoublets { minz, maxz, maxr, - ideal_cond); + ideal_cond, + doClusterCut, + doZCut, + doPhiCut); } } // namespace gpuPixelDoublets diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 29e5aca73bd38..52ff91b54867b 100644 --- 
a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -15,11 +15,6 @@ #include "CAConstants.h" #include "GPUCACell.h" -// useful for benchmark -// #define ONLY_PHICUT -// #define NO_ZCUT -// #define NO_CLSCUT - namespace gpuPixelDoubletsAlgos { constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant @@ -43,14 +38,18 @@ namespace gpuPixelDoubletsAlgos { float const* __restrict__ minz, float const* __restrict__ maxz, float const* __restrict__ maxr, - bool ideal_cond) { -#ifndef NO_CLSCUT + bool ideal_cond, + bool doClusterCut, + bool doZCut, + bool doPhiCut) { // ysize cuts (z in the barrel) times 8 + // these are used if doClusterCut is true constexpr int minYsizeB1 = 36; constexpr int minYsizeB2 = 28; constexpr int maxDYsize12 = 28; constexpr int maxDYsize = 20; -#endif + int16_t mes; + bool isOuterLadder = ideal_cond; using Hist = TrackingRecHit2DSOAView::Hist; @@ -107,29 +106,25 @@ namespace gpuPixelDoubletsAlgos { // found hit corresponding to our cuda thread, now do the job auto mez = hh.zGlobal(i); -#ifndef NO_ZCUT - if (mez < minz[pairLayerId] || mez > maxz[pairLayerId]) + if (doZCut && (mez < minz[pairLayerId] || mez > maxz[pairLayerId])) continue; -#endif -#ifndef NO_CLSCUT - auto mes = hh.clusterSizeY(i); + if (doClusterCut) { + auto mes = hh.clusterSizeY(i); - // if ideal treat inner ladder as outer - auto mi = hh.detectorIndex(i); - if (inner == 0) - assert(mi < 96); - const bool isOuterLadder = - ideal_cond ? true : 0 == (mi / 8) % 2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... - - if (inner == 0 && outer > 3 && isOuterLadder) // B1 and F1 - if (mes > 0 && mes < minYsizeB1) - continue; // only long cluster (5*8) - if (inner == 1 && outer > 3) // B2 and F1 - if (mes > 0 && mes < minYsizeB2) - continue; -#endif // NO_CLSCUT + // if ideal treat inner ladder as outer + auto mi = hh.detectorIndex(i); + if (inner == 0) + assert(mi < 96); + isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... + if (inner == 0 && outer > 3 && isOuterLadder) // B1 and F1 + if (mes > 0 && mes < minYsizeB1) + continue; // only long cluster (5*8) + if (inner == 1 && outer > 3) // B2 and F1 + if (mes > 0 && mes < minYsizeB2) + continue; + } auto mep = hh.iphi(i); auto mer = hh.rGlobal(i); @@ -153,14 +148,12 @@ namespace gpuPixelDoubletsAlgos { return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; }; -#ifndef NO_CLSCUT auto zsizeCut = [&](int j) { auto onlyBarrel = outer < 4; auto so = hh.clusterSizeY(j); auto dy = inner == 0 ? (isOuterLadder ? 
maxDYsize12 : 100) : maxDYsize; return onlyBarrel && mes > 0 && so > 0 && std::abs(so - mes) > dy; }; -#endif auto iphicut = phicuts[pairLayerId]; @@ -191,14 +184,12 @@ namespace gpuPixelDoubletsAlgos { if (std::min(std::abs(int16_t(hh.iphi(oi) - mep)), std::abs(int16_t(mep - hh.iphi(oi)))) > iphicut) continue; -#ifndef ONLY_PHICUT -#ifndef NO_CLSCUT - if (zsizeCut(oi)) - continue; -#endif - if (z0cutoff(oi) || ptcut(oi)) - continue; -#endif + if (doPhiCut) { + if (doClusterCut && zsizeCut(oi)) + continue; + if (z0cutoff(oi) || ptcut(oi)) + continue; + } auto ind = atomicAdd(nCells, 1); if (ind >= MaxNumOfDoublets) { atomicSub(nCells, 1); From daae2fa40b0567fc08df7379effc67d184494d9d Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 15 May 2019 14:11:42 +0200 Subject: [PATCH 050/102] Synchronise with CMSSW_10_6_0 --- .../PixelTriplets/plugins/BuildFile.xml | 3 +++ Validation/RecoTrack/python/plotting/html.py | 6 ++++++ .../RecoTrack/python/plotting/trackingPlots.py | 15 ++++++++------- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index d4140692181bf..d0d65227bc79f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,6 +1,7 @@ + @@ -11,6 +12,8 @@ + + diff --git a/Validation/RecoTrack/python/plotting/html.py b/Validation/RecoTrack/python/plotting/html.py index ac80f1f936448..b884c4682c4d5 100644 --- a/Validation/RecoTrack/python/plotting/html.py +++ b/Validation/RecoTrack/python/plotting/html.py @@ -448,6 +448,12 @@ def _formatPlotSets(self): ' ', ]) + if len(fileTable): + first_row = fileTable[0] + self._content.extend([ + ' Browse Folder' % (first_row[1][0:first_row[1].rfind('/')]) + ]) + def _appendColumnHeader(self, header): leg = "" if header in self._columnHeadersIndex: diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index dee8c75ac5172..2250b736bc00a 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from builtins import range import os import copy import collections @@ -1081,7 +1082,7 @@ def draw(self, legendLabels, prefix=None, directory="", *args, **kwargs): legendLabels = legendLabels[:] if max(map(len, legendLabels)) > 20: haveShortLabels = True - labels_short = [str(chr(ord('A')+i)) for i in xrange(len(legendLabels))] + labels_short = [str(chr(ord('A')+i)) for i in range(len(legendLabels))] for i, ls in enumerate(labels_short): legendLabels[i] = "%s: %s" % (ls, legendLabels[i]) else: @@ -1135,7 +1136,7 @@ def draw(self, legendLabels, prefix=None, directory="", *args, **kwargs): if len(histos_linear) == 0: return [] - data = [ [h.GetBinContent(i) for i in xrange(1, h.GetNbinsX()+1)] for h in histos_linear] + data = [ [h.GetBinContent(i) for i in range(1, h.GetNbinsX()+1)] for h in histos_linear] table = html.Table(["dummy"]*len(histos_linear), xbinlabels, data, None, None, None) data = table.tableAsRowColumn() @@ -1652,7 +1653,7 @@ def _create(self, tdirectory): ret = timeTh1.Clone(self._name) xaxis = ret.GetXaxis() - for i in xrange(1, ret.GetNbinsX()+1): + for i in range(1, ret.GetNbinsX()+1): ret.SetBinContent(i, ret.GetBinContent(i)/nevents) ret.SetBinError(i, ret.GetBinError(i)/nevents) xaxis.SetBinLabel(i, xaxis.GetBinLabel(i).replace(" (unscheduled)", "")) 
@@ -1704,12 +1705,12 @@ def create(self, tdirectory): if h_reco_per_iter is None: return None values = {} - for i in xrange(1, h_reco_per_iter.GetNbinsX()+1): + for i in range(1, h_reco_per_iter.GetNbinsX()+1): values[h_reco_per_iter.GetXaxis().GetBinLabel(i)] = h_reco_per_iter.GetBinContent(i) result = [] - for i in xrange(1, timeTh1.GetNbinsX()+1): + for i in range(1, timeTh1.GetNbinsX()+1): iterName = timeTh1.GetXaxis().GetBinLabel(i) if iterName in values: ntrk = values[iterName] @@ -1741,10 +1742,10 @@ def _edit(s): # remove "Tracks" from the track producer name to get the iteration name # muonSeeded iterations do not have "Step" in the producer name, so add it here return s.replace("Tracks", "").replace("muonSeeded", "muonSeededStep") - return [_edit(xaxis.GetBinLabel(i)) for i in xrange(1, h.GetNbinsX()+1)] + return [_edit(xaxis.GetBinLabel(i)) for i in range(1, h.GetNbinsX()+1)] def __call__(self, tdirectory, labels): - ret = range(0, len(labels)) + ret = list(range(0, len(labels))) f = tdirectory.GetFile() if not f: return ret From 2b385c3dbe662fbb335309ff56fd69ec886c2070 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 2 Aug 2019 10:56:28 -0400 Subject: [PATCH 051/102] Build seeds directly from cpu product (cms-patatrack#365) --- .../PixelTrackFitting/interface/FitUtils.h | 30 +++++ .../PixelTrackFitting/test/BuildFile.xml | 6 +- .../test/testEigenJacobian.cpp | 119 +++++++++++++----- .../TkSeedGenerator/plugins/BuildFile.xml | 59 ++------- 4 files changed, 134 insertions(+), 80 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h index 7a78cf4cff47c..e92c46f654615 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h @@ -189,6 +189,36 @@ namespace Rfit { circle.par = par_pak; } + // transformation between the "perigee" to cmssw localcoord frame + // the plane of the latter is the perigee plane... 
+ // from //!<(phi,Tip,pt,cotan(theta)),Zip) + // to q/p,dx/dz,dy/dz,x,z + template + __host__ __device__ inline void transformToPerigeePlane(V5 const & ip, M5 const & icov, V5 & op, M5 & ocov, double charge) { + + auto sinTheta2 = 1./(1.+ip(3)*ip(3)); + auto sinTheta = std::sqrt(sinTheta2); + auto cosTheta = ip(3)*sinTheta; + + op(0) = charge*sinTheta/ip(2); + op(1) = 0.; + op(2) = -ip(3); + op(3) = ip(1); + op(4) = -ip(4); + + Matrix5d J = Matrix5d::Zero(); + + J(0,2) = -charge*sinTheta/(ip(2)*ip(2)); + J(0,3) = -charge*sinTheta2*cosTheta/ip(2); + J(1,0) = 1.; + J(2,3) = -1.; + J(3,1) = 1.; + J(4,4) = -1; + + ocov= J*icov*J.transpose(); + + } + } // namespace Rfit #endif // RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index 83ebf7a577711..03c2713760fe6 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -78,7 +78,11 @@ - + + + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp index e01aa30efc656..dc12de88001cd 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp @@ -1,16 +1,34 @@ -#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include using Rfit::Vector5d; using Rfit::Matrix5d; -Vector5d transf(Vector5d p) { - auto sinTheta = 1/std::sqrt(1+p(3)*p(3)); - p(2) = sinTheta/p(2); - return p; +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" + +#include "DataFormats/GeometrySurface/interface/Surface.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" + +#include "DataFormats/GeometrySurface/interface/Plane.h" + +#include "MagneticField/Engine/interface/MagneticField.h" + +namespace { + + struct M5T : public MagneticField { + M5T() : mf(0.,0.,5.){} + virtual GlobalVector inTesla (const GlobalPoint&) const { + return mf; + } + + GlobalVector mf; + }; + } +// old pixeltrack version... Matrix5d transfFast(Matrix5d cov, Vector5d const & p) { auto sqr = [](auto x) { return x*x;}; auto sinTheta = 1/std::sqrt(1+p(3)*p(3)); @@ -27,27 +45,17 @@ Matrix5d transfFast(Matrix5d cov, Vector5d const & p) { } -Matrix5d Jacobian(Vector5d const & p) { - - Matrix5d J = Matrix5d::Identity(); - - auto sinTheta2 = 1/(1+p(3)*p(3)); - auto sinTheta = std::sqrt(sinTheta2); - J(2,2) = -sinTheta/(p(2)*p(2)); - J(2,3) = -sinTheta2*sinTheta*p(3)/p(2); - return J; -} - -Matrix5d transf(Matrix5d const & cov, Matrix5d const& J) { - - return J*cov*J.transpose(); - -} - Matrix5d loadCov(Vector5d const & e) { - Matrix5d cov = Matrix5d::Zero(); + Matrix5d cov; for (int i=0; i<5; ++i) cov(i,i) = e(i)*e(i); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3*std::sqrt( cov(i,i) * cov(j,j) ); // this makes the matrix pos defined + cov(i,j) = (i+j)%2 ? 
-0.4*v : 0.1*v; + cov(j,i) = cov(i,j); + } + } return cov; } @@ -55,38 +63,83 @@ Matrix5d loadCov(Vector5d const & e) { #include int main() { - //!<(phi,Tip,pt,cotan(theta)),Zip) + M5T const mf; + + for (auto charge=-1; charge<2; charge+=2) + for (auto szip=-1; szip<2; szip+=2) + for (auto stip=-1; stip<2; stip+=2) + { Vector5d par0; par0 << 0.2,0.1,3.5,0.8,0.1; Vector5d del0; del0 << 0.01,0.01,0.035,-0.03,-0.01; + //!<(phi,Tip,pt,cotan(theta)),Zip) + par0(1) *= stip; + par0(4) *= szip; + + Matrix5d cov0 = loadCov(del0); - Matrix5d J = Jacobian(par0); + Vector5d par1; + Vector5d par2; + Matrix5d cov1; + Matrix5d cov2; - Vector5d par1 = transf(par0); - Vector5d par2 = transf(par0+del0); - Vector5d del1 = par2-par1; + // Matrix5d covf = transfFast(cov0,par0); + + Rfit::transformToPerigeePlane(par0,cov0,par1,cov1,charge); + + std::cout << "cov1\n" << cov1 << std::endl; + + + LocalTrajectoryParameters lpar(par1(0),par1(1),par1(2),par1(3),par1(4),1.); + AlgebraicSymMatrix55 m; + for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = cov1(i,j); + + float phi = par0(0); + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot( + sp, -cp, 0, + 0, 0, -1.f, + cp, sp, 0); + + Surface::PositionType bs(0., 0., 0.); + Plane plane(bs,rot); + GlobalTrajectoryParameters gp(plane.toGlobal(lpar.position()), plane.toGlobal(lpar.momentum()),lpar.charge(),&mf); + std::cout << "global par " << gp.position() << ' ' << gp.momentum() << ' ' << gp.charge() << std::endl; + JacobianLocalToCurvilinear jl2c(plane,lpar,mf); + std::cout << "jac l2c" << jl2c.jacobian() << std::endl; + + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m); + std::cout << "curv error\n" << mo << std::endl; + + /* + + // not accurate as the perigee plane move as well... 
+ Vector5d del1 = par2-par1; - Matrix5d cov0 = loadCov(del0); - Matrix5d cov1 = transf(cov0,J); - Matrix5d cov2 = transfFast(cov0,par0); // don't ask: guess + std::cout << "charge " << charge << std::endl; std::cout << "par0 " << par0.transpose() << std::endl; std::cout << "del0 " << del0.transpose() << std::endl; std::cout << "par1 " << par1.transpose() << std::endl; std::cout << "del1 " << del1.transpose() << std::endl; - std::cout << "del2 " << (J*del0).transpose() << std::endl; + // std::cout << "del2 " << (J*del0).transpose() << std::endl; std::cout << "del1^2 " << (del1.array()*del1.array()).transpose() << std::endl; std::cout << std::endl; - std::cout << "J\n" << J << std::endl; std::cout << "cov0\n" << cov0 << std::endl; std::cout << "cov1\n" << cov1 << std::endl; std::cout << "cov2\n" << cov2 << std::endl; + */ + + std::cout << std::endl << "----------" << std::endl; + + } // lopp over signs return 0; diff --git a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml index abaa5754e68c7..c10ee14dc3638 100644 --- a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml +++ b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml @@ -1,47 +1,14 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + From 9e6ca10508bd12aac2ccba4750adcc8d02a04f25 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 20 Jun 2019 12:09:14 +0200 Subject: [PATCH 052/102] Implement triplets in the pixel ntuplet producer (cms-patatrack#382) Enable pixel triplets with: process.pixelTracksHitQuadruplets.minHitsPerNtuplet = 3 process.pixelTracksHitQuadruplets.includeJumpingForwardDoublets = True Changes: - adjust for the average pixel geometry and the beam spot position; - allow "jumping doublets" in the forward region (FPIX1-FPIX3) for triplets. 
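
For convenience, the two settings above can be collected in a small customisation
function applied to the process (a sketch only: the function name
customizePixelTracksForTriplets is illustrative and not part of this patch; the
module and parameter names are the ones quoted above):

    def customizePixelTracksForTriplets(process):
        # switch the CA ntuplet producer from quadruplets to triplets
        process.pixelTracksHitQuadruplets.minHitsPerNtuplet = 3
        # allow "jumping doublets" (FPIX1-FPIX3) to recover forward efficiency
        process.pixelTracksHitQuadruplets.includeJumpingForwardDoublets = True
        return process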
--- .../plugins/BrokenLineFitOnGPU.cu | 5 +- .../PixelTriplets/plugins/BuildFile.xml | 1 - .../PixelTriplets/plugins/CAConstants.h | 10 +- .../PixelTriplets/plugins/GPUCACell.h | 125 +++++++++++++----- .../PixelTriplets/plugins/HelixFitOnGPU.h | 3 +- .../PixelTriplets/plugins/gpuFishbone.h | 2 + .../PixelTriplets/plugins/gpuPixelDoublets.h | 74 ++++------- .../plugins/gpuPixelDoubletsAlgos.h | 13 +- 8 files changed, 137 insertions(+), 96 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 12d1707fdd388..cb8f151233385 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -38,13 +38,16 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found assert(pfast_fit); assert(foundNtuplets); + assert(tupleMultiplicity); // look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); #ifdef BROKENLINE_DEBUG - if (0 == local_start) + if (0 == local_start) { + printf("%d total Ntuple\n",foundNtuplets->nbins()); printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); + } #endif auto tuple_start = local_start + offset; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index d0d65227bc79f..341a108348337 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -12,7 +12,6 @@ - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index aa9b4e188deb5..06d5fdf7dd898 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -18,24 +18,24 @@ namespace CAConstants { #ifdef GPU_SMALL_EVENTS constexpr uint32_t maxNumberOfTuples() { return 3 * 1024; } #else - constexpr uint32_t maxNumberOfTuples() { return 6 * 1024; } + constexpr uint32_t maxNumberOfTuples() { return 24 * 1024; } #endif constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT #ifndef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfDoublets() { return 262144; } + constexpr uint32_t maxNumberOfDoublets() { return 448*1024; } constexpr uint32_t maxCellsPerHit() { return 128; } #else - constexpr uint32_t maxNumberOfDoublets() { return 262144 / 2; } + constexpr uint32_t maxNumberOfDoublets() { return 128*1024; } constexpr uint32_t maxCellsPerHit() { return 128 / 2; } #endif #else - constexpr uint32_t maxNumberOfDoublets() { return 6 * 262144; } + constexpr uint32_t maxNumberOfDoublets() { return 448*1024; } constexpr uint32_t maxCellsPerHit() { return 4 * 128; } #endif constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } - constexpr uint32_t maxNumberOfLayerPairs() { return 13; } + constexpr uint32_t maxNumberOfLayerPairs() { return 20; } constexpr uint32_t maxNumberOfLayers() { return 10; } constexpr uint32_t maxTuples() { return maxNumberOfTuples(); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index d9dd7aada290b..2beb8b889c94c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -5,6 +5,9 @@ // Author: Felice Pantaleo, CERN // + +// 
#define ONLY_TRIPLETS_IN_HOLE + #include #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" @@ -32,7 +35,8 @@ class GPUCACell { using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; - using TupleMultiplicity = CAConstants::TupleMultiplicity; + using Quality = pixelTuplesHeterogeneousProduct::Quality; + static constexpr auto bad = pixelTuplesHeterogeneousProduct::bad; GPUCACell() = default; #ifdef __CUDACC__ @@ -48,7 +52,9 @@ class GPUCACell { theOuterHitId = outerHitId; theDoubletId = doubletId; theLayerPairId = layerPairId; + theUsed = 0; + // optimization that depends on access pattern theInnerZ = hh.zGlobal(innerHitId); theInnerR = hh.rGlobal(innerHitId); @@ -74,13 +80,11 @@ class GPUCACell { __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } __device__ __forceinline__ float get_outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_z(Hits const& hh) const { - return theInnerZ; - } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_inner_z(Hits const& hh) const { return theInnerZ; } + // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } __device__ __forceinline__ float get_outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_r(Hits const& hh) const { - return theInnerR; - } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float get_inner_r(Hits const& hh) const { return theInnerR; } + // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } __device__ __forceinline__ float get_outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } __device__ __forceinline__ auto get_inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } @@ -137,7 +141,6 @@ class GPUCACell { otherCell, otherCell.get_inner_detIndex(hh) < last_bpix1_detIndex ? 
dcaCutInnerTriplet : dcaCutOuterTriplet, hardCurvCut)); // FIXME tune cuts - // region_origin_radius_plus_tolerance, hardCurvCut)); } __device__ __forceinline__ static bool areAlignedRZ( @@ -173,42 +176,85 @@ class GPUCACell { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - __device__ inline bool hole(Hits const& hh, GPUCACell const& innerCell) const { + + __device__ __forceinline__ static bool dcaCutH(float x1,float y1, float x2,float y2, float x3,float y3, + const float region_origin_radius_plus_tolerance, + const float maxCurv) { + + CircleEq eq(x1, y1, x2, y2, x3, y3); + + if (eq.curvature() > maxCurv) + return false; + + return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); + } + + + + __device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const { + constexpr uint32_t max_ladder_bpx0 = 12; + constexpr uint32_t first_ladder_bpx0 = 0; + constexpr float module_length = 6.7f; + constexpr float module_tolerance = 0.4f; // projection to cylinder is inaccurate on BPIX1 + int p = innerCell.get_inner_iphi(hh); + if (p < 0) + p += std::numeric_limits::max(); + p = (max_ladder_bpx0 * p) / std::numeric_limits::max(); + p %= max_ladder_bpx0; + auto il = first_ladder_bpx0+p; + auto r0 = hh.averageGeometry().ladderR[il]; + auto ri = innerCell.get_inner_r(hh); + auto zi = innerCell.get_inner_z(hh); + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); + auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri); + auto z_in_ladder = std::abs(z0-hh.averageGeometry().ladderZ[il]); + auto z_in_module = z_in_ladder - module_length * int(z_in_ladder / module_length); + auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); + return gap; + } + + + __device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const { constexpr uint32_t max_ladder_bpx4 = 64; - constexpr float radius_even_ladder = 15.815f; - constexpr float radius_odd_ladder = 16.146f; - constexpr float ladder_length = 6.7f; - constexpr float ladder_tolerance = 0.2f; - constexpr float barrel_z_length = 26.f; - constexpr float forward_z_begin = 32.f; + constexpr uint32_t first_ladder_bpx4 = 84; + // constexpr float radius_even_ladder = 15.815f; + // constexpr float radius_odd_ladder = 16.146f; + constexpr float module_length = 6.7f; + constexpr float module_tolerance = 0.2f; + // constexpr float barrel_z_length = 26.f; + // constexpr float forward_z_begin = 32.f; int p = get_outer_iphi(hh); if (p < 0) p += std::numeric_limits::max(); p = (max_ladder_bpx4 * p) / std::numeric_limits::max(); - p %= 2; - float r4 = p == 0 ? 
radius_even_ladder : radius_odd_ladder; // later on from geom + p %= max_ladder_bpx4; + auto il = first_ladder_bpx4+p; + auto r4 = hh.averageGeometry().ladderR[il]; auto ri = innerCell.get_inner_r(hh); auto zi = innerCell.get_inner_z(hh); auto ro = get_outer_r(hh); auto zo = get_outer_z(hh); - auto z4 = std::abs(zi + (r4 - ri) * (zo - zi) / (ro - ri)); - auto z_in_ladder = z4 - ladder_length * int(z4 / ladder_length); - auto h = z_in_ladder < ladder_tolerance || z_in_ladder > (ladder_length - ladder_tolerance); - return h || (z4 > barrel_z_length && z4 < forward_z_begin); + auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri); + auto z_in_ladder = std::abs(z4-hh.averageGeometry().ladderZ[il]); + auto z_in_module = z_in_ladder - module_length * int(z_in_ladder / module_length); + auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); + auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0]; + auto holeN = z4 < hh.averageGeometry().ladderMinZ[il] && z4 > hh.averageGeometry().endCapZ[1]; + return gap || holeP || holeN; } // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. - - template __device__ inline void find_ntuplets(Hits const& hh, GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, TuplesOnGPU::Container& foundNtuplets, AtomicPairCounter& apc, - CM& tupleMultiplicity, + Quality* __restrict__ quality, TmpTuple& tmpNtuplet, - const unsigned int minHitsPerNtuplet) const { + const unsigned int minHitsPerNtuplet, + bool startAt0) const { // the building process for a track ends if: // it has no right neighbor // it has no compatible neighbor @@ -218,29 +264,35 @@ class GPUCACell { tmpNtuplet.push_back_unsafe(theDoubletId); assert(tmpNtuplet.size() <= 4); - if (outerNeighbors().size() > 0) { - for (int j = 0; j < outerNeighbors().size(); ++j) { - auto otherCell = outerNeighbors()[j]; + bool last=true; + for (int j = 0; j < outerNeighbors().size(); ++j) { + auto otherCell = outerNeighbors()[j]; + if (cells[otherCell].theDoubletId<0) continue; // killed by earlyFishbone + last = false; cells[otherCell].find_ntuplets( - hh, cells, cellTracks, foundNtuplets, apc, tupleMultiplicity, tmpNtuplet, minHitsPerNtuplet); - } - } else { // if long enough save... + hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet,startAt0); + } + if(last) { // if long enough save... if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { -#ifndef ALL_TRIPLETS +#ifdef ONLY_TRIPLETS_IN_HOLE // triplets accepted only pointing to the hole - if (tmpNtuplet.size() >= 3 || hole(hh, cells[tmpNtuplet[0]])) + if (tmpNtuplet.size() >= 3 || + ( startAt0&&hole4(hh, cells[tmpNtuplet[0]]) ) || + ( (!startAt0)&&hole0(hh, cells[tmpNtuplet[0]]) ) + ) #endif { hindex_type hits[6]; auto nh = 0U; - for (auto c : tmpNtuplet) + for (auto c : tmpNtuplet) { hits[nh++] = cells[c].theInnerHitId; + } hits[nh] = theOuterHitId; auto it = foundNtuplets.bulkFill(apc, hits, tmpNtuplet.size() + 1); if (it >= 0) { // if negative is overflow.... 
for (auto c : tmpNtuplet) cells[c].addTrack(it, cellTracks); - tupleMultiplicity.countDirect(tmpNtuplet.size() + 1); + quality[it] = bad; // initialize to bad } } } @@ -257,7 +309,8 @@ class GPUCACell { public: int32_t theDoubletId; - int32_t theLayerPairId; + int16_t theLayerPairId; + uint16_t theUsed; // tbd private: float theInnerZ; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 06a8e0b0b5505..022a6ba5ae623 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -10,7 +10,8 @@ class TrackingRecHit2DSOAView; class TrackingRecHit2DCUDA; namespace Rfit { - constexpr uint32_t maxNumberOfConcurrentFits() { return 6 * 1024; } + // in case of memory issue can be made smaller + constexpr uint32_t maxNumberOfConcurrentFits() { return CAConstants::maxNumberOfTuples(); } constexpr uint32_t stride() { return maxNumberOfConcurrentFits(); } using Matrix3x4d = Eigen::Matrix; using Map3x4d = Eigen::Map >; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 1b6700ece509c..ba2c3d9b146a8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -40,6 +40,7 @@ namespace gpuPixelDoublets { if (s < 2) return; // if alligned kill one of the two. + // in principle one could try to relax the cut (only in r-z?) for jumping-doublets auto const& c0 = cells[vc[0]]; auto xo = c0.get_outer_x(hh); auto yo = c0.get_outer_y(hh); @@ -50,6 +51,7 @@ namespace gpuPixelDoublets { auto sg = 0; for (uint32_t ic = 0; ic < s; ++ic) { auto& ci = cells[vc[ic]]; + if (0==ci.theUsed) continue; // for triplets equivalent to next if (checkTrack && ci.tracks().empty()) continue; cc[sg] = vc[ic]; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 1c039a1fe6e28..32b4e4f538ea9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -9,60 +9,40 @@ namespace gpuPixelDoublets { using namespace gpuPixelDoubletsAlgos; - constexpr int nPairs = 13; + constexpr int nPairs = 13+2+4; + static_assert(nPairs<=CAConstants::maxNumberOfLayerPairs()); + +// start constants +// clang-format off + CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = { - 0, - 1, - 1, - 2, - 2, - 3, - // 0, 4, 1, 4, 2, 4, 4, 5, 5, 6, - 0, - 7, - 1, - 7, - 2, - 7, - 7, - 8, - 8, - 9, // neg - 0, - 4, - 1, - 4, - 2, - 4, - 4, - 5, - 5, - 6, // pos + 0,1, 0,4, 0,7, // BPIX1 (3) + 1,2, 1,4, 1,7, // BPIX2 (5) + 4,5, 7,8, // FPIX1 (8) + 2,3, 2,4, 2,7, 5,6, 8,9, // BPIX3 & FPIX2 (13) + 0,2, 1,3, // Jumping Barrel (15) + 0,5, 0,8, // Jumping Forward (BPIX1,FPIX2) + 4,6, 7,9 // Jumping Forward (19) }; constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); constexpr int16_t phi0p07 = 730; // round(730.12648...) 
= phi2short(0.07); - CONSTANT_VAR const int16_t phicuts[nPairs]{phi0p05, - phi0p05, - phi0p06, - phi0p07, - phi0p06, - phi0p06, - phi0p05, - phi0p05, - phi0p07, - phi0p06, - phi0p06, - phi0p05, - phi0p05}; - - CONSTANT_VAR float const minz[nPairs] = {-20., -22., -22., -30., -30., -30., -70., -70., 0., 10., 15., -70., -70.}; + CONSTANT_VAR const int16_t phicuts[nPairs]{ + phi0p05, phi0p07, phi0p07, + phi0p05, phi0p06, phi0p06, + phi0p05, phi0p05, + phi0p06, phi0p06, phi0p06, phi0p05, phi0p05, + phi0p05, phi0p05, phi0p05,phi0p05, phi0p05,phi0p05}; +// phi0p07, phi0p07, phi0p06,phi0p06, phi0p06,phi0p06}; // relaxed cuts - CONSTANT_VAR float const maxz[nPairs] = {20., 22., 22., 0., -10., -15., 70., 70., 30., 30., 30., 70., 70.}; + CONSTANT_VAR float const minz[nPairs] = {-20.,0.,-30., -22.,10.,-30., -70.,-70., -22.,15.,-30, -70.,-70., -20.,-22., 0,-30., -70.,-70.}; + CONSTANT_VAR float const maxz[nPairs] = { 20.,30.,0., 22.,30.,-10., 70.,70., 22.,30.,-15., 70., 70., 20.,22., 30.,0., 70.,70.}; + CONSTANT_VAR float const maxr[nPairs] = {20.,9.,9., 20.,7.,7., 5.,5., 20.,6.,6., 5., 5., 20.,20.,9.,9., 9.,9.}; - CONSTANT_VAR float const maxr[nPairs] = {20., 20., 20., 9., 7., 6., 5., 5., 9., 7., 6., 5., 5.}; +// end constants +// clang-format on constexpr uint32_t MaxNumOfDoublets = CAConstants::maxNumberOfDoublets(); // not really relevant @@ -79,6 +59,7 @@ namespace gpuPixelDoublets { CellNeighbors* cellNeighborsContainer, CellTracksVector* cellTracks, CellTracks* cellTracksContainer) { + assert(isOuterHitOfCell); int first = blockIdx.x * blockDim.x + threadIdx.x; for (int i = first; i < nHits; i += gridDim.x * blockDim.x) isOuterHitOfCell[i].reset(); @@ -95,13 +76,14 @@ namespace gpuPixelDoublets { CellTracksVector* cellTracks, TrackingRecHit2DSOAView const* __restrict__ hhp, GPUCACell::OuterHitOfCell* isOuterHitOfCell, + int nActualPairs, bool ideal_cond, bool doClusterCut, bool doZCut, bool doPhiCut) { auto const& __restrict__ hh = *hhp; doubletsFromHisto(layerPairs, - nPairs, + nActualPairs, cells, nCells, cellNeighbors, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 52ff91b54867b..1cfdd298c0cd0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -62,7 +62,7 @@ namespace gpuPixelDoubletsAlgos { // nPairsMax to be optimized later (originally was 64). // If it should be much bigger, consider using a block-wide parallel prefix scan, // e.g. 
see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html - const int nPairsMax = 16; + const int nPairsMax = CAConstants::maxNumberOfLayerPairs(); assert(nPairs <= nPairsMax); __shared__ uint32_t innerLayerCumulativeSize[nPairsMax]; __shared__ uint32_t ntot; @@ -133,11 +133,12 @@ namespace gpuPixelDoubletsAlgos { constexpr float minRadius = hardPtCut * 87.78f; // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) constexpr float minRadius2T4 = 4.f * minRadius * minRadius; - auto ptcut = [&](int j) { + auto ptcut = [&](int j, int16_t mop) { auto r2t4 = minRadius2T4; auto ri = mer; auto ro = hh.rGlobal(j); - auto dphi = short2phi(min(abs(int16_t(mep - hh.iphi(j))), abs(int16_t(hh.iphi(j) - mep)))); + // auto mop = hh.iphi(j); + auto dphi = short2phi(std::min(std::abs(int16_t(mep - mop)), std::abs(int16_t(mop - mep)))); return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); }; auto z0cutoff = [&](int j) { @@ -181,13 +182,13 @@ namespace gpuPixelDoubletsAlgos { auto oi = __ldg(p); assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); - - if (std::min(std::abs(int16_t(hh.iphi(oi) - mep)), std::abs(int16_t(mep - hh.iphi(oi)))) > iphicut) + auto mop = hh.iphi(oi); + if (std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))) > iphicut) continue; if (doPhiCut) { if (doClusterCut && zsizeCut(oi)) continue; - if (z0cutoff(oi) || ptcut(oi)) + if (z0cutoff(oi) || ptcut(oi,mop)) continue; } auto ind = atomicAdd(nCells, 1); From 080647155aa96852ec7fd98233468abd345efffa Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 13 Jul 2019 10:01:04 +0200 Subject: [PATCH 053/102] fix bug in (and extend) cluster shape cut (cms-patatrack#383) --- .../plugins/gpuPixelDoubletsAlgos.h | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 1cfdd298c0cd0..472fa671e7d23 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -48,6 +48,9 @@ namespace gpuPixelDoubletsAlgos { constexpr int minYsizeB2 = 28; constexpr int maxDYsize12 = 28; constexpr int maxDYsize = 20; + constexpr int maxDYPred = 20; + constexpr float dzdrFact = 8*0.0285/0.015; // from dz/dr to "DY" + int16_t mes; bool isOuterLadder = ideal_cond; @@ -110,15 +113,16 @@ namespace gpuPixelDoubletsAlgos { continue; if (doClusterCut) { - auto mes = hh.clusterSizeY(i); - // if ideal treat inner ladder as outer auto mi = hh.detectorIndex(i); if (inner == 0) assert(mi < 96); isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... - if (inner == 0 && outer > 3 && isOuterLadder) // B1 and F1 + // in any case we always test mes>0 ... + mes = inner > 0 || isOuterLadder ? 
hh.clusterSizeY(i) : -1; + + if (inner == 0 && outer > 3 ) // B1 and F1 if (mes > 0 && mes < minYsizeB1) continue; // only long cluster (5*8) if (inner == 1 && outer > 3) // B2 and F1 @@ -128,6 +132,7 @@ namespace gpuPixelDoubletsAlgos { auto mep = hh.iphi(i); auto mer = hh.rGlobal(i); + // all cuts: true if fails constexpr float z0cut = 12.f; // cm constexpr float hardPtCut = 0.5f; // GeV constexpr float minRadius = @@ -143,7 +148,6 @@ namespace gpuPixelDoubletsAlgos { }; auto z0cutoff = [&](int j) { auto zo = hh.zGlobal(j); - ; auto ro = hh.rGlobal(j); auto dr = ro - mer; return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; @@ -152,8 +156,16 @@ namespace gpuPixelDoubletsAlgos { auto zsizeCut = [&](int j) { auto onlyBarrel = outer < 4; auto so = hh.clusterSizeY(j); - auto dy = inner == 0 ? (isOuterLadder ? maxDYsize12 : 100) : maxDYsize; - return onlyBarrel && mes > 0 && so > 0 && std::abs(so - mes) > dy; + auto dy = inner == 0 ? maxDYsize12 : maxDYsize; + // in the barrel cut on difference in size + // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well) + // FIXME move pred cut to z0cutoff to optmize loading of and computaiton ... + auto zo = hh.zGlobal(j); + auto ro = hh.rGlobal(j); + return onlyBarrel ? + mes > 0 && so > 0 && std::abs(so - mes) > dy : + (inner<4) && mes>0 + && std::abs(mes - int(std::abs((mez-zo)/(mer-ro))*dzdrFact+0.5f)) > maxDYPred; }; auto iphicut = phicuts[pairLayerId]; From 8df4bc880c9ae204fa131f3684806aae465a30d3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 5 Jul 2019 11:59:12 +0200 Subject: [PATCH 054/102] Port the whole pixel workflow to new heterogeneous framework (cms-patatrack#384) - port the whole pixel workflow to new heterogeneous framework - implement a legacy cluster to SoA converter for the pixel RecHits - update the vertex producer to run on CPU as well as GPU --- CUDADataFormats/Track/BuildFile.xml | 10 + .../Track/interface/PixelTrackHeterogeneous.h | 79 ++ .../Track/interface/TrajectoryStateSoA.h | 65 ++ CUDADataFormats/Track/src/classes.h | 10 + CUDADataFormats/Track/src/classes_def.xml | 8 + CUDADataFormats/Track/test/BuildFile.xml | 13 + .../Track/test/TrajectoryStateSOA_t.cpp | 1 + .../Track/test/TrajectoryStateSOA_t.cu | 1 + .../Track/test/TrajectoryStateSOA_t.h | 77 ++ .../customizePixelTracksForProfiling.py | 55 +- .../PixelTrackFitting/interface/FitUtils.h | 37 +- .../PixelTrackFitting/plugins/BuildFile.xml | 1 + .../plugins/PixelTrackProducerFromSoA.cc | 214 ++++ .../plugins/PixelTrackSoAFromCUDA.cc | 95 ++ .../PixelTrackFitting/plugins/storeTracks.h | 10 +- .../python/PixelTracks_cff.py | 24 +- .../test/testEigenJacobian.cpp | 2 +- .../plugins/BrokenLineFitOnGPU.cu | 69 +- .../PixelTriplets/plugins/BuildFile.xml | 1 + .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 82 ++ .../plugins/CAHitNtupletGeneratorKernels.cu | 933 ++++++++++++++++++ .../plugins/CAHitNtupletGeneratorKernels.h | 186 ++++ .../CAHitNtupletGeneratorKernelsAlloc.cu | 40 + .../plugins/CAHitNtupletGeneratorOnGPU.cc | 164 +++ .../plugins/CAHitNtupletGeneratorOnGPU.h | 73 ++ .../PixelTriplets/plugins/GPUCACell.h | 12 +- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 10 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 23 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 67 +- .../plugins/SeedProducerFromSoA.cc | 177 ++++ 30 files changed, 2401 insertions(+), 138 deletions(-) create mode 100644 CUDADataFormats/Track/BuildFile.xml create mode 100644 
CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h create mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoA.h create mode 100644 CUDADataFormats/Track/src/classes.h create mode 100644 CUDADataFormats/Track/src/classes_def.xml create mode 100644 CUDADataFormats/Track/test/BuildFile.xml create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h create mode 100644 RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml new file mode 100644 index 0000000000000..521ea8fe29753 --- /dev/null +++ b/CUDADataFormats/Track/BuildFile.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h new file mode 100644 index 0000000000000..a576604b6e935 --- /dev/null +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -0,0 +1,79 @@ +#ifndef CUDADataFormatsTrackTrackHeterogeneous_H +#define CUDADataFormatsTrackTrackHeterogeneous_H + +#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" + +namespace trackQuality { + enum Quality : uint8_t { bad=0, dup, loose, strict, tight, highPurity }; +} + +template +class TrackSoAT { +public: + + static constexpr int32_t stride() { return S; } + + using Quality = trackQuality::Quality; + using hindex_type = uint16_t; + using HitContainer = OneToManyAssoc; + + // Always check quality is at least loose! + // CUDA does not support enums in __lgc ... 
+ eigenSoA::ScalarSoA m_quality; + constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i));} + constexpr Quality & quality(int32_t i) { return (Quality&)(m_quality(i));} + constexpr Quality const * qualityData() const { return (Quality const *)(m_quality.data());} + constexpr Quality * qualityData() { return (Quality*)(m_quality.data());} + + + // this is chi2/ndof as not necessarely all hits are used in the fit + eigenSoA::ScalarSoA chi2; + + constexpr int nHits(int i) const { return detIndices.size(i);} + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + TrajectoryStateSoA stateAtBS; + eigenSoA::ScalarSoA eta; + eigenSoA::ScalarSoA pt; + constexpr float charge(int32_t i) const { return std::copysign(1.f,stateAtBS.state(i)(2)); } + constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } + constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } + constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } + + // state at the detector of the outermost hit + // representation to be decided... + // not yet filled on GPU + // TrajectoryStateSoA stateAtOuterDet; + + HitContainer hitIndices; + HitContainer detIndices; + + // total number of tracks (including those not fitted) + uint32_t m_nTracks; + +}; + +namespace pixelTrack{ + +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumber() { return 2 * 1024;} +#else + constexpr uint32_t maxNumber() { return 32 * 1024;} +#endif + + using TrackSoA = TrackSoAT; + using TrajectoryState = TrajectoryStateSoA; + using HitContainer = TrackSoA::HitContainer; + using Quality = trackQuality::Quality; + +} + +using PixelTrackHeterogeneous = HeterogeneousSoA; + + +#endif // CUDADataFormatsTrackTrackSoA_H + diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h new file mode 100644 index 0000000000000..a6553ff96cb0b --- /dev/null +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h @@ -0,0 +1,65 @@ +#ifndef CUDADataFormatsTrackTrajectoryStateSOA_H +#define CUDADataFormatsTrackTrajectoryStateSOA_H + +#include +#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" + +template +struct TrajectoryStateSoA { + + using Vector5f = Eigen::Matrix; + using Vector15f = Eigen::Matrix; + + using Vector5d = Eigen::Matrix; + using Matrix5d = Eigen::Matrix; + + + static constexpr int32_t stride() { return S; } + + eigenSoA::MatrixSoA state; + eigenSoA::MatrixSoA covariance; + + + template + __host__ __device__ + void copyFromCircle(V3 const & cp, M3 const & ccov, V2 const & lp, M2 const & lcov, float b, int32_t i) { + state(i) << cp.template cast(), lp.template cast(); + state(i)(2) *=b; + auto cov = covariance(i); + cov(0) = ccov(0,0); + cov(1) = ccov(0,1); + cov(2) = b*float(ccov(0,2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1,1); + cov(6) = b*float(ccov(1,2)); + cov(8) = cov(7) = 0; + cov(9) = b*b*float(ccov(2,2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0,0); + cov(13) = lcov(0,1); + cov(14) = lcov(1,1); + } + + + template + __host__ __device__ + void copyFromDense(V5 const & v, M5 const & cov, int32_t i) { + state(i) = v.template cast(); + for(int j=0, ind=0; j<5; ++j) for (auto k=j;k<5;++k) covariance(i)(ind++) = cov(j,k); + } + + template + __host__ __device__ + void copyToDense(V5 & v, M5 & cov, int32_t i) const { + v = state(i).template cast(); + for(int j=0, ind=0; j<5; ++j) { + cov(j,j) = covariance(i)(ind++); + for (auto k=j+1;k<5;++k) cov(k,j)=cov(j,k) = covariance(i)(ind++); + } + } + +}; + +#endif // 
CUDADataFormatsTrackTrajectoryStateSOA_H + + diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h new file mode 100644 index 0000000000000..7f89096977e64 --- /dev/null +++ b/CUDADataFormats/Track/src/classes.h @@ -0,0 +1,10 @@ +#ifndef CUDADataFormats__src_classes_h +#define CUDADataFormats__src_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Common/interface/ArrayShadow.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml new file mode 100644 index 0000000000000..a4c2e766582dd --- /dev/null +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml new file mode 100644 index 0000000000000..598b345d4709d --- /dev/null +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp new file mode 100644 index 0000000000000..d6ff539a642b0 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu new file mode 100644 index 0000000000000..d6ff539a642b0 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h new file mode 100644 index 0000000000000..adefb57d7bbe5 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -0,0 +1,77 @@ +#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h" + +using Vector5d = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; + +__host__ __device__ +Matrix5d loadCov(Vector5d const & e) { + Matrix5d cov; + for (int i=0; i<5; ++i) cov(i,i) = e(i)*e(i); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3*std::sqrt( cov(i,i) * cov(j,j) ); // this makes the matrix pos defined + cov(i,j) = (i+j)%2 ? 
-0.4*v : 0.1*v; + cov(j,i) = cov(i,j); + } + } + return cov; +} + + +using TS = TrajectoryStateSoA<128>; + +__global__ void testTSSoA(TS * pts, int n) { + + assert(n<=128); + + Vector5d par0; par0 << 0.2,0.1,3.5,0.8,0.1; + Vector5d e0; e0 << 0.01,0.01,0.035,-0.03,-0.01; + auto cov0 = loadCov(e0); + + TS & ts = *pts; + + int first = threadIdx.x + blockIdx.x * blockDim.x; + + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + ts.copyFromDense(par0,cov0,i); + Vector5d par1; Matrix5d cov1; + ts.copyToDense(par1,cov1,i); + Vector5d delV = par1-par0; + Matrix5d delM = cov1-cov0; + for(int j=0; j<5; ++j) { + assert(std::abs(delV(j))<1.e-5); + for (auto k=j;k<5;++k) { + assert(cov0(k,j)==cov0(j,k)); + assert(cov1(k,j)==cov1(j,k)); + assert(std::abs(delM(k,j))<1.e-5); + } + } + + } +} + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#endif + +int main() { +#ifdef __CUDACC__ + exitSansCUDADevices(); +#endif + + + TS ts; + +#ifdef __CUDACC__ + TS * ts_d; + cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); + testTSSoA<<<1, 64>>>(ts_d,128); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); + cudaCheck(cudaDeviceSynchronize()); +#else + testTSSoA(&ts,128); +#endif + +} diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 1021918c0ce6c..ce97de6650244 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -1,6 +1,41 @@ import FWCore.ParameterSet.Config as cms -def customizePixelTracksForProfiling(process): +def customizePixelTracksForProfilingGPUOnly(process): + process.MessageLogger.cerr.FwkReport.reportEvery = 100 + + process.Raw2Hit = cms.Path(process.offlineBeamSpot+process.offlineBeamSpotCUDA+process.siPixelClustersCUDAPreSplitting+process.siPixelRecHitsCUDAPreSplitting) + + process.load('RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi') + process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi') + process.TVreco = cms.Path(process.caHitNtupletCUDA+process.pixelVertexCUDA) + + process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco) + return process + +def customizePixelTracksForProfilingSoAonCPU(process): + process = customizePixelTracksForProfilingGPUOnly(process) + + process.pixelVertexSoA = process.pixelVertexCUDA.clone() + process.pixelVertexSoA.onGPU = False + process.pixelVertexSoA.pixelTrackSrc = 'pixelTrackSoA' + process.TVSoAreco = cms.Path(process.caHitNtupletCUDA+process.pixelTrackSoA+process.pixelVertexSoA) + + process.schedule = cms.Schedule(process.Raw2Hit, process.TVSoAreco) + + return process + +def customizePixelTracksForProfilingEnableTransfer(process): + process = customizePixelTracksForProfilingGPUOnly(process) + + process.load('RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi') + process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi') + process.toSoA = cms.Path(process.pixelTrackSoA+process.pixelVertexSoA) + + process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco, process.toSoA) + return process + +def customizePixelTracksForProfilingEnableConversion(process): + # use old trick of output path process.MessageLogger.cerr.FwkReport.reportEvery = 100 process.out = cms.OutputModule("AsciiOutputModule", @@ -17,21 +52,3 @@ def 
customizePixelTracksForProfiling(process): return process -def customizePixelTracksForProfilingDisableConversion(process): - process = customizePixelTracksForProfiling(process) - - # Disable conversions to legacy - process.pixelTracksHitQuadruplets.gpuEnableConversion = False - process.pixelTracks.gpuEnableConversion = False - process.pixelVertices.gpuEnableConversion = False - - return process - -def customizePixelTracksForProfilingDisableTransfer(process): - process = customizePixelTracksForProfilingDisableConversion(process) - - # Disable "unnecessary" transfers to CPU - process.pixelTracksHitQuadruplets.gpuEnableTransfer = False - process.pixelVertices.gpuEnableTransfer = False - - return process diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h index e92c46f654615..e44a58f676106 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h @@ -189,18 +189,43 @@ namespace Rfit { circle.par = par_pak; } + /*! + \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and + consequently covariance matrix. + \param circle_uvr parameter (X0,Y0,R), covariance matrix to + be transformed and particle charge. + */ + __host__ __device__ inline void fromCircleToPerigee(circle_fit& circle) { + Vector3d par_pak; + const double temp0 = circle.par.head(2).squaredNorm(); + const double temp1 = sqrt(temp0); + par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)), + circle.q/circle.par(2); + + const double temp2 = sqr(circle.par(0)) * 1. / temp0; + const double temp3 = 1. / temp1 * circle.q; + Matrix3d J4; + J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.q, 0., 0., -circle.q/(circle.par(2)*circle.par(2)); + circle.cov = J4 * circle.cov * J4.transpose(); + + circle.par = par_pak; + } + + + // transformation between the "perigee" to cmssw localcoord frame // the plane of the latter is the perigee plane... 
- // from //!<(phi,Tip,pt,cotan(theta)),Zip) + // from //!<(phi,Tip,q/pt,cotan(theta)),Zip) // to q/p,dx/dz,dy/dz,x,z - template - __host__ __device__ inline void transformToPerigeePlane(V5 const & ip, M5 const & icov, V5 & op, M5 & ocov, double charge) { + template + __host__ __device__ inline void transformToPerigeePlane(VI5 const & ip, MI5 const & icov, VO5 & op, MO5 & ocov) { auto sinTheta2 = 1./(1.+ip(3)*ip(3)); auto sinTheta = std::sqrt(sinTheta2); auto cosTheta = ip(3)*sinTheta; - op(0) = charge*sinTheta/ip(2); + op(0) = sinTheta*ip(2); op(1) = 0.; op(2) = -ip(3); op(3) = ip(1); @@ -208,8 +233,8 @@ namespace Rfit { Matrix5d J = Matrix5d::Zero(); - J(0,2) = -charge*sinTheta/(ip(2)*ip(2)); - J(0,3) = -charge*sinTheta2*cosTheta/ip(2); + J(0,2) = sinTheta; + J(0,3) = -sinTheta2*cosTheta*ip(2); J(1,0) = 1.; J(2,3) = -1.; J(3,1) = 1.; diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml index 62a8e8541aa64..8c0261ee0d999 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml @@ -2,6 +2,7 @@ + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc new file mode 100644 index 0000000000000..284bcfc2ebb51 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -0,0 +1,214 @@ +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "DataFormats/GeometrySurface/interface/Plane.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" +#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" +#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" + +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include 
"CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" + +#include "storeTracks.h" +#include "CUDADataFormats/Common/interface/ArrayShadow.h" + + +/** + * This class creates "leagcy" reco::Track + * objects from the output of GPU CA. + */ +class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { +public: + + using IndToEdm = std::vector; + + explicit PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig); + ~PixelTrackProducerFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + + using HitModuleStart = std::array; + using HMSstorage = ArrayShadow; + + +private: + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT tBeamSpot_; + edm::EDGetTokenT tokenTrack_; + edm::EDGetTokenT cpuHits_; + edm::EDGetTokenT hmsToken_; + + int32_t minNumberOfHits_; +}; + +PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) : + tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), + hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), + minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) +{ + produces(); + produces(); + produces(); + produces(); +} + +void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("trackSrc", edm::InputTag("pixelTrackSoA")); + desc.add("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsLegacyPreSplitting")); + desc.add("minNumberOfHits", 0); + + descriptions.addWithDefaultLabel(desc); +} + + +void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + // std::cout << "Converting gpu helix in reco tracks" << std::endl; + + auto indToEdmP = std::make_unique(); + auto & indToEdm = *indToEdmP; + + edm::ESHandle fieldESH; + iSetup.get().get(fieldESH); + + pixeltrackfitting::TracksWithRecHits tracks; + edm::ESHandle httopo; + iSetup.get().get(httopo); + + + edm::Handle bsHandle; + iEvent.getByToken(tBeamSpot_, bsHandle); + const auto &bsh = *bsHandle; + // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; + GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); + + edm::Handle hhms; + iEvent.getByToken(hmsToken_,hhms); + auto const & hitsModuleStart = *hhms; + + auto fc = hitsModuleStart.data; + + edm::Handle gh; + iEvent.getByToken(cpuHits_, gh); + auto const &rechits = *gh; + std::vector hitmap; + auto const &rcs = rechits.data(); + auto nhits = rcs.size(); + hitmap.resize(nhits,nullptr); + for (auto const &h : rcs) { + auto const &thit = static_cast(h); + auto detI = thit.det()->index(); + auto const &clus = thit.firstClusterRef(); + assert(clus.isPixel()); + auto i = fc[detI] + clus.pixelCluster().originalId(); + assert(i < nhits); + assert(nullptr==hitmap[i]); + hitmap[i] = &h; + } + + std::vector hits; + hits.reserve(5); + + const auto & tsoa = *iEvent.get(tokenTrack_); + + auto const * quality = tsoa.qualityData(); + auto const & fit = tsoa.stateAtBS; + auto const & hitIndices = tsoa.hitIndices; + auto maxTracks =tsoa.stride(); + + int32_t nt = 0; + + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... 
+ indToEdm.push_back(-1); + auto q = quality[it]; + if (q != trackQuality::loose) + continue; // FIXME + if (nHits< minNumberOfHits_) continue; + indToEdm.back() = nt; + ++nt; + + hits.resize(nHits); + auto b = hitIndices.begin(it); + for (int iHit = 0; iHit < nHits; ++iHit) + hits[iHit] = hitmap[*(b+iHit)]; + + // mind: this values are respect the beamspot! + + float chi2 = tsoa.chi2(it); + float phi = tsoa.phi(it); + + Rfit::Vector5d ipar,opar; + Rfit::Matrix5d icov,ocov; + fit.copyToDense(ipar,icov,it); + Rfit::transformToPerigeePlane(ipar,icov,opar,ocov); + + LocalTrajectoryParameters lpar(opar(0),opar(1),opar(2),opar(3),opar(4),1.); + AlgebraicSymMatrix55 m; + for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = ocov(i,j); + + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot( + sp, -cp, 0, + 0, 0, -1.f, + cp, sp, 0); + + Plane impPointPlane(bs,rot); + GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), + impPointPlane.toGlobal(lpar.momentum()),lpar.charge(),fieldESH.product()); + JacobianLocalToCurvilinear jl2c(impPointPlane,lpar,*fieldESH.product()); + + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m); + + int ndof = 2*hits.size()-5; + chi2 = chi2*ndof; // FIXME + GlobalPoint vv = gp.position(); + math::XYZPoint pos( vv.x(), vv.y(), vv.z() ); + GlobalVector pp = gp.momentum(); + math::XYZVector mom( pp.x(), pp.y(), pp.z() ); + + auto track = std::make_unique ( chi2, ndof, pos, mom, + gp.charge(), CurvilinearTrajectoryError(mo)); + // filter??? + tracks.emplace_back(track.release(), hits); + } + // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl; + + // store tracks + storeTracks(iEvent, tracks, *httopo); + iEvent.put(std::move(indToEdmP)); +} + + +DEFINE_FWK_MODULE(PixelTrackProducerFromSoA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc new file mode 100644 index 0000000000000..c8dc04633f832 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -0,0 +1,95 @@ +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" + + +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" + +class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { +public: + explicit PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig); + ~PixelTrackSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + 
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + + + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; + + cudautils::host::unique_ptr m_soa; + +}; + +PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) : + tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) +{} + + +void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + + edm::ParameterSetDescription desc; + + desc.add("src", edm::InputTag("caHitNtupletCUDA")); + descriptions.add("pixelTrackSoA", desc); + +} + + +void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(tokenCUDA_); + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + m_soa = inputData.toHostAsync(ctx.stream()); + +} + +void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + + /* + auto const & tsoa = *m_soa; + auto maxTracks = tsoa.stride(); + std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + assert(nHits==int(tsoa.hitIndices.size(it))); + if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... + nt++; + } + std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl; + */ + + // DO NOT make a copy (actually TWO....) + iEvent.emplace(tokenSOA_,PixelTrackHeterogeneous(std::move(m_soa))); + + assert(!m_soa); +} + + +DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h index 48abab5237587..13bdee8164780 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h @@ -16,8 +16,8 @@ #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" #include "Geometry/Records/interface/TrackerTopologyRcd.h" -template -void storeTracks(Ev & ev, const pixeltrackfitting::TracksWithTTRHs& tracksWithHits, const TrackerTopology& ttopo) +template +void storeTracks(Ev & ev, const TWH& tracksWithHits, const TrackerTopology& ttopo) { auto tracks = std::make_unique(); auto recHits = std::make_unique(); @@ -27,12 +27,12 @@ void storeTracks(Ev & ev, const pixeltrackfitting::TracksWithTTRHs& tracksWithHi for (int i = 0; i < nTracks; i++) { - reco::Track* track = tracksWithHits.at(i).first; - const SeedingHitSet& hits = tracksWithHits.at(i).second; + reco::Track* track = tracksWithHits[i].first; + const auto & hits = tracksWithHits[i].second; for (unsigned int k = 0; k < hits.size(); k++) { - TrackingRecHit *hit = hits[k]->hit()->clone(); + auto * hit = hits[k]->clone(); track->appendHitPattern(*hit, ttopo); recHits->push_back(hit); diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index ef6d5d16fb329..ab7738826b1c2 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -51,11 +51,6 @@ SeedComparitorPSet = dict(clusterShapeCacheSrc = 'siPixelClusterShapeCachePreSplitting') ) -from 
Configuration.ProcessModifiers.gpu_cff import gpu -from RecoPixelVertexing.PixelTriplets.caHitQuadrupletHeterogeneousEDProducer_cfi import caHitQuadrupletHeterogeneousEDProducer as _caHitQuadrupletHeterogeneousEDProducer -gpu.toReplaceWith(pixelTracksHitQuadruplets, _caHitQuadrupletHeterogeneousEDProducer) -gpu.toModify(pixelTracksHitQuadruplets, trackingRegions = "pixelTracksTrackingRegions") - # for trackingLowPU pixelTracksHitTriplets = _pixelTripletHLTEDProducer.clone( doublets = "pixelTracksHitDoublets", @@ -70,10 +65,6 @@ ) trackingLowPU.toModify(pixelTracks, SeedingHitSets = "pixelTracksHitTriplets") -from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromCUDA_cfi import pixelTrackProducerFromCUDA as _pixelTrackProducerFromCUDA -gpu.toReplaceWith(pixelTracks, _pixelTrackProducerFromCUDA) - pixelTracksTask = cms.Task( pixelTracksTrackingRegions, pixelFitterByHelixProjections, @@ -94,4 +85,19 @@ _pixelTracksTask_ntupleFit.replace(pixelFitterByHelixProjections, pixelNtupletsFitter) ntupleFit.toReplaceWith(pixelTracksTask, _pixelTracksTask_ntupleFit) + +from Configuration.ProcessModifiers.gpu_cff import gpu +from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi import pixelTrackSoA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA as _pixelTrackFromSoA +_pixelTracksGPUTask = cms.Task( + caHitNtupletCUDA, + pixelTrackSoA, + pixelTracks # FromSoA +) + +gpu.toReplaceWith(pixelTracksTask, _pixelTracksGPUTask) +gpu.toReplaceWith(pixelTracks,_pixelTrackFromSoA) + + pixelTracksSequence = cms.Sequence(pixelTracksTask) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp index dc12de88001cd..d294e4cc6c1d6 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp @@ -85,7 +85,7 @@ int main() { // Matrix5d covf = transfFast(cov0,par0); - Rfit::transformToPerigeePlane(par0,cov0,par1,cov1,charge); + Rfit::transformToPerigeePlane(par0,cov0,par1,cov1); std::cout << "cov1\n" << cov1 << std::endl; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index cb8f151233385..bdfb835a02f33 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -17,14 +17,15 @@ #include "HelixFitOnGPU.h" using HitsOnGPU = TrackingRecHit2DSOAView; -using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; using namespace Eigen; // #define BL_DUMP_HITS template -__global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ foundNtuplets, +__global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, HitsOnGPU const *__restrict__ hhp, double *__restrict__ phits, @@ -55,10 +56,10 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found return; // get it from the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); - assert(helix_start < foundNtuplets->nbins()); + auto tkid = 
*(tupleMultiplicity->begin(nHits) + tuple_start); + assert(tkid < foundNtuplets->nbins()); - assert(foundNtuplets->size(helix_start) == nHits); + assert(foundNtuplets->size(tkid) == nHits); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit + local_start); @@ -68,11 +69,11 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found __shared__ int done; done = 0; __syncthreads(); - bool dump = (foundNtuplets->size(helix_start) == 5 && 0 == atomicAdd(&done, 1)); + bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1)); #endif // Prepare data structure - auto const *hitId = foundNtuplets->begin(helix_start); + auto const *hitId = foundNtuplets->begin(tkid); for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; float ge[6]; @@ -80,14 +81,14 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found #ifdef BL_DUMP_HITS if (dump) { printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", - helix_start, + tkid, hhp->detectorIndex(hit), i, hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit)); printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", - helix_start, + tkid, hhp->detetectorIndex(hit), i, ge[0], @@ -113,7 +114,7 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found template __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, double B, - Rfit::helix_fit *results, + OutputSoA *results, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -133,7 +134,7 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t return; // get it for the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit + local_start); @@ -148,41 +149,31 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); - Jacob << 1, 0, 0, 0, 1, 0, 0, 0, -B / std::copysign(Rfit::sqr(circle.par(2)), circle.par(2)); - circle.par(2) = B / std::abs(circle.par(2)); - circle.cov = Jacob * circle.cov * Jacob.transpose(); - // Grab helix_fit from the proper location in the output vector - auto &helix = results[helix_start]; - helix.par << circle.par, line.par; - - helix.cov = Rfit::Matrix5d::Zero(); - helix.cov.block(0, 0, 3, 3) = circle.cov; - helix.cov.block(3, 3, 2, 2) = line.cov; - - helix.q = circle.q; - helix.chi2_circle = circle.chi2; - helix.chi2_line = line.chi2; + results->stateAtBS.copyFromCircle(circle.par,circle.cov,line.par,line.cov,1.f/float(B),tkid); + results->pt(tkid) = float(B)/float(std::abs(circle.par(2))); + results->eta(tkid) = asinhf(line.par(0)); + results->chi2(tkid) = (circle.chi2+line.chi2)/(2*N-5); #ifdef BROKENLINE_DEBUG if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) - printf("kernelBLFit failed! %f/%f\n", helix.chi2_circle, helix.chi2_line); + printf("kernelBLFit failed! 
%f/%f\n", circle.chi2, line.chi2); printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N, nHits, - helix_start, + tkid, circle.par(0), circle.par(1), circle.par(2)); - printf("kernelBLHits line.par(0,1): %d %f,%f\n", helix_start, line.par(0), line.par(1)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", - helix.chi2_circle, - helix.chi2_line, - helix.cov(0, 0), - helix.cov(1, 1), - helix.cov(2, 2), - helix.cov(3, 3), - helix.cov(4, 4)); + circle.chi2, + line.chi2, + circle.cov(0, 0), + circle.cov(1, 1), + circle.cov(2, 2), + line.cov(0, 0), + line.cov(1, 1)); #endif } @@ -218,7 +209,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, kernelBLFit<3><<>>(tupleMultiplicity_d, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -239,7 +230,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, kernelBLFit<4><<>>(tupleMultiplicity_d, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -261,7 +252,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, kernelBLFit<4><<>>(tupleMultiplicity_d, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -282,7 +273,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, kernelBLFit<5><<>>(tupleMultiplicity_d, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 341a108348337..6d15cc6883098 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -12,6 +12,7 @@ + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc new file mode 100644 index 0000000000000..ba8a3e1052e7b --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -0,0 +1,82 @@ +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +#include "CAHitNtupletGeneratorOnGPU.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + + +class CAHitNtupletCUDA : public edm::global::EDProducer<> { +public: + explicit CAHitNtupletCUDA(const edm::ParameterSet& iConfig); + ~CAHitNtupletCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT> tokenHit_; + edm::EDPutTokenT> tokenTrack_; + + CAHitNtupletGeneratorOnGPU gpuAlgo_; + +}; + +CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) : + tokenHit_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), + tokenTrack_(produces>()), + gpuAlgo_(iConfig, consumesCollector()) {} + + +void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + + CAHitNtupletGeneratorOnGPU::fillDescriptions(desc); + auto label = "caHitNtupletCUDA"; + descriptions.add(label, desc); +} + +void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { + + edm::Handle> hHits; + iEvent.getByToken(tokenHit_, hHits); + + CUDAScopedContextProduce ctx{*hHits}; + auto const& hits = ctx.get(*hHits); + + auto bf = 1./PixelRecoUtilities::fieldInInvGev(es); + + ctx.emplace( + iEvent, + tokenTrack_, + std::move(gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())) + ); + +} + + + +DEFINE_FWK_MODULE(CAHitNtupletCUDA); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu new file mode 100644 index 0000000000000..cedef59f78f91 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -0,0 +1,933 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +// #define NTUPLE_DEBUG + +#include +#include + +#include + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" + +#include "CAConstants.h" +#include "CAHitNtupletGeneratorKernels.h" +#include "GPUCACell.h" +#include "gpuFishbone.h" +#include "gpuPixelDoublets.h" + +using namespace gpuPixelDoublets; + + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + + using HitToTuple = CAConstants::HitToTuple; + using TupleMultiplicity = CAConstants::TupleMultiplicity; + + using Quality = pixelTrack::Quality; + using TkSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + +__global__ void kernel_checkOverflows(HitContainer const * foundNtuplets, + CAConstants::TupleMultiplicity * tupleMultiplicity, + AtomicPairCounter *apc, + GPUCACell const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector const *cellNeighbors, + CellTracksVector const *cellTracks, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + uint32_t nHits, + CAHitNtupletGeneratorKernels::Counters *counters) { + auto idx = threadIdx.x + blockIdx.x * blockDim.x; + + auto &c = *counters; + // counters once per event + if (0 == idx) { + atomicAdd(&c.nEvents, 1); + atomicAdd(&c.nHits, nHits); + atomicAdd(&c.nCells, *nCells); + atomicAdd(&c.nTuples, 
apc->get().m); + atomicAdd(&c.nFitTracks,tupleMultiplicity->size()); + } + +#ifdef NTUPLE_DEBUG + if (0 == idx) { + printf("number of found cells %d, found tuples %d with total hits %d out of %d\n", + *nCells, + apc->get().m, + apc->get().n, + nHits); + if (apc->get().m < CAConstants::maxNumberOfQuadruplets()) { + assert(foundNtuplets->size(apc->get().m) == 0); + assert(foundNtuplets->size() == apc->get().n); + } + } + + if (idx < foundNtuplets->nbins()) { + if (foundNtuplets->size(idx) > 5) + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + assert(foundNtuplets->size(idx) < 6); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + assert(*ih < nHits); + } +#endif + + if (0 == idx) { + if (apc->get().m >= CAConstants::maxNumberOfQuadruplets()) + printf("Tuples overflow\n"); + if (*nCells >= CAConstants::maxNumberOfDoublets()) + printf("Cells overflow\n"); + } + + if (idx < (*nCells)) { + auto &thisCell = cells[idx]; + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.theDoubletId < 0) + atomicAdd(&c.nKilledCells, 1); + if (0==thisCell.theUsed) + atomicAdd(&c.nEmptyCells, 1); + if (thisCell.tracks().empty()) + atomicAdd(&c.nZeroTrackCells, 1); + } + if (idx < nHits) { + if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; + printf("OuterHitOfCell overflow %d\n", idx); + } +} + + +__global__ void kernel_fishboneCleaner(GPUCACell const *cells, + uint32_t const *__restrict__ nCells, + Quality *quality) { + constexpr auto bad = trackQuality::bad; + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + + if (cellIndex >= (*nCells)) + return; + auto const &thisCell = cells[cellIndex]; + if (thisCell.theDoubletId >= 0) + return; + + for (auto it : thisCell.tracks()) + quality[it] = bad; +} + +__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, + uint32_t const *__restrict__ nCells, + HitContainer *foundNtuplets, + Quality *quality) { + // constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + // constexpr auto loose = trackQuality::loose; + + assert(nCells); + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + + if (cellIndex >= (*nCells)) + return; + auto const &thisCell = cells[cellIndex]; + if (thisCell.theDoubletId < 0) + return; + + uint32_t maxNh = 0; + + // find maxNh + for (auto it : thisCell.tracks()) { + auto nh = foundNtuplets->size(it); + maxNh = std::max(nh, maxNh); + } + + for (auto it : thisCell.tracks()) { + if (foundNtuplets->size(it) != maxNh) + quality[it] = dup; //no race: simple assignment of the same constant + } + +} + + +__global__ void kernel_fastDuplicateRemover(GPUCACell const * __restrict__ cells, + uint32_t const *__restrict__ nCells, + HitContainer const * __restrict__ foundNtuplets, + TkSoA * __restrict__ tracks) { + constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + constexpr auto loose = trackQuality::loose; + + assert(nCells); + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + + if (cellIndex >= (*nCells)) + return; + auto const &thisCell = cells[cellIndex]; + if (thisCell.theDoubletId < 0) + return; + + float mc = 10000.f; + uint16_t im = 60000; + + auto score = [&](auto it) { + return std::abs(tracks->tip(it)); // tip + // return 
tracks->chi2(it); //chi2 + }; + + // find min socre + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == loose && score(it) < mc) { + mc = score(it); + im = it; + } + } + // mark all other duplicates + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) != bad && it != im) + tracks->quality(it) = dup; //no race: simple assignment of the same constant + } +} + + +__global__ void kernel_connect(AtomicPairCounter *apc1, + AtomicPairCounter *apc2, // just to zero them, + GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector *cellNeighbors, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + float hardCurvCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet) { + auto const &hh = *hhp; + + auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + + if (0 == (cellIndex + first)) { + (*apc1) = 0; + (*apc2) = 0; + } // ready for next kernel + + if (cellIndex >= (*nCells)) + return; + auto & thisCell = cells[cellIndex]; + //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1) + // return; + auto innerHitId = thisCell.get_inner_hit_id(); + auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); + + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; + auto ri = thisCell.get_inner_r(hh); + auto zi = thisCell.get_inner_z(hh); + + auto ro = thisCell.get_outer_r(hh); + auto zo = thisCell.get_outer_z(hh); + auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; + + for (auto j = first; j < numberOfPossibleNeighbors; j += stride) { + auto otherCell = __ldg(vi + j); + auto & oc = cells[otherCell]; + // if (cells[otherCell].theDoubletId < 0 || + // cells[otherCell].theUsed>1 ) + // continue; + auto r1 = oc.get_inner_r(hh); + auto z1 = oc.get_inner_z(hh); + // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; + bool aligned = GPUCACell::areAlignedRZ(r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + if(aligned && + thisCell.dcaCut(hh,oc, + oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, + hardCurvCut) + ) { // FIXME tune cuts + oc.addOuterNeighbor(cellIndex, *cellNeighbors); + thisCell.theUsed |= 1; + oc.theUsed |= 1; + } + } // loop on inner cells +} + +__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells, + CellTracksVector *cellTracks, + HitContainer *foundNtuplets, + AtomicPairCounter *apc, + Quality * __restrict__ quality, + unsigned int minHitsPerNtuplet) { + // recursive: not obvious to widen + auto const &hh = *hhp; + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + if (cellIndex >= (*nCells)) + return; + auto &thisCell = cells[cellIndex]; + + if (thisCell.theDoubletId < 0) + return; + + auto pid = thisCell.theLayerPairId; + auto doit = minHitsPerNtuplet>3 ? 
pid<3 : pid<8 || pid >12; + if (doit) { + GPUCACell::TmpTuple stack; + stack.reset(); + thisCell.find_ntuplets(hh, + cells, + *cellTracks, + *foundNtuplets, + *apc, + quality, + stack, + minHitsPerNtuplet, + pid<3); + assert(stack.size() == 0); + // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); + } + +} + + +__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells) { + + // auto const &hh = *hhp; + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + if (cellIndex >= (*nCells)) + return; + auto &thisCell = cells[cellIndex]; + if (!thisCell.tracks().empty()) + thisCell.theUsed |= 2; + +} + + +__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const * __restrict__ quality, + CAConstants::TupleMultiplicity *tupleMultiplicity) { + auto it = blockIdx.x * blockDim.x + threadIdx.x; + + if (it >= foundNtuplets->nbins()) + return; + + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + return; + if (quality[it] == trackQuality::dup) return; + assert(quality[it] == trackQuality::bad); + if (nhits>5) printf("wrong mult %d %d\n",it,nhits); + assert(nhits<8); + tupleMultiplicity->countDirect(nhits); +} + + + +__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const * __restrict__ quality, + CAConstants::TupleMultiplicity *tupleMultiplicity) { + auto it = blockIdx.x * blockDim.x + threadIdx.x; + + if (it >= foundNtuplets->nbins()) + return; + + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + return; + if (quality[it] == trackQuality::dup) return; + if (nhits>5) printf("wrong mult %d %d\n",it,nhits); + assert(nhits<8); + tupleMultiplicity->fillDirect(nhits, it); +} + + + +__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, + TkSoA const * __restrict__ tracks, + CAHitNtupletGeneratorKernels::QualityCuts cuts, + Quality *__restrict__ quality) { + auto idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= tuples->nbins()) { + return; + } + if (tuples->size(idx) == 0) { + return; + } + + // if duplicate: not even fit + if (quality[idx] == trackQuality::dup) return; + + assert(quality[idx] == trackQuality::bad); + + // mark doublets as bad + if (tuples->size(idx) < 3) { + return; + } + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= isnan(tracks->stateAtBS.state(idx)(i)); + } + if (isNaN) { +#ifdef NTUPLE_DEBUG + printf("NaN in fit %d size %d chi2 %f\n", + idx, + tuples->size(idx), + tracks->chi2(idx) + ); +#endif + return; + } + + // compute a pT-dependent chi2 cut + // default parameters: + // - chi2MaxPt = 10 GeV + // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } + // - chi2Scale = 30 for broken line fit, 45 for Riemann fit + // (see CAHitNtupletGeneratorGPU.cc) + float pt = std::min(tracks->pt(idx), cuts.chi2MaxPt); + float chi2Cut = cuts.chi2Scale * + (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); + // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) + if (3.f*tracks->chi2(idx) >= chi2Cut) { +#ifdef NTUPLE_DEBUG + printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", + idx, + tuples->size(idx), + tracks->pt(idx), + tracks->eta(idx), + 3.f*tracks->chi2(idx) + ); +#endif + return; + } + + // impose "region cuts" based on the fit results (phi, Tip, pt, 
cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (tuples->size(idx) > 3) ? cuts.quadruplet : cuts.triplet; + bool isOk = (std::abs(tracks->tip(idx)) < region.maxTip) and (tracks->pt(idx) > region.minPt) and + (std::abs(tracks->zip(idx)) < region.maxZip); + + if (isOk) { + quality[idx] = trackQuality::loose; + } +} + +__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernels::Counters *counters) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + continue; + if (quality[idx] != trackQuality::loose) + continue; + atomicAdd(&(counters->nGoodTracks), 1); + } +} + + +__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + continue; + if (quality[idx] != trackQuality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->countDirect(*h); + } +} + +__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + continue; + if (quality[idx] != trackQuality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fillDirect(*h, idx); + } +} + +__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, + TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer *__restrict__ hitDetIndices) { + + int first = blockDim.x * blockIdx.x + threadIdx.x; + // copy offsets + for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + hitDetIndices->off[idx] = tuples->off[idx]; + } + // fill hit indices + auto const & hh = *hhp; + auto nhits = hh.nHits(); + for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tuples->bins[idx]bins[idx] = hh.detectorIndex(tuples->bins[idx]); + } +} + +void CAHitNtupletGeneratorKernels::fillHitDetIndices(HitsOnCPU const &hh, TkSoA * tracks_d, cudaStream_t cudaStream) { + auto blockSize=128; + auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + + kernel_fillHitDetIndices<<>>(&tracks_d->hitIndices, hh.view(), &tracks_d->detIndices); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + + +__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ hitToTuple, + CAHitNtupletGeneratorKernels::Counters *counters) { + auto &c = *counters; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple->size(idx) == 0) + continue; + atomicAdd(&c.nUsedHits, 1); + if 
(hitToTuple->size(idx) > 1) + atomicAdd(&c.nDupHits, 1); + } +} + + +__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const * __restrict__ ptracks, + Quality *__restrict__ quality, + CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ phitToTuple) { + constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + // constexpr auto loose = trackQuality::loose; + + auto &hitToTuple = *phitToTuple; + auto const &foundNtuplets = *ptuples; + auto const & tracks = *ptracks; + + // auto const & hh = *hhp; + // auto l1end = hh.hitsLayerStart_d[1]; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + + for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; + + float mc = 10000.f; + uint16_t im = 60000; + uint32_t maxNh = 0; + + // find maxNh + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + maxNh = std::max(nh, maxNh); + } + // kill all tracks shorter than maxHn (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + if (maxNh != nh) + quality[*it] = dup; + } + + if (maxNh > 3) + continue; + // if (idx>=l1end) continue; // only for layer 1 + // for triplets choose best tip! + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } + } + // mark duplicates + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && it != im) + quality[it] = dup; //no race: simple assignment of the same constant + } + } // loop over hits +} + +__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const * __restrict__ ptracks, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ phitToTuple, + uint32_t maxPrint, int iev) { + auto const & foundNtuplets = *ptuples; + auto const & tracks = *ptracks; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < std::min(maxPrint, foundNtuplets.nbins()); i+=blockDim.x*gridDim.x) { + auto nh = foundNtuplets.size(i); + if (nh<3) continue; + printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n", + 10000*iev+i, + int(quality[i]), + nh, + tracks.charge(i), + tracks.pt(i), + tracks.eta(i), + tracks.phi(i), + tracks.tip(i), + tracks.zip(i), +// asinhf(fit_results[i].par(3)), + tracks.chi2(i), + *foundNtuplets.begin(i), + *(foundNtuplets.begin(i) + 1), + *(foundNtuplets.begin(i) + 2), + nh>3 ? int(*(foundNtuplets.begin(i) + 3)):-1, + nh>4 ? int(*(foundNtuplets.begin(i) + 4)):-1 + ); + } +} + + +void CAHitNtupletGeneratorKernels::launchKernels( + HitsOnCPU const &hh, + TkSoA * tracks_d, + cudaStream_t cudaStream) { + + auto maxNumberOfDoublets_ = CAConstants::maxNumberOfDoublets(); + + // these are pointer on GPU! 
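+  // the hit-index container and the quality column are not separate buffers: they are views into the device-side track SoA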
+ auto * tuples_d = &tracks_d->hitIndices; + auto * quality_d = (Quality*)(&tracks_d->m_quality); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + auto nthTot = 64; + auto stride = 4; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize; + auto rescale = numberOfBlocks / 65536; + blockSize *= (rescale + 1); + numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize; + assert(numberOfBlocks < 65536); + assert(blockSize > 0 && 0 == blockSize % 16); + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + + kernel_connect<<>>( + device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + cudaCheck(cudaGetLastError()); + + + if (nhits > 1 && m_params.earlyFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + fishbone<<>>( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + cudaCheck(cudaGetLastError()); + } + + + blockSize = 64; + numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize; + kernel_find_ntuplets<<>>(hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_, + tuples_d, + device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); + cudaCheck(cudaGetLastError()); + + if (m_params.doStats_) + kernel_mark_used<<>>(hh.view(), + device_theCells_.get(), + device_nCells_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + + blockSize = 128; + numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; + cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize; + kernel_earlyDuplicateRemover<<>>( + device_theCells_.get(), device_nCells_, tuples_d, quality_d); + cudaCheck(cudaGetLastError()); + + blockSize = 128; + numberOfBlocks = (CAConstants::maxTuples() + blockSize - 1) / blockSize; + kernel_countMultiplicity<<>>(tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + kernel_fillMultiplicity<<>>(tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudaCheck(cudaGetLastError()); + + if (nhits > 1 && m_params.lateFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + fishbone<<>>( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + cudaCheck(cudaGetLastError()); + } + + if (m_params.doStats_) { + numberOfBlocks = (std::max(nhits, 
maxNumberOfDoublets_) + blockSize - 1) / blockSize; + kernel_checkOverflows<<>>(tuples_d, + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + device_isOuterHitOfCell_.get(), + nhits, + counters_); + cudaCheck(cudaGetLastError()); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +} + +void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + edm::Service cs; + device_isOuterHitOfCell_ = cs->make_device_unique(std::max(1U,nhits), stream); + assert(device_isOuterHitOfCell_.get()); + { + int threadsPerBlock = 128; + // at least one block! + int blocks = ( std::max(1U,nhits) + threadsPerBlock - 1) / threadsPerBlock; + gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_, + device_theCellNeighborsContainer_.get(), + device_theCellTracks_, + device_theCellTracksContainer_.get()); + cudaCheck(cudaGetLastError()); + } + + device_theCells_ = cs->make_device_unique(CAConstants::maxNumberOfDoublets(), stream); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs=gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) nActualPairs = 15; + if (m_params.minHitsPerNtuplet_>3) { + nActualPairs = 13; + } + + assert(nActualPairs<=gpuPixelDoublets::nPairs); + int stride = 1; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; + int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock; + dim3 blks(1, blocks, 1); + dim3 thrs(stride, threadsPerBlock, 1); + gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZCut_, + m_params.doPhiCut_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +} + +void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, + TkSoA * tracks_d, + cudaStream_t cudaStream) { + // these are pointer on GPU! 
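+  // as in launchKernels, tuples and quality are views into the device-side track SoA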
+  auto const * tuples_d = &tracks_d->hitIndices;
+  auto * quality_d = (Quality*)(&tracks_d->m_quality);
+
+  auto blockSize = 64;
+
+  // classify tracks based on kinematics
+  auto numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize;
+  kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      tuples_d, tracks_d, m_params.cuts_, quality_d);
+  cudaCheck(cudaGetLastError());
+
+  if (m_params.lateFishbone_) {
+    // apply fishbone cleaning to good tracks
+    numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize;
+    kernel_fishboneCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        device_theCells_.get(), device_nCells_, quality_d);
+    cudaCheck(cudaGetLastError());
+  }
+
+  // remove duplicates (tracks that share a doublet)
+  numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize;
+  kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      device_theCells_.get(), device_nCells_, tuples_d, tracks_d);
+  cudaCheck(cudaGetLastError());
+
+  if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) {
+    // fill hit->track "map"
+    numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize;
+    kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        tuples_d, quality_d, device_hitToTuple_.get());
+    cudaCheck(cudaGetLastError());
+    cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream);
+    cudaCheck(cudaGetLastError());
+    kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        tuples_d, quality_d, device_hitToTuple_.get());
+    cudaCheck(cudaGetLastError());
+  }
+  if (m_params.minHitsPerNtuplet_ < 4) {
+    // remove duplicates (tracks that share a hit)
+    numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize;
+    kernel_tripletCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get());
+    cudaCheck(cudaGetLastError());
+  }
+  if (m_params.doStats_) {
+    // counters (add flag???)
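+    // These kernels only fill the monitoring counters: they run over the
+    // hit->tuple map filled above and over the tuple container, so the grids
+    // below are sized on HitToTuple::capacity() and maxNumberOfQuadruplets().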
+    numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize;
+    kernel_doStatsForHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitToTuple_.get(), counters_);
+    cudaCheck(cudaGetLastError());
+    numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize;
+    kernel_doStatsForTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, counters_);
+    cudaCheck(cudaGetLastError());
+  }
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+#ifdef DUMP_GPU_TK_TUPLES
+  static std::atomic<int> iev(0);
+  ++iev;
+  kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev);
+#endif
+
+}
+
+__global__ void kernel_printCounters(CAHitNtupletGeneratorKernels::Counters const *counters) {
+  auto const &c = *counters;
+  printf(
+      "||Counters | nEvents | nHits | nCells | nTuples | nFitTracks | nGoodTracks | nUsedHits | nDupHits | nKilledCells | "
+      "nEmptyCells | nZeroTrackCells ||\n");
+  printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
+         c.nEvents,
+         c.nHits,
+         c.nCells,
+         c.nTuples,
+         c.nFitTracks,
+         c.nGoodTracks,
+         c.nUsedHits,
+         c.nDupHits,
+         c.nKilledCells,
+         c.nEmptyCells,
+         c.nZeroTrackCells);
+  printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n",
+         c.nEvents,
+         c.nHits / double(c.nEvents),
+         c.nCells / double(c.nEvents),
+         c.nTuples / double(c.nEvents),
+         c.nFitTracks / double(c.nEvents),
+         c.nGoodTracks / double(c.nEvents),
+         c.nUsedHits / double(c.nEvents),
+         c.nDupHits / double(c.nEvents),
+         c.nKilledCells / double(c.nEvents),
+         c.nEmptyCells / double(c.nCells),
+         c.nZeroTrackCells / double(c.nCells));
+}
+
+void CAHitNtupletGeneratorKernels::printCounters(Counters const * counters) {
+  kernel_printCounters<<<1, 1>>>(counters);
+}
+
+
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
new file mode 100644
index 0000000000000..147ba98310c14
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -0,0 +1,186 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
+#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
+
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "GPUCACell.h"
+
+// #define DUMP_GPU_TK_TUPLES
+
+
+class CAHitNtupletGeneratorKernels {
+public:
+  // counters
+  struct Counters {
+    unsigned long long nEvents;
+    unsigned long long nHits;
+    unsigned long long nCells;
+    unsigned long long nTuples;
+    unsigned long long nFitTracks;
+    unsigned long long nGoodTracks;
+    unsigned long long nUsedHits;
+    unsigned long long nDupHits;
+    unsigned long long nKilledCells;
+    unsigned long long nEmptyCells;
+    unsigned long long nZeroTrackCells;
+  };
+
+  using HitsOnGPU = TrackingRecHit2DSOAView;
+  using HitsOnCPU = TrackingRecHit2DCUDA;
+
+  using HitToTuple = CAConstants::HitToTuple;
+  using TupleMultiplicity = CAConstants::TupleMultiplicity;
+
+  using Quality = pixelTrack::Quality;
+  using TkSoA = pixelTrack::TrackSoA;
+  using HitContainer = pixelTrack::HitContainer;
+
+
+  struct QualityCuts {
+    // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3])))
+    float chi2Coeff[4];
+    float chi2MaxPt;  // GeV
+    float chi2Scale;
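+    // Worked example with the default values used further down (chi2Coeff =
+    // {0.68177776, 0.74609577, -0.08035491, 0.00315399}, chi2Scale = 30,
+    // chi2MaxPt = 10 GeV): for a track with pT = 2 GeV the polynomial,
+    // evaluated in Horner form,
+    //   0.68177776 + 2*(0.74609577 + 2*(-0.08035491 + 2*0.00315399)) ~ 1.878
+    // gives a cut of about 30 * 1.878 ~ 56; tracks harder than chi2MaxPt are
+    // evaluated at pT = chi2MaxPt.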
+ struct region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + region triplet; + region quadruplet; + }; + + + // params + struct Params { + Params(uint32_t minHitsPerNtuplet, + bool useRiemannFit, + bool fit5as4, + bool includeJumpingForwardDoublets, + bool earlyFishbone, + bool lateFishbone, + bool idealConditions, + bool doStats, + bool doClusterCut, + bool doZCut, + bool doPhiCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float hardCurvCut, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet, + QualityCuts const& cuts) + : minHitsPerNtuplet_(minHitsPerNtuplet), + useRiemannFit_(useRiemannFit), + fit5as4_(fit5as4), + includeJumpingForwardDoublets_(includeJumpingForwardDoublets), + earlyFishbone_(earlyFishbone), + lateFishbone_(lateFishbone), + idealConditions_(idealConditions), + doStats_(doStats), + doClusterCut_(doClusterCut), + doZCut_(doZCut), + doPhiCut_(doPhiCut), + ptmin_(ptmin), + CAThetaCutBarrel_(CAThetaCutBarrel), + CAThetaCutForward_(CAThetaCutForward), + hardCurvCut_(hardCurvCut), + dcaCutInnerTriplet_(dcaCutInnerTriplet), + dcaCutOuterTriplet_(dcaCutOuterTriplet), + cuts_(cuts) { } + + const uint32_t minHitsPerNtuplet_; + const bool useRiemannFit_; + const bool fit5as4_; + const bool includeJumpingForwardDoublets_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool idealConditions_; + const bool doStats_; + const bool doClusterCut_; + const bool doZCut_; + const bool doPhiCut_; + const float ptmin_; + const float CAThetaCutBarrel_; + const float CAThetaCutForward_; + const float hardCurvCut_; + const float dcaCutInnerTriplet_; + const float dcaCutOuterTriplet_; + + // quality cuts + QualityCuts cuts_ + { + // polynomial coefficients for the pT-dependent chi2 cut + { 0.68177776, 0.74609577, -0.08035491, 0.00315399 }, + // max pT used to determine the chi2 cut + 10., + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + 30., + // regional cuts for triplets + { + 0.3, // |Tip| < 0.3 cm + 0.5, // pT > 0.5 GeV + 12.0 // |Zip| < 12.0 cm + }, + // regional cuts for quadruplets + { + 0.5, // |Tip| < 0.5 cm + 0.3, // pT > 0.3 GeV + 12.0 // |Zip| < 12.0 cm + } + }; + + }; // Params + + + CAHitNtupletGeneratorKernels(Params const & params) : m_params(params){} + ~CAHitNtupletGeneratorKernels() = default; + + TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } + + void launchKernels(HitsOnCPU const& hh, TkSoA * tuples_d, cudaStream_t cudaStream); + + void classifyTuples(HitsOnCPU const& hh, TkSoA * tuples_d, cudaStream_t cudaStream); + + void fillHitDetIndices(HitsOnCPU const &hh, TkSoA * tuples_d, cudaStream_t cudaStream); + + void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream); + void allocateOnGPU(cuda::stream_t<>& stream); + void cleanup(cudaStream_t cudaStream); + + static void printCounters(Counters const * counters); + Counters* counters_ = nullptr; + + +private: + + // workspace + CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr; + cudautils::device::unique_ptr device_theCellNeighborsContainer_; + CAConstants::CellTracksVector* device_theCellTracks_ = nullptr; + cudautils::device::unique_ptr device_theCellTracksContainer_; + + cudautils::device::unique_ptr device_theCells_; + cudautils::device::unique_ptr device_isOuterHitOfCell_; + uint32_t* device_nCells_ = nullptr; + + cudautils::device::unique_ptr device_hitToTuple_; + AtomicPairCounter* device_hitToTuple_apc_ = nullptr; + + AtomicPairCounter* 
device_hitTuple_apc_ = nullptr; + + cudautils::device::unique_ptr device_tupleMultiplicity_; + + uint8_t * device_tmws_; + + cudautils::device::unique_ptr device_storage_; + // params + Params const & m_params; +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu new file mode 100644 index 0000000000000..126b6237bd0d7 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu @@ -0,0 +1,40 @@ +#include "CAHitNtupletGeneratorKernels.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +void CAHitNtupletGeneratorKernels::allocateOnGPU(cuda::stream_t<>& stream) { + ////////////////////////////////////////////////////////// + // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) + ////////////////////////////////////////////////////////// + + edm::Service cs; + + /* not used at the moment + cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); + cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); + */ + + device_hitToTuple_ = cs->make_device_unique(stream); + + device_tupleMultiplicity_ = cs->make_device_unique(stream); + + auto storageSize = 3+(std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize())+sizeof(AtomicPairCounter::c_type))/sizeof(AtomicPairCounter::c_type); + + device_storage_ = cs->make_device_unique(storageSize,stream); + + device_hitTuple_apc_ = (AtomicPairCounter*)device_storage_.get(); + device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get()+1; + device_nCells_ = (uint32_t *)(device_storage_.get()+2); + device_tmws_ = (uint8_t*)(device_storage_.get()+3); + + assert(device_tmws_+std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= (uint8_t*)(device_storage_.get()+storageSize)); + + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream.id())); + cudautils::launchZero(device_tupleMultiplicity_.get(), stream.id()); + cudautils::launchZero(device_hitToTuple_.get(), stream.id()); // we may wish to keep it in the edm... 
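+  // Layout of the single device_storage_ buffer (one slot = AtomicPairCounter::c_type):
+  //   slot 0     device_hitTuple_apc_    - pair counter for the tuple container
+  //   slot 1     device_hitToTuple_apc_  - pair counter for the hit->tuple map
+  //   slot 2     device_nCells_          - uint32_t stored at the start of the slot
+  //   slot 3...  device_tmws_            - scratch area for launchFinalize, sized as
+  //                                        max(TupleMultiplicity::wsSize(), HitToTuple::wsSize())
+  // The assert above checks that the scratch area stays within the buffer.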
+} + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc new file mode 100644 index 0000000000000..08cafc7e8fc09 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -0,0 +1,164 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +#include +#include +#include +#include + +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/EDMException.h" +#include "FWCore/Utilities/interface/isFinite.h" +#include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" + +#include "CAHitNtupletGeneratorOnGPU.h" + +namespace { + + template + T sqr(T x) { + return x * x; + } + + CAHitNtupletGeneratorKernels::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) { + auto coeff = pset.getParameter>("chi2Coeff"); + if (coeff.size() != 4) { + throw edm::Exception(edm::errors::Configuration, "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 4 elements"); + } + return CAHitNtupletGeneratorKernels::QualityCuts { + // polynomial coefficients for the pT-dependent chi2 cut + { (float) coeff[0], (float) coeff[1], (float) coeff[2], (float) coeff[3] }, + // max pT used to determine the chi2 cut + (float) pset.getParameter("chi2MaxPt"), + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + (float) pset.getParameter("chi2Scale"), + // regional cuts for triplets + { + (float) pset.getParameter("tripletMaxTip"), + (float) pset.getParameter("tripletMinPt"), + (float) pset.getParameter("tripletMaxZip") + }, + // regional cuts for quadruplets + { + (float) pset.getParameter("quadrupletMaxTip"), + (float) pset.getParameter("quadrupletMinPt"), + (float) pset.getParameter("quadrupletMaxZip") + } + }; + } + +} // namespace + +using namespace std; + +CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet &cfg, edm::ConsumesCollector &iC) + : m_params(cfg.getParameter("minHitsPerNtuplet"), + cfg.getParameter("useRiemannFit"), + cfg.getParameter("fit5as4"), + cfg.getParameter("includeJumpingForwardDoublets"), + cfg.getParameter("earlyFishbone"), + cfg.getParameter("lateFishbone"), + cfg.getParameter("idealConditions"), + cfg.getParameter("fillStatistics"), + cfg.getParameter("doClusterCut"), + cfg.getParameter("doZCut"), + cfg.getParameter("doPhiCut"), + cfg.getParameter("ptmin"), + cfg.getParameter("CAThetaCutBarrel"), + cfg.getParameter("CAThetaCutForward"), + cfg.getParameter("hardCurvCut"), + cfg.getParameter("dcaCutInnerTriplet"), + cfg.getParameter("dcaCutOuterTriplet"), + makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) { + +#ifdef DUMP_GPU_TK_TUPLES + printf("TK: %s %s % %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "tid", "qual", "nh","charge","pt","eta","phi","tip","zip","chi2", + "h1","h2","h3","h4","h5"); +#endif + + cudaCheck(cudaMalloc(&m_counters, sizeof(Counters))); + cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters))); + +} + +CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU(){ + if (m_params.doStats_) { + // crash on multi-gpu processes + CAHitNtupletGeneratorKernels::printCounters(m_counters); + } + cudaFree(m_counters); +} + + +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription &desc) { + // 87 cm/GeV = 
1/(3.8T * 0.3) + // take less than radius given by the hardPtCut and reject everything below + // auto hardCurvCut = 1.f/(0.35 * 87.f); + desc.add("ptmin", 0.9f)->setComment("Cut on minimum pt"); + desc.add("CAThetaCutBarrel", 0.002f)->setComment("Cut on RZ alignement for Barrel"); + desc.add("CAThetaCutForward", 0.003f)->setComment("Cut on RZ alignment for Forward"); + desc.add("hardCurvCut", 1.f / (0.35 * 87.f))->setComment("Cut on minimum curvature"); + desc.add("dcaCutInnerTriplet", 0.15f)->setComment("Cut on origin radius when the inner hit is on BPix1"); + desc.add("dcaCutOuterTriplet", 0.25f)->setComment("Cut on origin radius when the outer hit is on BPix1"); + desc.add("earlyFishbone", true); + desc.add("lateFishbone", false); + desc.add("idealConditions", true); + desc.add("fillStatistics", false); + desc.add("minHitsPerNtuplet", 4); + desc.add("includeJumpingForwardDoublets", false); + desc.add("fit5as4", true); + desc.add("doClusterCut", true); + desc.add("doZCut", true); + desc.add("doPhiCut", true); + desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); + trackQualityCuts.add>("chi2Coeff", { 0.68177776, 0.74609577, -0.08035491, 0.00315399 }) + ->setComment("Polynomial coefficients to derive the pT-dependent chi2 cut"); + trackQualityCuts.add("chi2Scale", 30.)->setComment("Factor to multiply the pT-dependent chi2 cut (currently: 30 for the broken line fit, 45 for the Riemann fit)"); + trackQualityCuts.add("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV"); + trackQualityCuts.add("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm"); + trackQualityCuts.add("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm"); + trackQualityCuts.add("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV"); + trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); + trackQualityCuts.add("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm"); + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment("Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply \"region cuts\" based on the fit results (pT, Tip, Zip)."); +} + + +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, + float bfield, + cuda::stream_t<>& stream) const { + edm::Service cs; + PixelTrackHeterogeneous tracks(cs->make_device_unique(stream)); + + auto * soa = tracks.get(); + + CAHitNtupletGeneratorKernels kernels(m_params); + kernels.counters_ = m_counters; + HelixFitOnGPU fitter(bfield,m_params.fit5as4_); + + kernels.allocateOnGPU(stream); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + + kernels.buildDoublets(hits_d, stream); + kernels.launchKernels(hits_d, soa, stream.id()); + kernels.fillHitDetIndices(hits_d, soa, stream.id()); // in principle needed only if Hits not "available" + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernels(hits_d, hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } else { + fitter.launchBrokenLineKernels(hits_d, hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } + kernels.classifyTuples(hits_d, soa, stream.id()); + + return tracks; +} + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h new file mode 100644 index 0000000000000..169f591c48e45 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -0,0 +1,73 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h + +#include +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" + + +#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" + +#include "CAHitNtupletGeneratorKernels.h" +#include "HelixFitOnGPU.h" + +// FIXME (split header???) +#include "GPUCACell.h" + +namespace edm { + class Event; + class EventSetup; + class ParameterSetDescription; +} // namespace edm + +class CAHitNtupletGeneratorOnGPU { +public: + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + using hindex_type = TrackingRecHit2DSOAView::hindex_type; + + using Quality = pixelTrack::Quality; + using OutputSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + using Tuple = HitContainer; + +public: + CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC) + : CAHitNtupletGeneratorOnGPU(cfg, iC) {} + CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC); + + ~CAHitNtupletGeneratorOnGPU(); + + static void fillDescriptions(edm::ParameterSetDescription& desc); + static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } + + PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, + float bfield, + cuda::stream_t<>& stream) const; + +private: + + void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream) const; + + void hitNtuplets(HitsOnCPU const& hh, + const edm::EventSetup& es, + bool useRiemannFit, + cuda::stream_t<>& cudaStream); + + void cleanup(cudaStream_t stream); + + void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cuda::stream_t<>& cudaStream) const; + + + CAHitNtupletGeneratorKernels::Params m_params; + + using Counters = CAHitNtupletGeneratorKernels::Counters; + CAHitNtupletGeneratorKernels::Counters * m_counters = nullptr; + +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 2beb8b889c94c..f1709f7ae7063 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -15,7 +15,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -#include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CAConstants.h" class GPUCACell { public: @@ -33,10 +34,9 @@ class GPUCACell { using TmpTuple = GPU::VecArray; - using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; - - using Quality = pixelTuplesHeterogeneousProduct::Quality; - static constexpr auto 
bad = pixelTuplesHeterogeneousProduct::bad; + using HitContainer = pixelTrack::HitContainer; + using Quality = trackQuality::Quality; + static constexpr auto bad = trackQuality::bad; GPUCACell() = default; #ifdef __CUDACC__ @@ -249,7 +249,7 @@ class GPUCACell { __device__ inline void find_ntuplets(Hits const& hh, GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, - TuplesOnGPU::Container& foundNtuplets, + HitContainer & foundNtuplets, AtomicPairCounter& apc, Quality* __restrict__ quality, TmpTuple& tmpNtuplet, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index a374c975ef4b6..c071cdd347808 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,16 +1,16 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const* tuples, - TupleMultiplicity const* tupleMultiplicity, - Rfit::helix_fit* helix_fit_results) { +void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, + TupleMultiplicity const *tupleMultiplicity, + OutputSoA *helix_fit_results) { tuples_d = tuples; tupleMultiplicity_d = tupleMultiplicity; - helix_fit_results_d = helix_fit_results; + outputSoa_d = helix_fit_results; assert(tuples_d); assert(tupleMultiplicity_d); - assert(helix_fit_results_d); + assert(outputSoa_d); } void HelixFitOnGPU::deallocateOnGPU() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 022a6ba5ae623..c9688fc43418c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -4,10 +4,9 @@ #include #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" -#include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" - -class TrackingRecHit2DSOAView; -class TrackingRecHit2DCUDA; +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CAConstants.h" namespace Rfit { // in case of memory issue can be made smaller @@ -38,10 +37,12 @@ class HelixFitOnGPU { using HitsOnGPU = TrackingRecHit2DSOAView; using HitsOnCPU = TrackingRecHit2DCUDA; - using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; + using Tuples = pixelTrack::HitContainer; + using OutputSoA = pixelTrack::TrackSoA; + using TupleMultiplicity = CAConstants::TupleMultiplicity; - explicit HelixFitOnGPU(bool fit5as4) : fit5as4_(fit5as4) {} + explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } @@ -54,19 +55,19 @@ class HelixFitOnGPU { uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); - void allocateOnGPU(TuplesOnGPU::Container const *tuples, + void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, - Rfit::helix_fit *helix_fit_results); + OutputSoA * outputSoA); void deallocateOnGPU(); private: static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); // fowarded - TuplesOnGPU::Container const *tuples_d = nullptr; + Tuples const *tuples_d = nullptr; TupleMultiplicity const *tupleMultiplicity_d = nullptr; - double bField_; - Rfit::helix_fit *helix_fit_results_d = nullptr; + OutputSoA * 
outputSoa_d; + float bField_; const bool fit5as4_; }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 4aea729e913a6..38f51e676a9ca 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -17,12 +17,13 @@ #include "HelixFitOnGPU.h" using HitsOnGPU = TrackingRecHit2DSOAView; -using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; using namespace Eigen; template -__global__ void kernelFastFit(TuplesOnGPU::Container const *__restrict__ foundNtuplets, +__global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, HitsOnGPU const *__restrict__ hhp, @@ -51,17 +52,17 @@ __global__ void kernelFastFit(TuplesOnGPU::Container const *__restrict__ foundNt return; // get it from the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); - assert(helix_start < foundNtuplets->nbins()); + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); + assert(tkid < foundNtuplets->nbins()); - assert(foundNtuplets->size(helix_start) == nHits); + assert(foundNtuplets->size(tkid) == nHits); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit + local_start); Rfit::Map6xNf hits_ge(phits_ge + local_start); // Prepare data structure - auto const *hitId = foundNtuplets->begin(helix_start); + auto const *hitId = foundNtuplets->begin(tkid); for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); @@ -102,7 +103,7 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict return; // get it for the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit_input + local_start); @@ -116,7 +117,7 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict circle_fit[local_start] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); #ifdef RIEMANN_DEBUG -// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", helix_start, +// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid, // circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); #endif } @@ -125,7 +126,7 @@ template __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double B, - Rfit::helix_fit *results, + OutputSoA *results, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, @@ -144,7 +145,7 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ return; // get it for the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit_input + local_start); @@ -152,39 +153,31 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ auto const &line_fit = Rfit::Line_fit(hits, hits_ge, 
circle_fit[local_start], fast_fit, B, true); - par_uvrtopak(circle_fit[local_start], B, true); + Rfit::fromCircleToPerigee(circle_fit[local_start]); - // Grab helix_fit from the proper location in the output vector - auto &helix = results[helix_start]; - helix.par << circle_fit[local_start].par, line_fit.par; - - // TODO: pass properly error booleans - - helix.cov = Rfit::Matrix5d::Zero(); - helix.cov.block(0, 0, 3, 3) = circle_fit[local_start].cov; - helix.cov.block(3, 3, 2, 2) = line_fit.cov; - - helix.q = circle_fit[local_start].q; - helix.chi2_circle = circle_fit[local_start].chi2; - helix.chi2_line = line_fit.chi2; + results->stateAtBS.copyFromCircle(circle_fit[local_start].par,circle_fit[local_start].cov, + line_fit.par,line_fit.cov,1.f/float(B),tkid); + results->pt(tkid) = B/std::abs(circle_fit[local_start].par(2)); + results->eta(tkid) = asinhf(line_fit.par(0)); + results->chi2(tkid) = (circle_fit[local_start].chi2+line_fit.chi2)/(2*N-5); #ifdef RIEMANN_DEBUG printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N, nHits, - helix_start, + tkid, circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); - printf("kernelLineFit line.par(0,1): %d %f,%f\n", helix_start, line_fit.par(0), line_fit.par(1)); + printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", - helix.chi2_circle, - helix.chi2_line, - helix.cov(0, 0), - helix.cov(1, 1), - helix.cov(2, 2), - helix.cov(3, 3), - helix.cov(4, 4)); + circle_fit[local_start].chi2, + line_fit.chi2, + circle_fit[local_start].cov(0, 0), + circle_fit[local_start].cov(1, 1), + circle_fit[local_start].cov(2, 2), + line_fit.cov(0, 0), + line_fit.cov(1, 1)); #endif } @@ -234,7 +227,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, kernelLineFit<3><<>>(tupleMultiplicity_d, 3, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -266,7 +259,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, kernelLineFit<4><<>>(tupleMultiplicity_d, 4, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -299,7 +292,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, kernelLineFit<4><<>>(tupleMultiplicity_d, 5, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -331,7 +324,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, kernelLineFit<5><<>>(tupleMultiplicity_d, 5, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc new file mode 100644 index 0000000000000..b10de55871185 --- /dev/null +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -0,0 +1,177 @@ +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "DataFormats/GeometrySurface/interface/Plane.h" +#include "DataFormats/TrajectorySeed/interface/TrajectorySeedCollection.h" +#include "DataFormats/TrackingRecHit/interface/InvalidTrackingRecHit.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include 
"FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "Geometry/CommonDetUnit/interface/GeomDet.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include "TrackingTools/MaterialEffects/interface/PropagatorWithMaterial.h" +#include "TrackingTools/Records/interface/TrackingComponentsRecord.h" +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" +#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" +#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h" +#include "TrackingTools/TrajectoryState/interface/TrajectoryStateTransform.h" + +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" + +/* + produces seeds directly from cuda produced tuples +*/ +class SeedProducerFromSoA : public edm::global::EDProducer<> { +public: + + explicit SeedProducerFromSoA(const edm::ParameterSet &iConfig); + ~SeedProducerFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT tBeamSpot_; + edm::EDGetTokenT tokenTrack_; + + int32_t minNumberOfHits_; +}; + +SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet &iConfig) : + tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), + tokenTrack_(consumes(iConfig.getParameter("src"))), + minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) + +{ + produces(); +} + +void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("src", edm::InputTag("pixelTrackSoA")); + desc.add("minNumberOfHits", 0); + + descriptions.addWithDefaultLabel(desc); +} + +void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + + // std::cout << "Converting gpu helix to trajectory seed" << std::endl; + auto result = std::make_unique(); + + + edm::ESHandle fieldESH; + iSetup.get().get(fieldESH); + + edm::ESHandle tracker; + iSetup.get().get(tracker); + auto const & dus = tracker->detUnits(); + + edm::ESHandle propagatorHandle; + iSetup.get().get("PropagatorWithMaterial",propagatorHandle); + const Propagator* propagator = &(*propagatorHandle); + + edm::ESHandle httopo; + iSetup.get().get(httopo); + + + const auto &bsh = iEvent.get(tBeamSpot_); + // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() 
<< ' ' << bsh.z0() << std::endl; + GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); + + const auto & tsoa = *(iEvent.get(tokenTrack_)); + + auto const * quality = tsoa.qualityData(); + auto const & fit = tsoa.stateAtBS; + auto const & detIndices = tsoa.detIndices; + auto maxTracks = tsoa.stride(); + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... + + auto q = quality[it]; + if (q != trackQuality::loose) + continue; // FIXME + if (nHits< minNumberOfHits_) continue; + ++nt; + + // fill hits with invalid just to hold the detId + auto b = detIndices.begin(it); + edm::OwnVector hits; + for (int iHit = 0; iHit < nHits; ++iHit) { + auto const * det = dus[*(b+iHit)]; + // FIXME at some point get a proper type ... + hits.push_back(new InvalidTrackingRecHit(*det,TrackingRecHit::bad)); + } + + + // mind: this values are respect the beamspot! + + float phi = tsoa.phi(it); + + Rfit::Vector5d ipar,opar; + Rfit::Matrix5d icov,ocov; + fit.copyToDense(ipar,icov,it); + Rfit::transformToPerigeePlane(ipar,icov,opar,ocov); + + LocalTrajectoryParameters lpar(opar(0),opar(1),opar(2),opar(3),opar(4),1.); + AlgebraicSymMatrix55 m; + for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = ocov(i,j); + + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot( + sp, -cp, 0, + 0, 0, -1.f, + cp, sp, 0); + + Plane impPointPlane(bs,rot); + GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), + impPointPlane.toGlobal(lpar.momentum()),lpar.charge(),fieldESH.product()); + + JacobianLocalToCurvilinear jl2c(impPointPlane,lpar,*fieldESH.product()); + + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m); + + FreeTrajectoryState fts(gp, CurvilinearTrajectoryError(mo)); + + auto const & lastHit = hits.back(); + + TrajectoryStateOnSurface outerState = propagator->propagate(fts, *lastHit.surface()); + + if (!outerState.isValid()){ + edm::LogError("SeedFromGPU")<<" was trying to create a seed from:\n"<emplace_back(pTraj, hits, alongMomentum); + + } + + iEvent.put(std::move(result)); +} + +DEFINE_FWK_MODULE(SeedProducerFromSoA); From 3ae0df1c42a8c72c9f21c0d1c695b91869808b7c Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 25 Aug 2019 14:04:02 +0200 Subject: [PATCH 055/102] Implement full Pixel SoA workflow on CPU (cms-patatrack#385) --- .../Track/interface/TrajectoryStateSoA.h | 6 +- .../python/customizePixelTracksSoAonCPU.py | 36 + .../plugins/PixelTrackProducerFromSoA.cc | 38 +- .../plugins/BrokenLineFitOnGPU.cc | 96 +++ .../plugins/BrokenLineFitOnGPU.cu | 201 +---- .../plugins/BrokenLineFitOnGPU.h | 185 +++++ .../PixelTriplets/plugins/BuildFile.xml | 1 - .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 47 +- .../plugins/CAHitNtupletGeneratorKernels.cc | 206 ++++++ .../plugins/CAHitNtupletGeneratorKernels.cu | 695 +----------------- .../plugins/CAHitNtupletGeneratorKernels.h | 64 +- .../CAHitNtupletGeneratorKernelsAlloc.cc | 1 + .../CAHitNtupletGeneratorKernelsAlloc.cu | 41 +- .../CAHitNtupletGeneratorKernelsAlloc.h | 54 ++ .../CAHitNtupletGeneratorKernelsImpl.h | 614 ++++++++++++++++ .../plugins/CAHitNtupletGeneratorOnGPU.cc | 74 +- .../plugins/CAHitNtupletGeneratorOnGPU.h | 19 +- .../PixelTriplets/plugins/GPUCACell.h | 6 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 16 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cc | 151 ++++ .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 211 +----- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 184 
+++++ .../PixelTriplets/plugins/gpuFishbone.h | 34 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 18 +- .../plugins/gpuPixelDoubletsAlgos.h | 25 +- 25 files changed, 1826 insertions(+), 1197 deletions(-) create mode 100644 RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h index a6553ff96cb0b..3aca896475ac0 100644 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h @@ -21,7 +21,7 @@ struct TrajectoryStateSoA { template - __host__ __device__ + __host__ __device__ inline void copyFromCircle(V3 const & cp, M3 const & ccov, V2 const & lp, M2 const & lcov, float b, int32_t i) { state(i) << cp.template cast(), lp.template cast(); state(i)(2) *=b; @@ -42,14 +42,14 @@ struct TrajectoryStateSoA { template - __host__ __device__ + __host__ __device__ inline void copyFromDense(V5 const & v, M5 const & cov, int32_t i) { state(i) = v.template cast(); for(int j=0, ind=0; j<5; ++j) for (auto k=j;k<5;++k) covariance(i)(ind++) = cov(j,k); } template - __host__ __device__ + __host__ __device__ inline void copyToDense(V5 & v, M5 & cov, int32_t i) const { v = state(i).template cast(); for(int j=0, ind=0; j<5; ++j) { diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py new file mode 100644 index 0000000000000..9afe654de6c32 --- /dev/null +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py @@ -0,0 +1,36 @@ +import FWCore.ParameterSet.Config as cms + +def customizePixelTracksSoAonCPU(process) : + + process.load('RecoLocalTracker/SiPixelRecHits/siPixelRecHitHostSoA_cfi') + process.load('RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi') + process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi') + + process.pixelTrackSoA = process.caHitNtupletCUDA.clone() + process.pixelTrackSoA.onGPU = False + process.pixelTrackSoA.pixelRecHitSrc = 'siPixelRecHitHostSoA' + process.pixelVertexSoA = process.pixelVertexCUDA.clone() + process.pixelVertexSoA.onGPU = False + process.pixelVertexSoA.pixelTrackSrc = 'pixelTrackSoA' + + process.load('RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi') + process.pixelTracks = process.pixelTrackProducerFromSoA.clone() + process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi') + process.pixelVertices = process.pixelVertexFromSoA.clone() + process.pixelTracks.pixelRecHitLegacySrc = 'siPixelRecHitHostSoA' + process.siPixelRecHitHostSoA.convertToLegacy = True + + process.reconstruction_step += process.siPixelRecHitHostSoA+process.pixelTrackSoA+process.pixelVertexSoA + + return process + +def 
customizePixelTracksSoAonCPUForProfiling(process) : + + process.MessageLogger.cerr.FwkReport.reportEvery = 100 + + process = customizePixelTracksSoAonCPU(process) + process.siPixelRecHitHostSoA.convertToLegacy = False + + process.TkSoA = cms.Path(process.offlineBeamSpot+process.siPixelDigis+process.siPixelClustersPreSplitting+process.siPixelRecHitHostSoA+process.pixelTrackSoA+process.pixelVertexSoA) + process.schedule = cms.Schedule(process.TkSoA) + return process diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 284bcfc2ebb51..b0987b4c4cc2f 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -35,12 +35,12 @@ #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "storeTracks.h" -#include "CUDADataFormats/Common/interface/ArrayShadow.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" /** * This class creates "leagcy" reco::Track - * objects from the output of GPU CA. + * objects from the output of SoA CA. */ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { public: @@ -52,8 +52,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); - using HitModuleStart = std::array; - using HMSstorage = ArrayShadow; +// using HitModuleStart = std::array; + using HMSstorage = HostProduct; private: @@ -64,7 +64,7 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { edm::EDGetTokenT cpuHits_; edm::EDGetTokenT hmsToken_; - int32_t minNumberOfHits_; + int32_t const minNumberOfHits_; }; PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) : @@ -111,12 +111,6 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEve // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); - edm::Handle hhms; - iEvent.getByToken(hmsToken_,hhms); - auto const & hitsModuleStart = *hhms; - - auto fc = hitsModuleStart.data; - edm::Handle gh; iEvent.getByToken(cpuHits_, gh); auto const &rechits = *gh; @@ -124,15 +118,21 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEve auto const &rcs = rechits.data(); auto nhits = rcs.size(); hitmap.resize(nhits,nullptr); + + edm::Handle hhms; + iEvent.getByToken(hmsToken_,hhms); + auto const * hitsModuleStart = (*hhms).get(); + auto fc = hitsModuleStart; + for (auto const &h : rcs) { - auto const &thit = static_cast(h); - auto detI = thit.det()->index(); - auto const &clus = thit.firstClusterRef(); - assert(clus.isPixel()); - auto i = fc[detI] + clus.pixelCluster().originalId(); - assert(i < nhits); - assert(nullptr==hitmap[i]); - hitmap[i] = &h; + auto const &thit = static_cast(h); + auto detI = thit.det()->index(); + auto const &clus = thit.firstClusterRef(); + assert(clus.isPixel()); + auto i = fc[detI] + clus.pixelCluster().originalId(); + if(i >= hitmap.size()) hitmap.resize(i+256,nullptr); // only in case of hit overflow in one module + assert(nullptr==hitmap[i]); + hitmap[i] = &h; } std::vector hits; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc new file mode 100644 index 0000000000000..7f94fd05ece77 --- /dev/null +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -0,0 +1,96 @@ +#include "BrokenLineFitOnGPU.h" + +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const * hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples) { + assert(tuples_d); + + // Fit internals + auto hitsGPU_ = std::make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto fast_fit_resultsGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + kernelBLFastFit<3>(tuples_d, + tupleMultiplicity_d, + hv, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + + kernelBLFit<3>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + + // fit quads + kernelBLFastFit<4>(tuples_d, + tupleMultiplicity_d, + hv, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + + kernelBLFit<4>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + + if (fit5as4_) { + // fit penta (only first 4) + kernelBLFastFit<4>(tuples_d, + tupleMultiplicity_d, + hv, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + + kernelBLFit<4>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } else { + // fit penta (all 5) + kernelBLFastFit<5>(tuples_d, + tupleMultiplicity_d, + hv, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + + kernelBLFit<5>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } + + } // loop on concurrent fits +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index bdfb835a02f33..27d8b2022211f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,183 +1,6 @@ -// -// Author: Felice Pantaleo, CERN -// +#include "BrokenLineFitOnGPU.h" -#include - -#include - -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" - -#include "HelixFitOnGPU.h" - -using HitsOnGPU = TrackingRecHit2DSOAView; -using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; - -using namespace Eigen; - -// #define BL_DUMP_HITS - -template -__global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, - CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitsOnGPU const *__restrict__ hhp, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { - constexpr uint32_t 
hitsInFit = N; - - assert(hitsInFit <= nHits); - - assert(pfast_fit); - assert(foundNtuplets); - assert(tupleMultiplicity); - - // look in bin for this hit multiplicity - auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - -#ifdef BROKENLINE_DEBUG - if (0 == local_start) { - printf("%d total Ntuple\n",foundNtuplets->nbins()); - printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); - } -#endif - - auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) - return; - - // get it from the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); - assert(tkid < foundNtuplets->nbins()); - - assert(foundNtuplets->size(tkid) == nHits); - - Rfit::Map3xNd hits(phits + local_start); - Rfit::Map4d fast_fit(pfast_fit + local_start); - Rfit::Map6xNf hits_ge(phits_ge + local_start); - -#ifdef BL_DUMP_HITS - __shared__ int done; - done = 0; - __syncthreads(); - bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1)); -#endif - - // Prepare data structure - auto const *hitId = foundNtuplets->begin(tkid); - for (unsigned int i = 0; i < hitsInFit; ++i) { - auto hit = hitId[i]; - float ge[6]; - hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); -#ifdef BL_DUMP_HITS - if (dump) { - printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", - tkid, - hhp->detectorIndex(hit), - i, - hhp->xGlobal(hit), - hhp->yGlobal(hit), - hhp->zGlobal(hit)); - printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", - tkid, - hhp->detetectorIndex(hit), - i, - ge[0], - ge[1], - ge[2], - ge[3], - ge[4], - ge[5]); - } -#endif - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); - hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; - } - BrokenLine::BL_Fast_fit(hits, fast_fit); - - // no NaN here.... - assert(fast_fit(0) == fast_fit(0)); - assert(fast_fit(1) == fast_fit(1)); - assert(fast_fit(2) == fast_fit(2)); - assert(fast_fit(3) == fast_fit(3)); -} - -template -__global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - double B, - OutputSoA *results, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { - assert(N <= nHits); - - assert(results); - assert(pfast_fit); - - // same as above... 
- - // look in bin for this hit multiplicity - auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) - return; - - // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); - - Rfit::Map3xNd hits(phits + local_start); - Rfit::Map4d fast_fit(pfast_fit + local_start); - Rfit::Map6xNf hits_ge(phits_ge + local_start); - - BrokenLine::PreparedBrokenLineData data; - Rfit::Matrix3d Jacob; - - BrokenLine::karimaki_circle_fit circle; - Rfit::line_fit line; - - BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); - BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); - BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); - - results->stateAtBS.copyFromCircle(circle.par,circle.cov,line.par,line.cov,1.f/float(B),tkid); - results->pt(tkid) = float(B)/float(std::abs(circle.par(2))); - results->eta(tkid) = asinhf(line.par(0)); - results->chi2(tkid) = (circle.chi2+line.chi2)/(2*N-5); - -#ifdef BROKENLINE_DEBUG - if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) - printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2); - printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", - N, - nHits, - tkid, - circle.par(0), - circle.par(1), - circle.par(2)); - printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); - printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", - circle.chi2, - line.chi2, - circle.cov(0, 0), - circle.cov(1, 1), - circle.cov(2, 2), - line.cov(0, 0), - line.cov(1, 1)); -#endif -} - -void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const * hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cuda::stream_t<> &stream) { @@ -199,7 +22,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, // fit triplets kernelBLFastFit<3><<>>(tuples_d, tupleMultiplicity_d, - hh.view(), + hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -218,9 +41,9 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, cudaCheck(cudaGetLastError()); // fit quads - kernelBLFastFit<4><<>>(tuples_d, + kernelBLFastFit<4><<>>(tuples_d, tupleMultiplicity_d, - hh.view(), + hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -228,7 +51,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>(tupleMultiplicity_d, + kernelBLFit<4><<>>(tupleMultiplicity_d, bField_, outputSoa_d, hitsGPU_.get(), @@ -240,9 +63,9 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4><<>>(tuples_d, + kernelBLFastFit<4><<>>(tuples_d, tupleMultiplicity_d, - hh.view(), + hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -250,7 +73,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>(tupleMultiplicity_d, + kernelBLFit<4><<>>(tupleMultiplicity_d, bField_, outputSoa_d, hitsGPU_.get(), @@ -261,9 +84,9 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, cudaCheck(cudaGetLastError()); } else { // fit penta (all 5) - kernelBLFastFit<5><<>>(tuples_d, + kernelBLFastFit<5><<>>(tuples_d, tupleMultiplicity_d, - hh.view(), + hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -271,7 +94,7 @@ void 
HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<5><<>>(tupleMultiplicity_d, + kernelBLFit<5><<>>(tupleMultiplicity_d, bField_, outputSoa_d, hitsGPU_.get(), diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h new file mode 100644 index 0000000000000..e903700ebd91b --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -0,0 +1,185 @@ +// +// Author: Felice Pantaleo, CERN +// + +// #define BROKENLINE_DEBUG + +#include + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" + +#include "HelixFitOnGPU.h" + +using HitsOnGPU = TrackingRecHit2DSOAView; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; + +using namespace Eigen; + +// #define BL_DUMP_HITS + +template +__global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, + CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { + constexpr uint32_t hitsInFit = N; + + assert(hitsInFit <= nHits); + + assert(hhp); + assert(pfast_fit); + assert(foundNtuplets); + assert(tupleMultiplicity); + + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + +#ifdef BROKENLINE_DEBUG + if (0 == local_start) { + printf("%d total Ntuple\n",foundNtuplets->nbins()); + printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); + } +#endif + + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + assert(tkid < foundNtuplets->nbins()); + + assert(foundNtuplets->size(tkid) == nHits); + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + +#ifdef BL_DUMP_HITS + __shared__ int done; + done = 0; + __syncthreads(); + bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1)); +#endif + + // Prepare data structure + auto const *hitId = foundNtuplets->begin(tkid); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + float ge[6]; + hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); +#ifdef BL_DUMP_HITS + if (dump) { + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", + tkid, + hhp->detectorIndex(hit), + i, + hhp->xGlobal(hit), + hhp->yGlobal(hit), + hhp->zGlobal(hit)); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", + tkid, + hhp->detetectorIndex(hit), + i, + ge[0], + ge[1], + ge[2], + ge[3], + ge[4], + ge[5]); + } +#endif + hits.col(i) << hhp->xGlobal(hit), 
hhp->yGlobal(hit), hhp->zGlobal(hit); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + BrokenLine::BL_Fast_fit(hits, fast_fit); + + // no NaN here.... + assert(fast_fit(0) == fast_fit(0)); + assert(fast_fit(1) == fast_fit(1)); + assert(fast_fit(2) == fast_fit(2)); + assert(fast_fit(3) == fast_fit(3)); + } +} + +template +__global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + double B, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { + assert(N <= nHits); + + assert(results); + assert(pfast_fit); + + // same as above... + + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + + // get it for the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + + BrokenLine::PreparedBrokenLineData data; + Rfit::Matrix3d Jacob; + + BrokenLine::karimaki_circle_fit circle; + Rfit::line_fit line; + + BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); + BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + + results->stateAtBS.copyFromCircle(circle.par,circle.cov,line.par,line.cov,1.f/float(B),tkid); + results->pt(tkid) = float(B)/float(std::abs(circle.par(2))); + results->eta(tkid) = asinhf(line.par(0)); + results->chi2(tkid) = (circle.chi2+line.chi2)/(2*N-5); + +#ifdef BROKENLINE_DEBUG + if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) + printf("kernelBLFit failed! 
%f/%f\n", circle.chi2, line.chi2); + printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle.par(0), + circle.par(1), + circle.par(2)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); + printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle.chi2, + line.chi2, + circle.cov(0, 0), + circle.cov(1, 1), + circle.cov(2, 2), + line.cov(0, 0), + line.cov(1, 1)); +#endif + } +} + + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 6d15cc6883098..da08ce124941c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -13,7 +13,6 @@ - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index ba8a3e1052e7b..af3a86815b479 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -36,22 +36,35 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { private: void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; - edm::EDGetTokenT> tokenHit_; - edm::EDPutTokenT> tokenTrack_; + bool m_OnGPU; + + edm::EDGetTokenT> tokenHitGPU_; + edm::EDPutTokenT> tokenTrackGPU_; + edm::EDGetTokenT tokenHitCPU_; + edm::EDPutTokenT tokenTrackCPU_; CAHitNtupletGeneratorOnGPU gpuAlgo_; }; CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) : - tokenHit_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), - tokenTrack_(produces>()), - gpuAlgo_(iConfig, consumesCollector()) {} + m_OnGPU(iConfig.getParameter("onGPU")), + gpuAlgo_(iConfig, consumesCollector()) { + if (m_OnGPU) { + tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); + tokenTrackGPU_ = produces>(); + } else { + tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); + tokenTrackCPU_ = produces(); + } + +} void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; + desc.add("onGPU",true); desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); CAHitNtupletGeneratorOnGPU::fillDescriptions(desc); @@ -61,19 +74,27 @@ void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descript void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { - edm::Handle> hHits; - iEvent.getByToken(tokenHit_, hHits); + auto bf = 1./PixelRecoUtilities::fieldInInvGev(es); - CUDAScopedContextProduce ctx{*hHits}; - auto const& hits = ctx.get(*hHits); + if (m_OnGPU) { + edm::Handle> hHits; + iEvent.getByToken(tokenHitGPU_, hHits); - auto bf = 1./PixelRecoUtilities::fieldInInvGev(es); + CUDAScopedContextProduce ctx{*hHits}; + auto const& hits = ctx.get(*hHits); - ctx.emplace( + ctx.emplace( iEvent, - tokenTrack_, + tokenTrackGPU_, std::move(gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())) - ); + ); + } else { + + auto const& hits = iEvent.get(tokenHitCPU_); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + + } + } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc new file mode 100644 index 0000000000000..62d07ed38bc8b --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc 
@@ -0,0 +1,206 @@ +#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" + +template<> +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const * counters) { + kernel_printCounters(counters); +} + + +template<> +void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const * hv, TkSoA * tracks_d, cudaStream_t) { + kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); +} + + + +template<> +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + // overkill to use template here (std::make_unique would suffice) + //edm::Service cs; + // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); + device_isOuterHitOfCell_.reset((GPUCACell::OuterHitOfCell*)malloc(std::max(1U,nhits)*sizeof(GPUCACell::OuterHitOfCell))); + assert(device_isOuterHitOfCell_.get()); + gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_, + device_theCellNeighborsContainer_.get(), + device_theCellTracks_, + device_theCellTracksContainer_.get()); + + // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); + device_theCells_.reset((GPUCACell*)malloc(sizeof(GPUCACell)*m_params.maxNumberOfDoublets_)); + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs=gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) nActualPairs = 15; + if (m_params.minHitsPerNtuplet_>3) { + nActualPairs = 13; + } + + assert(nActualPairs<=gpuPixelDoublets::nPairs); + gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZCut_, + m_params.doPhiCut_, + m_params.maxNumberOfDoublets_); + + +} + + +template<> +void CAHitNtupletGeneratorKernelsCPU::launchKernels( + HitsOnCPU const &hh, + TkSoA * tracks_d, + cudaStream_t cudaStream) { + + auto * tuples_d = &tracks_d->hitIndices; + auto * quality_d = (Quality*)(&tracks_d->m_quality); + + assert(tuples_d && quality_d); + + // zero tuples + cudautils::launchZero(tuples_d, cudaStream); + + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + kernel_connect( + device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + + + if (nhits > 1 && m_params.earlyFishbone_) { + fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + } + + + kernel_find_ntuplets(hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_, + tuples_d, + 
device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); + if (m_params.doStats_) + kernel_mark_used(hh.view(), + device_theCells_.get(), + device_nCells_); + + + cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); + + // remove duplicates (tracks that share a doublet) + kernel_earlyDuplicateRemover( + device_theCells_.get(), device_nCells_, tuples_d, quality_d); + + kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + + if (nhits > 1 && m_params.lateFishbone_) { + fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + } + + + if (m_params.doStats_) { + kernel_checkOverflows(tuples_d, + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + device_isOuterHitOfCell_.get(), + nhits, + m_params.maxNumberOfDoublets_, + counters_); + } + +} + + + +template<> +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, + TkSoA * tracks_d, + cudaStream_t cudaStream) { + auto const * tuples_d = &tracks_d->hitIndices; + auto * quality_d = (Quality*)(&tracks_d->m_quality); + + // classify tracks based on kinematics + kernel_classifyTracks( + tuples_d, tracks_d, m_params.cuts_, quality_d); + + if (m_params.lateFishbone_) { + // apply fishbone cleaning to good tracks + kernel_fishboneCleaner( + device_theCells_.get(), device_nCells_, quality_d); + } + + // remove duplicates (tracks that share a doublet) + kernel_fastDuplicateRemover( + device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + + // fill hit->track "map" + kernel_countHitInTracks( + tuples_d, quality_d, device_hitToTuple_.get()); + cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + kernel_fillHitInTracks( + tuples_d, quality_d, device_hitToTuple_.get()); + + // remove duplicates (tracks that share a hit) + kernel_tripletCleaner( + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + + if (m_params.doStats_) { + // counters (add flag???) 
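The CPU specializations above can call the former __global__ kernels as plain host functions only because the kernel bodies (moved into CAHitNtupletGeneratorKernelsImpl.h later in this patch) are written as grid-stride loops: on the GPU the index range is shared by many threads, while a serial build with gridDim = blockDim = 1 walks the whole range in a single call. A minimal sketch of the idiom, with a made-up scaleAll kernel and a trivial stand-in for the cudaCompat layer:

    // Grid-stride loop that works both as a CUDA kernel and as a serial host function.
    #include <cstdio>

    #ifndef __CUDACC__
    #define __global__
    struct Dim1 { unsigned x; };
    static constexpr Dim1 blockDim{1}, gridDim{1}, blockIdx{0}, threadIdx{0};
    #endif

    __global__ void scaleAll(float* v, int n, float factor) {
      int first = blockIdx.x * blockDim.x + threadIdx.x;
      for (int i = first; i < n; i += gridDim.x * blockDim.x)  // grid-stride loop
        v[i] *= factor;
    }

    int main() {
      float v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    #ifdef __CUDACC__
      // a GPU build would launch scaleAll<<<blocks, threads>>>(...) on device memory
    #else
      scaleAll(v, 8, 2.f);  // serial build: one "thread" covers the whole range
    #endif
      std::printf("%f\n", v[0]);
    }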
+ kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); + kernel_doStatsForTracks(tuples_d, quality_d, counters_); + } + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100,iev); +#endif + +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index cedef59f78f91..fdba20cf86b9a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,507 +1,11 @@ -// -// Original Author: Felice Pantaleo, CERN -// - -// #define NTUPLE_DEBUG - -#include -#include - -#include - -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" - -#include "CAConstants.h" -#include "CAHitNtupletGeneratorKernels.h" -#include "GPUCACell.h" -#include "gpuFishbone.h" -#include "gpuPixelDoublets.h" - -using namespace gpuPixelDoublets; - - using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DCUDA; - - using HitToTuple = CAConstants::HitToTuple; - using TupleMultiplicity = CAConstants::TupleMultiplicity; - - using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; - using HitContainer = pixelTrack::HitContainer; - -__global__ void kernel_checkOverflows(HitContainer const * foundNtuplets, - CAConstants::TupleMultiplicity * tupleMultiplicity, - AtomicPairCounter *apc, - GPUCACell const *__restrict__ cells, - uint32_t const *__restrict__ nCells, - CellNeighborsVector const *cellNeighbors, - CellTracksVector const *cellTracks, - GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, - uint32_t nHits, - CAHitNtupletGeneratorKernels::Counters *counters) { - auto idx = threadIdx.x + blockIdx.x * blockDim.x; - - auto &c = *counters; - // counters once per event - if (0 == idx) { - atomicAdd(&c.nEvents, 1); - atomicAdd(&c.nHits, nHits); - atomicAdd(&c.nCells, *nCells); - atomicAdd(&c.nTuples, apc->get().m); - atomicAdd(&c.nFitTracks,tupleMultiplicity->size()); - } - -#ifdef NTUPLE_DEBUG - if (0 == idx) { - printf("number of found cells %d, found tuples %d with total hits %d out of %d\n", - *nCells, - apc->get().m, - apc->get().n, - nHits); - if (apc->get().m < CAConstants::maxNumberOfQuadruplets()) { - assert(foundNtuplets->size(apc->get().m) == 0); - assert(foundNtuplets->size() == apc->get().n); - } - } - - if (idx < foundNtuplets->nbins()) { - if (foundNtuplets->size(idx) > 5) - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) < 6); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) - assert(*ih < nHits); - } -#endif - - if (0 == idx) { - if (apc->get().m >= CAConstants::maxNumberOfQuadruplets()) - printf("Tuples overflow\n"); - if (*nCells >= CAConstants::maxNumberOfDoublets()) - printf("Cells overflow\n"); - } - - if (idx < (*nCells)) { - auto &thisCell = cells[idx]; - if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.tracks().full()) 
//++tooManyTracks[thisCell.theLayerPairId]; - printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theDoubletId < 0) - atomicAdd(&c.nKilledCells, 1); - if (0==thisCell.theUsed) - atomicAdd(&c.nEmptyCells, 1); - if (thisCell.tracks().empty()) - atomicAdd(&c.nZeroTrackCells, 1); - } - if (idx < nHits) { - if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; - printf("OuterHitOfCell overflow %d\n", idx); - } -} - - -__global__ void kernel_fishboneCleaner(GPUCACell const *cells, - uint32_t const *__restrict__ nCells, - Quality *quality) { - constexpr auto bad = trackQuality::bad; - - auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; - - if (cellIndex >= (*nCells)) - return; - auto const &thisCell = cells[cellIndex]; - if (thisCell.theDoubletId >= 0) - return; - - for (auto it : thisCell.tracks()) - quality[it] = bad; -} - -__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, - uint32_t const *__restrict__ nCells, - HitContainer *foundNtuplets, - Quality *quality) { - // constexpr auto bad = trackQuality::bad; - constexpr auto dup = trackQuality::dup; - // constexpr auto loose = trackQuality::loose; - - assert(nCells); - - auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; - - if (cellIndex >= (*nCells)) - return; - auto const &thisCell = cells[cellIndex]; - if (thisCell.theDoubletId < 0) - return; - - uint32_t maxNh = 0; - - // find maxNh - for (auto it : thisCell.tracks()) { - auto nh = foundNtuplets->size(it); - maxNh = std::max(nh, maxNh); - } - - for (auto it : thisCell.tracks()) { - if (foundNtuplets->size(it) != maxNh) - quality[it] = dup; //no race: simple assignment of the same constant - } - -} - - -__global__ void kernel_fastDuplicateRemover(GPUCACell const * __restrict__ cells, - uint32_t const *__restrict__ nCells, - HitContainer const * __restrict__ foundNtuplets, - TkSoA * __restrict__ tracks) { - constexpr auto bad = trackQuality::bad; - constexpr auto dup = trackQuality::dup; - constexpr auto loose = trackQuality::loose; - - assert(nCells); - - auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; - - if (cellIndex >= (*nCells)) - return; - auto const &thisCell = cells[cellIndex]; - if (thisCell.theDoubletId < 0) - return; - - float mc = 10000.f; - uint16_t im = 60000; - - auto score = [&](auto it) { - return std::abs(tracks->tip(it)); // tip - // return tracks->chi2(it); //chi2 - }; - - // find min socre - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == loose && score(it) < mc) { - mc = score(it); - im = it; - } - } - // mark all other duplicates - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) != bad && it != im) - tracks->quality(it) = dup; //no race: simple assignment of the same constant - } -} - - -__global__ void kernel_connect(AtomicPairCounter *apc1, - AtomicPairCounter *apc2, // just to zero them, - GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *cells, - uint32_t const *__restrict__ nCells, - CellNeighborsVector *cellNeighbors, - GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, - float hardCurvCut, - float ptmin, - float CAThetaCutBarrel, - float CAThetaCutForward, - float dcaCutInnerTriplet, - float dcaCutOuterTriplet) { - auto const &hh = *hhp; - - auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y; - auto first = threadIdx.x; - auto stride = blockDim.x; - - if (0 == (cellIndex + first)) { - (*apc1) = 0; - (*apc2) = 0; - } // ready for next kernel - - if (cellIndex >= (*nCells)) - return; - auto & thisCell = cells[cellIndex]; - //if 
(thisCell.theDoubletId < 0 || thisCell.theUsed>1) - // return; - auto innerHitId = thisCell.get_inner_hit_id(); - auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); - auto vi = isOuterHitOfCell[innerHitId].data(); - - constexpr uint32_t last_bpix1_detIndex = 96; - constexpr uint32_t last_barrel_detIndex = 1184; - auto ri = thisCell.get_inner_r(hh); - auto zi = thisCell.get_inner_z(hh); - - auto ro = thisCell.get_outer_r(hh); - auto zo = thisCell.get_outer_z(hh); - auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; - - for (auto j = first; j < numberOfPossibleNeighbors; j += stride) { - auto otherCell = __ldg(vi + j); - auto & oc = cells[otherCell]; - // if (cells[otherCell].theDoubletId < 0 || - // cells[otherCell].theUsed>1 ) - // continue; - auto r1 = oc.get_inner_r(hh); - auto z1 = oc.get_inner_z(hh); - // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; - bool aligned = GPUCACell::areAlignedRZ(r1, - z1, - ri, - zi, - ro, - zo, - ptmin, - isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - if(aligned && - thisCell.dcaCut(hh,oc, - oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, - hardCurvCut) - ) { // FIXME tune cuts - oc.addOuterNeighbor(cellIndex, *cellNeighbors); - thisCell.theUsed |= 1; - oc.theUsed |= 1; - } - } // loop on inner cells -} - -__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *__restrict__ cells, - uint32_t const *nCells, - CellTracksVector *cellTracks, - HitContainer *foundNtuplets, - AtomicPairCounter *apc, - Quality * __restrict__ quality, - unsigned int minHitsPerNtuplet) { - // recursive: not obvious to widen - auto const &hh = *hhp; - - auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; - if (cellIndex >= (*nCells)) - return; - auto &thisCell = cells[cellIndex]; - - if (thisCell.theDoubletId < 0) - return; - - auto pid = thisCell.theLayerPairId; - auto doit = minHitsPerNtuplet>3 ? 
pid<3 : pid<8 || pid >12; - if (doit) { - GPUCACell::TmpTuple stack; - stack.reset(); - thisCell.find_ntuplets(hh, - cells, - *cellTracks, - *foundNtuplets, - *apc, - quality, - stack, - minHitsPerNtuplet, - pid<3); - assert(stack.size() == 0); - // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); - } - -} - - -__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *__restrict__ cells, - uint32_t const *nCells) { - - // auto const &hh = *hhp; - - auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; - if (cellIndex >= (*nCells)) - return; - auto &thisCell = cells[cellIndex]; - if (!thisCell.tracks().empty()) - thisCell.theUsed |= 2; - -} - - -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const * __restrict__ quality, - CAConstants::TupleMultiplicity *tupleMultiplicity) { - auto it = blockIdx.x * blockDim.x + threadIdx.x; - - if (it >= foundNtuplets->nbins()) - return; - - auto nhits = foundNtuplets->size(it); - if (nhits < 3) - return; - if (quality[it] == trackQuality::dup) return; - assert(quality[it] == trackQuality::bad); - if (nhits>5) printf("wrong mult %d %d\n",it,nhits); - assert(nhits<8); - tupleMultiplicity->countDirect(nhits); -} - - +#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const * __restrict__ quality, - CAConstants::TupleMultiplicity *tupleMultiplicity) { - auto it = blockIdx.x * blockDim.x + threadIdx.x; - - if (it >= foundNtuplets->nbins()) - return; - - auto nhits = foundNtuplets->size(it); - if (nhits < 3) - return; - if (quality[it] == trackQuality::dup) return; - if (nhits>5) printf("wrong mult %d %d\n",it,nhits); - assert(nhits<8); - tupleMultiplicity->fillDirect(nhits, it); -} - - - -__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const * __restrict__ tracks, - CAHitNtupletGeneratorKernels::QualityCuts cuts, - Quality *__restrict__ quality) { - auto idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= tuples->nbins()) { - return; - } - if (tuples->size(idx) == 0) { - return; - } - - // if duplicate: not even fit - if (quality[idx] == trackQuality::dup) return; - - assert(quality[idx] == trackQuality::bad); - - // mark doublets as bad - if (tuples->size(idx) < 3) { - return; - } - - // if the fit has any invalid parameters, mark it as bad - bool isNaN = false; - for (int i = 0; i < 5; ++i) { - isNaN |= isnan(tracks->stateAtBS.state(idx)(i)); - } - if (isNaN) { -#ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", - idx, - tuples->size(idx), - tracks->chi2(idx) - ); -#endif - return; - } - - // compute a pT-dependent chi2 cut - // default parameters: - // - chi2MaxPt = 10 GeV - // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } - // - chi2Scale = 30 for broken line fit, 45 for Riemann fit - // (see CAHitNtupletGeneratorGPU.cc) - float pt = std::min(tracks->pt(idx), cuts.chi2MaxPt); - float chi2Cut = cuts.chi2Scale * - (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); - // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) - if (3.f*tracks->chi2(idx) >= chi2Cut) { -#ifdef NTUPLE_DEBUG - printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", - idx, - tuples->size(idx), - tracks->pt(idx), - tracks->eta(idx), - 3.f*tracks->chi2(idx) - ); 
-#endif - return; - } - - // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) - // default cuts: - // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm - // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm - // (see CAHitNtupletGeneratorGPU.cc) - auto const ®ion = (tuples->size(idx) > 3) ? cuts.quadruplet : cuts.triplet; - bool isOk = (std::abs(tracks->tip(idx)) < region.maxTip) and (tracks->pt(idx) > region.minPt) and - (std::abs(tracks->zip(idx)) < region.maxZip); - - if (isOk) { - quality[idx] = trackQuality::loose; - } -} - -__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernels::Counters *counters) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - continue; - if (quality[idx] != trackQuality::loose) - continue; - atomicAdd(&(counters->nGoodTracks), 1); - } -} - - -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - continue; - if (quality[idx] != trackQuality::loose) - continue; - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->countDirect(*h); - } -} - -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - continue; - if (quality[idx] != trackQuality::loose) - continue; - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->fillDirect(*h, idx); - } -} - -__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer *__restrict__ hitDetIndices) { - - int first = blockDim.x * blockIdx.x + threadIdx.x; - // copy offsets - for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - hitDetIndices->off[idx] = tuples->off[idx]; - } - // fill hit indices - auto const & hh = *hhp; - auto nhits = hh.nHits(); - for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->bins[idx]bins[idx] = hh.detectorIndex(tuples->bins[idx]); - } -} - -void CAHitNtupletGeneratorKernels::fillHitDetIndices(HitsOnCPU const &hh, TkSoA * tracks_d, cudaStream_t cudaStream) { +template<> +void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const * hv, TkSoA * tracks_d, cudaStream_t cudaStream) { auto blockSize=128; auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; - kernel_fillHitDetIndices<<>>(&tracks_d->hitIndices, hh.view(), &tracks_d->detIndices); + kernel_fillHitDetIndices<<>>(&tracks_d->hitIndices, hv, &tracks_d->detIndices); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -510,123 +14,19 @@ void CAHitNtupletGeneratorKernels::fillHitDetIndices(HitsOnCPU const &hh, TkSoA } -__global__ void 
kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ hitToTuple, - CAHitNtupletGeneratorKernels::Counters *counters) { - auto &c = *counters; - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple->size(idx) == 0) - continue; - atomicAdd(&c.nUsedHits, 1); - if (hitToTuple->size(idx) > 1) - atomicAdd(&c.nDupHits, 1); - } -} - - -__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const * __restrict__ ptracks, - Quality *__restrict__ quality, - CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ phitToTuple) { - constexpr auto bad = trackQuality::bad; - constexpr auto dup = trackQuality::dup; - // constexpr auto loose = trackQuality::loose; - - auto &hitToTuple = *phitToTuple; - auto const &foundNtuplets = *ptuples; - auto const & tracks = *ptracks; - - // auto const & hh = *hhp; - // auto l1end = hh.hitsLayerStart_d[1]; - - int first = blockDim.x * blockIdx.x + threadIdx.x; - - for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; - - float mc = 10000.f; - uint16_t im = 60000; - uint32_t maxNh = 0; - - // find maxNh - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); - maxNh = std::max(nh, maxNh); - } - // kill all tracks shorter than maxHn (only triplets???) - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); - if (maxNh != nh) - quality[*it] = dup; - } - - if (maxNh > 3) - continue; - // if (idx>=l1end) continue; // only for layer 1 - // for triplets choose best tip! - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); - im = it; - } - } - // mark duplicates - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] != bad && it != im) - quality[it] = dup; //no race: simple assignment of the same constant - } - } // loop over hits -} - -__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const * __restrict__ ptracks, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ phitToTuple, - uint32_t maxPrint, int iev) { - auto const & foundNtuplets = *ptuples; - auto const & tracks = *ptracks; - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first; i < std::min(maxPrint, foundNtuplets.nbins()); i+=blockDim.x*gridDim.x) { - auto nh = foundNtuplets.size(i); - if (nh<3) continue; - printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n", - 10000*iev+i, - int(quality[i]), - nh, - tracks.charge(i), - tracks.pt(i), - tracks.eta(i), - tracks.phi(i), - tracks.tip(i), - tracks.zip(i), -// asinhf(fit_results[i].par(3)), - tracks.chi2(i), - *foundNtuplets.begin(i), - *(foundNtuplets.begin(i) + 1), - *(foundNtuplets.begin(i) + 2), - nh>3 ? int(*(foundNtuplets.begin(i) + 3)):-1, - nh>4 ? 
int(*(foundNtuplets.begin(i) + 4)):-1 - ); - } -} - - -void CAHitNtupletGeneratorKernels::launchKernels( +template<> +void CAHitNtupletGeneratorKernelsGPU::launchKernels( HitsOnCPU const &hh, TkSoA * tracks_d, cudaStream_t cudaStream) { - auto maxNumberOfDoublets_ = CAConstants::maxNumberOfDoublets(); - // these are pointer on GPU! auto * tuples_d = &tracks_d->hitIndices; auto * quality_d = (Quality*)(&tracks_d->m_quality); + // zero tuples + cudautils::launchZero(tuples_d, cudaStream); + auto nhits = hh.nHits(); assert(nhits <= pixelGPUConstants::maxNumberOfHits); @@ -640,10 +40,10 @@ void CAHitNtupletGeneratorKernels::launchKernels( auto nthTot = 64; auto stride = 4; auto blockSize = nthTot / stride; - auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize; + auto numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; auto rescale = numberOfBlocks / 65536; blockSize *= (rescale + 1); - numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize; + numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; assert(numberOfBlocks < 65536); assert(blockSize > 0 && 0 == blockSize % 16); dim3 blks(1, numberOfBlocks, 1); @@ -680,7 +80,7 @@ void CAHitNtupletGeneratorKernels::launchKernels( blockSize = 64; - numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize; + numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; kernel_find_ntuplets<<>>(hh.view(), device_theCells_.get(), device_nCells_, @@ -696,7 +96,7 @@ void CAHitNtupletGeneratorKernels::launchKernels( device_theCells_.get(), device_nCells_); cudaCheck(cudaGetLastError()); - + #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -708,13 +108,13 @@ void CAHitNtupletGeneratorKernels::launchKernels( cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) - numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize; + numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; kernel_earlyDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, quality_d); cudaCheck(cudaGetLastError()); blockSize = 128; - numberOfBlocks = (CAConstants::maxTuples() + blockSize - 1) / blockSize; + numberOfBlocks = (3*CAConstants::maxTuples()/4 + blockSize - 1) / blockSize; kernel_countMultiplicity<<>>(tuples_d, quality_d, device_tupleMultiplicity_.get()); cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); kernel_fillMultiplicity<<>>(tuples_d, quality_d, device_tupleMultiplicity_.get()); @@ -733,7 +133,7 @@ void CAHitNtupletGeneratorKernels::launchKernels( } if (m_params.doStats_) { - numberOfBlocks = (std::max(nhits, maxNumberOfDoublets_) + blockSize - 1) / blockSize; + numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; kernel_checkOverflows<<>>(tuples_d, device_tupleMultiplicity_.get(), device_hitTuple_apc_, @@ -743,6 +143,7 @@ void CAHitNtupletGeneratorKernels::launchKernels( device_theCellTracks_, device_isOuterHitOfCell_.get(), nhits, + m_params.maxNumberOfDoublets_, counters_); cudaCheck(cudaGetLastError()); } @@ -753,7 +154,9 @@ void CAHitNtupletGeneratorKernels::launchKernels( } -void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) { + +template<> +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) { auto nhits = hh.nHits(); #ifdef 
NTUPLE_DEBUG @@ -782,7 +185,7 @@ void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cuda::stre cudaCheck(cudaGetLastError()); } - device_theCells_ = cs->make_device_unique(CAConstants::maxNumberOfDoublets(), stream); + device_theCells_ = cs->make_device_unique(m_params.maxNumberOfDoublets_, stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -815,7 +218,8 @@ void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cuda::stre m_params.idealConditions_, m_params.doClusterCut_, m_params.doZCut_, - m_params.doPhiCut_); + m_params.doPhiCut_, + m_params.maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -825,7 +229,9 @@ void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cuda::stre } -void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, + +template<> +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA * tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! @@ -835,30 +241,31 @@ void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, auto blockSize = 64; // classify tracks based on kinematics - auto numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize; + auto numberOfBlocks = (3*CAConstants::maxNumberOfQuadruplets()/4 + blockSize - 1) / blockSize; kernel_classifyTracks<<>>( tuples_d, tracks_d, m_params.cuts_, quality_d); cudaCheck(cudaGetLastError()); if (m_params.lateFishbone_) { // apply fishbone cleaning to good tracks - numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize; + numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; kernel_fishboneCleaner<<>>( device_theCells_.get(), device_nCells_, quality_d); cudaCheck(cudaGetLastError()); } // remove duplicates (tracks that share a doublet) - numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize; + numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; kernel_fastDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, tracks_d); cudaCheck(cudaGetLastError()); + if (m_params.minHitsPerNtuplet_<4 || m_params.doStats_) { // fill hit->track "map" - numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize; + numberOfBlocks = (3*CAConstants::maxNumberOfQuadruplets()/4 + blockSize - 1) / blockSize; kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); + tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); cudaCheck(cudaGetLastError()); @@ -873,12 +280,14 @@ void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); } + + if (m_params.doStats_) { // counters (add flag???) 
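In the launches above the grid size is now derived from the configurable m_params.maxNumberOfDoublets_ (and the corresponding quadruplet ceiling) instead of the compile-time CAConstants values, and only three quarters of the ceiling is covered by distinct threads, presumably because the grid-stride loops inside the kernels absorb any excess. A small stand-alone sketch of that launch arithmetic, with an assumed configuration value:

    // Illustration: block count for a capped workload, with grid-stride loops
    // picking up whatever lies beyond 3/4 of the configured maximum.
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t maxNumberOfDoublets = 512 * 1024;  // assumed configuration value
      const uint32_t blockSize = 64;
      const uint32_t numberOfBlocks =
          (3 * maxNumberOfDoublets / 4 + blockSize - 1) / blockSize;  // ceiling division
      std::printf("blocks=%u threads/block=%u covered=%u of %u\n",
                  numberOfBlocks, blockSize, numberOfBlocks * blockSize, maxNumberOfDoublets);
    }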
numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize; + numberOfBlocks = (3*CAConstants::maxNumberOfQuadruplets()/4 + blockSize - 1) / blockSize; kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); cudaCheck(cudaGetLastError()); } @@ -895,38 +304,8 @@ void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, } -__global__ void kernel_printCounters(CAHitNtupletGeneratorKernels::Counters const *counters) { - auto const &c = *counters; - printf( - "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nGoodTracks | nUsedHits | nDupHits | nKilledCells | " - "nEmptyCells | nZeroTrackCells ||\n"); - printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", - c.nEvents, - c.nHits, - c.nCells, - c.nTuples, - c.nGoodTracks, - c.nFitTracks, - c.nUsedHits, - c.nDupHits, - c.nKilledCells, - c.nEmptyCells, - c.nZeroTrackCells); - printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n", - c.nEvents, - c.nHits / double(c.nEvents), - c.nCells / double(c.nEvents), - c.nTuples / double(c.nEvents), - c.nFitTracks / double(c.nEvents), - c.nGoodTracks / double(c.nEvents), - c.nUsedHits / double(c.nEvents), - c.nDupHits / double(c.nEvents), - c.nKilledCells / double(c.nEvents), - c.nEmptyCells / double(c.nCells), - c.nZeroTrackCells / double(c.nCells)); -} - -void CAHitNtupletGeneratorKernels::printCounters(Counters const * counters) { +template<> +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const * counters) { kernel_printCounters<<<1, 1>>>(counters); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 147ba98310c14..a3a1f45213576 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -1,17 +1,14 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "GPUCACell.h" // #define DUMP_GPU_TK_TUPLES +namespace cAHitNtupletGenerator { -class CAHitNtupletGeneratorKernels { -public: // counters struct Counters { unsigned long long nEvents; @@ -27,8 +24,8 @@ class CAHitNtupletGeneratorKernels { unsigned long long nZeroTrackCells; }; + using HitsView = TrackingRecHit2DSOAView; using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DCUDA; using HitToTuple = CAConstants::HitToTuple; using TupleMultiplicity = CAConstants::TupleMultiplicity; @@ -57,7 +54,9 @@ class CAHitNtupletGeneratorKernels { // params struct Params { - Params(uint32_t minHitsPerNtuplet, + Params(bool onGPU, + uint32_t minHitsPerNtuplet, + uint32_t maxNumberOfDoublets, bool useRiemannFit, bool fit5as4, bool includeJumpingForwardDoublets, @@ -75,7 +74,9 @@ class CAHitNtupletGeneratorKernels { float dcaCutInnerTriplet, float dcaCutOuterTriplet, QualityCuts const& cuts) - : minHitsPerNtuplet_(minHitsPerNtuplet), + : onGPU_(onGPU), + minHitsPerNtuplet_(minHitsPerNtuplet), + 
maxNumberOfDoublets_(maxNumberOfDoublets), useRiemannFit_(useRiemannFit), fit5as4_(fit5as4), includeJumpingForwardDoublets_(includeJumpingForwardDoublets), @@ -94,7 +95,9 @@ class CAHitNtupletGeneratorKernels { dcaCutOuterTriplet_(dcaCutOuterTriplet), cuts_(cuts) { } + const bool onGPU_; const uint32_t minHitsPerNtuplet_; + const uint32_t maxNumberOfDoublets_; const bool useRiemannFit_; const bool fit5as4_; const bool includeJumpingForwardDoublets_; @@ -137,6 +140,34 @@ class CAHitNtupletGeneratorKernels { }; // Params +} + +template +class CAHitNtupletGeneratorKernels { +public: + + using Traits = TTraits; + + using QualityCuts = cAHitNtupletGenerator::QualityCuts; + using Params = cAHitNtupletGenerator::Params; + using Counters = cAHitNtupletGenerator::Counters; + + template + using unique_ptr = typename Traits:: template unique_ptr; + + + using HitsView = TrackingRecHit2DSOAView; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DHeterogeneous; + + using HitToTuple = CAConstants::HitToTuple; + using TupleMultiplicity = CAConstants::TupleMultiplicity; + + using Quality = pixelTrack::Quality; + using TkSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + + CAHitNtupletGeneratorKernels(Params const & params) : m_params(params){} ~CAHitNtupletGeneratorKernels() = default; @@ -147,7 +178,7 @@ class CAHitNtupletGeneratorKernels { void classifyTuples(HitsOnCPU const& hh, TkSoA * tuples_d, cudaStream_t cudaStream); - void fillHitDetIndices(HitsOnCPU const &hh, TkSoA * tuples_d, cudaStream_t cudaStream); + void fillHitDetIndices(HitsView const * hv, TkSoA * tuples_d, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream); void allocateOnGPU(cuda::stream_t<>& stream); @@ -161,26 +192,29 @@ class CAHitNtupletGeneratorKernels { // workspace CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr; - cudautils::device::unique_ptr device_theCellNeighborsContainer_; + unique_ptr device_theCellNeighborsContainer_; CAConstants::CellTracksVector* device_theCellTracks_ = nullptr; - cudautils::device::unique_ptr device_theCellTracksContainer_; + unique_ptr device_theCellTracksContainer_; - cudautils::device::unique_ptr device_theCells_; - cudautils::device::unique_ptr device_isOuterHitOfCell_; + unique_ptr device_theCells_; + unique_ptr device_isOuterHitOfCell_; uint32_t* device_nCells_ = nullptr; - cudautils::device::unique_ptr device_hitToTuple_; + unique_ptr device_hitToTuple_; AtomicPairCounter* device_hitToTuple_apc_ = nullptr; AtomicPairCounter* device_hitTuple_apc_ = nullptr; - cudautils::device::unique_ptr device_tupleMultiplicity_; + unique_ptr device_tupleMultiplicity_; uint8_t * device_tmws_; - cudautils::device::unique_ptr device_storage_; + unique_ptr device_storage_; // params Params const & m_params; }; +using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; + #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc new file mode 100644 index 0000000000000..96381673388ca --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -0,0 +1 @@ +#include "CAHitNtupletGeneratorKernelsAlloc.h" diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu index 126b6237bd0d7..96381673388ca 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu @@ -1,40 +1 @@ -#include "CAHitNtupletGeneratorKernels.h" - -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -void CAHitNtupletGeneratorKernels::allocateOnGPU(cuda::stream_t<>& stream) { - ////////////////////////////////////////////////////////// - // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) - ////////////////////////////////////////////////////////// - - edm::Service cs; - - /* not used at the moment - cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); - cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); - */ - - device_hitToTuple_ = cs->make_device_unique(stream); - - device_tupleMultiplicity_ = cs->make_device_unique(stream); - - auto storageSize = 3+(std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize())+sizeof(AtomicPairCounter::c_type))/sizeof(AtomicPairCounter::c_type); - - device_storage_ = cs->make_device_unique(storageSize,stream); - - device_hitTuple_apc_ = (AtomicPairCounter*)device_storage_.get(); - device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get()+1; - device_nCells_ = (uint32_t *)(device_storage_.get()+2); - device_tmws_ = (uint8_t*)(device_storage_.get()+3); - - assert(device_tmws_+std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= (uint8_t*)(device_storage_.get()+storageSize)); - - cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream.id())); - cudautils::launchZero(device_tupleMultiplicity_.get(), stream.id()); - cudautils::launchZero(device_hitToTuple_.get(), stream.id()); // we may wish to keep it in the edm... 
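Both the allocation code removed here and its templated replacement introduced just below pack the two atomic pair counters, the cell counter and the histogram workspace into a single buffer, carving the individual objects out by word offset instead of issuing separate allocations. A stand-alone sketch of that packing, with a plain uint64_t standing in for AtomicPairCounter::c_type and an assumed workspace size:

    // Illustration: carving several small objects out of one allocation.
    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <memory>

    int main() {
      using Word = uint64_t;                    // stand-in for AtomicPairCounter::c_type
      const std::size_t workspaceBytes = 1024;  // stand-in for max(TupleMultiplicity, HitToTuple) ws size
      const std::size_t words = 3 + (workspaceBytes + sizeof(Word)) / sizeof(Word);

      auto storage = std::make_unique<Word[]>(words);
      auto* apc1   = storage.get();                                   // first pair counter
      auto* apc2   = storage.get() + 1;                               // second pair counter
      auto* nCells = reinterpret_cast<uint32_t*>(storage.get() + 2);  // cell counter
      auto* ws     = reinterpret_cast<uint8_t*>(storage.get() + 3);   // histogram workspace

      assert(ws + workspaceBytes <= reinterpret_cast<uint8_t*>(storage.get() + words));
      *apc1 = 0;
      *apc2 = 0;
      *nCells = 0;
      std::fill(ws, ws + workspaceBytes, 0);
    }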
-} - +#include "CAHitNtupletGeneratorKernelsAlloc.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h new file mode 100644 index 0000000000000..b54a1e2e415f0 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -0,0 +1,54 @@ +#include "CAHitNtupletGeneratorKernels.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + + +template<> +#ifdef __CUDACC__ +void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cuda::stream_t<>& stream) { +#else +void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) { +#endif + ////////////////////////////////////////////////////////// + // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) + ////////////////////////////////////////////////////////// + + edm::Service cs; + + /* not used at the moment + cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); + cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); + */ + + device_hitToTuple_ = Traits:: template make_unique(cs,stream); + + device_tupleMultiplicity_ = Traits:: template make_unique(cs,stream); + + auto storageSize = 3+(std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize())+sizeof(AtomicPairCounter::c_type))/sizeof(AtomicPairCounter::c_type); + + device_storage_ = Traits:: template make_unique(cs, storageSize,stream); + + device_hitTuple_apc_ = (AtomicPairCounter*)device_storage_.get(); + device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get()+1; + device_nCells_ = (uint32_t *)(device_storage_.get()+2); + device_tmws_ = (uint8_t*)(device_storage_.get()+3); + + assert(device_tmws_+std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= (uint8_t*)(device_storage_.get()+storageSize)); + + if +#ifndef __CUDACC__ + constexpr +#endif + (std::is_same::value) { + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream.id())); + }else { + *device_nCells_ = 0; + } + cudautils::launchZero(device_tupleMultiplicity_.get(), stream.id()); + cudautils::launchZero(device_hitToTuple_.get(), stream.id()); // we may wish to keep it in the edm... 
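The shared allocateOnGPU above relies on two ingredients: a Traits type whose make_unique hides whether the buffer lives in device or host memory, and a compile-time branch (if constexpr outside nvcc, a plain if inside) so that only the GPU instantiation issues cudaMemsetAsync while the CPU instantiation writes the counter directly. A compilable sketch of the pattern with made-up CPUTraitsSketch/GPUTraitsSketch types rather than the real cudaCompat traits:

    // Backend selection through a Traits type plus if constexpr.
    #include <cstdio>
    #include <memory>
    #include <type_traits>

    struct CPUTraitsSketch {
      template <typename T>
      static std::unique_ptr<T[]> make_unique(std::size_t n) { return std::make_unique<T[]>(n); }
    };
    struct GPUTraitsSketch {
      template <typename T>
      static std::unique_ptr<T[]> make_unique(std::size_t n) {
        // a real GPU traits class would hand out device memory from a caching allocator
        return std::make_unique<T[]>(n);
      }
    };

    template <typename Traits>
    void allocate(std::size_t n) {
      auto buffer = Traits::template make_unique<unsigned>(n);
      if constexpr (std::is_same<Traits, GPUTraitsSketch>::value)
        std::puts("GPU instantiation: would zero the buffer with cudaMemsetAsync");
      else
        buffer[0] = 0;  // CPU instantiation: plain host write
    }

    int main() {
      allocate<CPUTraitsSketch>(16);
      allocate<GPUTraitsSketch>(16);
    }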
+} + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h new file mode 100644 index 0000000000000..1a03a54a98074 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -0,0 +1,614 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +// #define NTUPLE_DEBUG + +#include +#include + +#include + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" + +#include "CAConstants.h" +#include "CAHitNtupletGeneratorKernels.h" +#include "GPUCACell.h" +#include "gpuFishbone.h" +#include "gpuPixelDoublets.h" + +using namespace gpuPixelDoublets; + + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + + using HitToTuple = CAConstants::HitToTuple; + using TupleMultiplicity = CAConstants::TupleMultiplicity; + + using Quality = pixelTrack::Quality; + using TkSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + +__global__ void kernel_checkOverflows(HitContainer const * foundNtuplets, + CAConstants::TupleMultiplicity * tupleMultiplicity, + AtomicPairCounter *apc, + GPUCACell const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector const *cellNeighbors, + CellTracksVector const *cellTracks, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + uint32_t nHits, + uint32_t maxNumberOfDoublets, + CAHitNtupletGeneratorKernelsGPU::Counters *counters) { + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + + auto &c = *counters; + // counters once per event + if (0 == first) { + atomicAdd(&c.nEvents, 1); + atomicAdd(&c.nHits, nHits); + atomicAdd(&c.nCells, *nCells); + atomicAdd(&c.nTuples, apc->get().m); + atomicAdd(&c.nFitTracks,tupleMultiplicity->size()); + } + +#ifdef NTUPLE_DEBUG + if (0 == first) { + printf("number of found cells %d, found tuples %d with total hits %d out of %d\n", + *nCells, + apc->get().m, + apc->get().n, + nHits); + if (apc->get().m < CAConstants::maxNumberOfQuadruplets()) { + assert(foundNtuplets->size(apc->get().m) == 0); + assert(foundNtuplets->size() == apc->get().n); + } + } + + for (int idx = first, nt = foundNtuplets->nbins(); idxsize(idx) > 5) + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + assert(foundNtuplets->size(idx) < 6); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + assert(*ih < nHits); + } +#endif + + if (0 == first) { + if (apc->get().m >= CAConstants::maxNumberOfQuadruplets()) + printf("Tuples overflow\n"); + if (*nCells >= maxNumberOfDoublets) + printf("Cells overflow\n"); + } + + for (int idx = first, nt = (*nCells); idx= 0) continue; + + for (auto it : thisCell.tracks()) + quality[it] = bad; + } +} + +__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, + uint32_t const *__restrict__ nCells, + HitContainer *foundNtuplets, + Quality *quality) { + // constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + // constexpr auto loose = trackQuality::loose; + + assert(nCells); + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idxsize(it); + maxNh = std::max(nh, maxNh); + } + + for (auto it : thisCell.tracks()) 
{ + if (foundNtuplets->size(it) != maxNh) + quality[it] = dup; //no race: simple assignment of the same constant + } + + } +} + +__global__ void kernel_fastDuplicateRemover(GPUCACell const * __restrict__ cells, + uint32_t const *__restrict__ nCells, + HitContainer const * __restrict__ foundNtuplets, + TkSoA * __restrict__ tracks) { + constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + constexpr auto loose = trackQuality::loose; + + assert(nCells); + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idxtip(it)); // tip + // return tracks->chi2(it); //chi2 + }; + + // find min socre + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == loose && score(it) < mc) { + mc = score(it); + im = it; + } + } + // mark all other duplicates + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) != bad && it != im) + tracks->quality(it) = dup; //no race: simple assignment of the same constant + } + } +} + +__global__ void kernel_connect(AtomicPairCounter *apc1, + AtomicPairCounter *apc2, // just to zero them, + GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector *cellNeighbors, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + float hardCurvCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet) { + auto const &hh = *hhp; + + auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + + if (0 == (firstCellIndex + first)) { + (*apc1) = 0; + (*apc2) = 0; + } // ready for next kernel + + for (int idx = firstCellIndex, nt = (*nCells); idx1) + // continue; + auto innerHitId = thisCell.get_inner_hit_id(); + int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); + + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; + auto ri = thisCell.get_inner_r(hh); + auto zi = thisCell.get_inner_z(hh); + + auto ro = thisCell.get_outer_r(hh); + auto zo = thisCell.get_outer_z(hh); + auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; + + for (int j = first; j < numberOfPossibleNeighbors; j += stride) { + auto otherCell = __ldg(vi + j); + auto & oc = cells[otherCell]; + // if (cells[otherCell].theDoubletId < 0 || + // cells[otherCell].theUsed>1 ) + // continue; + auto r1 = oc.get_inner_r(hh); + auto z1 = oc.get_inner_z(hh); + // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; + bool aligned = GPUCACell::areAlignedRZ(r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + if(aligned && + thisCell.dcaCut(hh,oc, + oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? 
dcaCutInnerTriplet : dcaCutOuterTriplet, + hardCurvCut) + ) { // FIXME tune cuts + oc.addOuterNeighbor(cellIndex, *cellNeighbors); + thisCell.theUsed |= 1; + oc.theUsed |= 1; + } + } // loop on inner cells + } // loop on outer cells +} + +__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells, + CellTracksVector *cellTracks, + HitContainer *foundNtuplets, + AtomicPairCounter *apc, + Quality * __restrict__ quality, + unsigned int minHitsPerNtuplet) { + // recursive: not obvious to widen + auto const &hh = *hhp; + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx3 ? pid<3 : pid <8 || pid > 12; + if (doit) { + GPUCACell::TmpTuple stack; + stack.reset(); + thisCell.find_ntuplets(hh, + cells, + *cellTracks, + *foundNtuplets, + *apc, + quality, + stack, + minHitsPerNtuplet, + pid<3); + assert(stack.size() == 0); + // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); + } + } +} + + +__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells) { + + // auto const &hh = *hhp; + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idxnbins(); itsize(it); + if (nhits < 3) continue; + if (quality[it] == trackQuality::dup) continue; + assert(quality[it] == trackQuality::bad); + if (nhits>5) printf("wrong mult %d %d\n",it,nhits); + assert(nhits<8); + tupleMultiplicity->countDirect(nhits); + } +} + + +__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const * __restrict__ quality, + CAConstants::TupleMultiplicity *tupleMultiplicity) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int it = first, nt = foundNtuplets->nbins(); itsize(it); + if (nhits < 3) continue; + if (quality[it] == trackQuality::dup) continue; + assert(quality[it] == trackQuality::bad); + if (nhits>5) printf("wrong mult %d %d\n",it,nhits); + assert(nhits<8); + tupleMultiplicity->fillDirect(nhits, it); + } +} + + +__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, + TkSoA const * __restrict__ tracks, + CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, + Quality *__restrict__ quality) { + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int it = first, nt = tuples->nbins(); itsize(it); + if (nhits == 0) break; // guard + + // if duplicate: not even fit + if (quality[it] == trackQuality::dup) continue; + + assert(quality[it] == trackQuality::bad); + + // mark doublets as bad + if (nhits < 3) continue; + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= isnan(tracks->stateAtBS.state(it)(i)); + } + if (isNaN) { +#ifdef NTUPLE_DEBUG + printf("NaN in fit %d size %d chi2 %f\n", + it, + tuples->size(it), + tracks->chi2(it) + ); +#endif + continue; + } + + // compute a pT-dependent chi2 cut + // default parameters: + // - chi2MaxPt = 10 GeV + // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } + // - chi2Scale = 30 for broken line fit, 45 for Riemann fit + // (see CAHitNtupletGeneratorGPU.cc) + float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); + float chi2Cut = cuts.chi2Scale * + (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); + // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) 
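The classification step above applies a pT-dependent chi2 bound, chi2Cut = chi2Scale * (c0 + pt*(c1 + pt*(c2 + pt*c3))) with pt clamped at chi2MaxPt, and compares it against 3 * chi2 to account for the quadruplet normalisation noted in the comment. A stand-alone sketch evaluating that bound with the default broken-line parameters quoted in the comments (the pT values are illustrative only):

    // Evaluating the pT-dependent chi2 cut (Horner scheme) with the default
    // broken-line-fit parameters quoted in the comments above.
    #include <algorithm>
    #include <cstdio>
    #include <initializer_list>

    int main() {
      const float chi2MaxPt = 10.f;  // GeV
      const float chi2Scale = 30.f;  // broken line fit (45 for Riemann fit)
      const float c[4] = {0.68177776f, 0.74609577f, -0.08035491f, 0.00315399f};

      for (float trackPt : {0.7f, 2.f, 25.f}) {
        float pt = std::min(trackPt, chi2MaxPt);
        float chi2Cut = chi2Scale * (c[0] + pt * (c[1] + pt * (c[2] + pt * c[3])));
        std::printf("pt %.1f GeV -> chi2 cut %.2f\n", trackPt, chi2Cut);
      }
    }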
+ if (3.f*tracks->chi2(it) >= chi2Cut) { +#ifdef NTUPLE_DEBUG + printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", + it, + tuples->size(it), + tracks->pt(it), + tracks->eta(it), + 3.f*tracks->chi2(it) + ); +#endif + continue; + } + + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nhits > 3) ? cuts.quadruplet : cuts.triplet; + bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and + (std::abs(tracks->zip(it)) < region.maxZip); + + if (isOk) quality[it] = trackQuality::loose; + } +} + +__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::Counters *counters) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) break; //guard + if (quality[idx] != trackQuality::loose) + continue; + atomicAdd(&(counters->nGoodTracks), 1); + } +} + + +__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) break; // guard + if (quality[idx] != trackQuality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->countDirect(*h); + } +} + +__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) break; // guard + if (quality[idx] != trackQuality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fillDirect(*h, idx); + } +} + +__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, + TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer *__restrict__ hitDetIndices) { + + int first = blockDim.x * blockIdx.x + threadIdx.x; + // copy offsets + for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + hitDetIndices->off[idx] = tuples->off[idx]; + } + // fill hit indices + auto const & hh = *hhp; + auto nhits = hh.nHits(); + for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tuples->bins[idx]bins[idx] = hh.detectorIndex(tuples->bins[idx]); + } +} + + +__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple, + CAHitNtupletGeneratorKernelsGPU::Counters *counters) { + auto &c = *counters; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple->size(idx) == 0) continue; // SHALL NOT BE break + atomicAdd(&c.nUsedHits, 1); + if (hitToTuple->size(idx) > 1) + atomicAdd(&c.nDupHits, 1); + } +} + + +__global__ void 
kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const * __restrict__ ptracks, + Quality *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { + constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + // constexpr auto loose = trackQuality::loose; + + auto &hitToTuple = *phitToTuple; + auto const &foundNtuplets = *ptuples; + auto const & tracks = *ptracks; + + // auto const & hh = *hhp; + // auto l1end = hh.hitsLayerStart_d[1]; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; + + float mc = 10000.f; + uint16_t im = 60000; + uint32_t maxNh = 0; + + // find maxNh + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + maxNh = std::max(nh, maxNh); + } + // kill all tracks shorter than maxHn (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + if (maxNh != nh) + quality[*it] = dup; + } + + if (maxNh > 3) + continue; + // if (idx>=l1end) continue; // only for layer 1 + // for triplets choose best tip! + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } + } + // mark duplicates + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && it != im) + quality[it] = dup; //no race: simple assignment of the same constant + } + } // loop over hits +} + +__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const * __restrict__ ptracks, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, + uint32_t maxPrint, int iev) { + auto const & foundNtuplets = *ptuples; + auto const & tracks = *ptracks; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, np = std::min(maxPrint, foundNtuplets.nbins()); i3 ? int(*(foundNtuplets.begin(i) + 3)):-1, + nh>4 ? 
int(*(foundNtuplets.begin(i) + 4)):-1 + ); + } +} + +__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) { + auto const &c = *counters; + printf( + "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nGoodTracks | nUsedHits | nDupHits | nKilledCells | " + "nEmptyCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", + c.nEvents, + c.nHits, + c.nCells, + c.nTuples, + c.nGoodTracks, + c.nFitTracks, + c.nUsedHits, + c.nDupHits, + c.nKilledCells, + c.nEmptyCells, + c.nZeroTrackCells); + printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n", + c.nEvents, + c.nHits / double(c.nEvents), + c.nCells / double(c.nEvents), + c.nTuples / double(c.nEvents), + c.nFitTracks / double(c.nEvents), + c.nGoodTracks / double(c.nEvents), + c.nUsedHits / double(c.nEvents), + c.nDupHits / double(c.nEvents), + c.nKilledCells / double(c.nEvents), + c.nEmptyCells / double(c.nCells), + c.nZeroTrackCells / double(c.nCells)); +} + + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 08cafc7e8fc09..bbfb96ce1fc2d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -25,12 +25,12 @@ namespace { return x * x; } - CAHitNtupletGeneratorKernels::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) { + cAHitNtupletGenerator::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) { auto coeff = pset.getParameter>("chi2Coeff"); if (coeff.size() != 4) { throw edm::Exception(edm::errors::Configuration, "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 4 elements"); } - return CAHitNtupletGeneratorKernels::QualityCuts { + return cAHitNtupletGenerator::QualityCuts { // polynomial coefficients for the pT-dependent chi2 cut { (float) coeff[0], (float) coeff[1], (float) coeff[2], (float) coeff[3] }, // max pT used to determine the chi2 cut @@ -57,7 +57,9 @@ namespace { using namespace std; CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet &cfg, edm::ConsumesCollector &iC) - : m_params(cfg.getParameter("minHitsPerNtuplet"), + : m_params(cfg.getParameter("onGPU"), + cfg.getParameter("minHitsPerNtuplet"), + cfg.getParameter("maxNumberOfDoublets"), cfg.getParameter("useRiemannFit"), cfg.getParameter("fit5as4"), cfg.getParameter("includeJumpingForwardDoublets"), @@ -82,17 +84,30 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet & "h1","h2","h3","h4","h5"); #endif - cudaCheck(cudaMalloc(&m_counters, sizeof(Counters))); - cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters))); + if (m_params.onGPU_) { + cudaCheck(cudaMalloc(&m_counters, sizeof(Counters))); + cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters))); + } else { + m_counters = new Counters(); + memset(m_counters, 0, sizeof(Counters)); + } } CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU(){ if (m_params.doStats_) { // crash on multi-gpu processes - CAHitNtupletGeneratorKernels::printCounters(m_counters); + if (m_params.onGPU_) { + CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); + } else { + CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); + } + } + if (m_params.onGPU_) { + cudaFree(m_counters); + }else { + delete m_counters; } - cudaFree(m_counters); } @@ -111,6 +126,7 @@ void 
CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription & desc.add("idealConditions", true); desc.add("fillStatistics", false); desc.add("minHitsPerNtuplet", 4); + desc.add("maxNumberOfDoublets", CAConstants::maxNumberOfDoublets()); desc.add("includeJumpingForwardDoublets", false); desc.add("fit5as4", true); desc.add("doClusterCut", true); @@ -142,7 +158,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH auto * soa = tracks.get(); - CAHitNtupletGeneratorKernels kernels(m_params); + CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.counters_ = m_counters; HelixFitOnGPU fitter(bfield,m_params.fit5as4_); @@ -151,14 +167,50 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream.id()); - kernels.fillHitDetIndices(hits_d, soa, stream.id()); // in principle needed only if Hits not "available" + kernels.fillHitDetIndices(hits_d.view(), soa, stream.id()); // in principle needed only if Hits not "available" if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d, hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } else { - fitter.launchBrokenLineKernels(hits_d, hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } kernels.classifyTuples(hits_d, soa, stream.id()); return tracks; } +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, + float bfield) const { + + + PixelTrackHeterogeneous tracks(std::make_unique()); + auto dummyStream = cuda::stream::wrap(0,0,false); + + auto * soa = tracks.get(); + assert(soa); + + CAHitNtupletGeneratorKernelsCPU kernels(m_params); + kernels.counters_ = m_counters; + kernels.allocateOnGPU(dummyStream); + + kernels.buildDoublets(hits_d, dummyStream); + kernels.launchKernels(hits_d, soa, dummyStream.id()); + kernels.fillHitDetIndices(hits_d.view(), soa, dummyStream.id()); // in principle needed only if Hits not "available" + + if (0==hits_d.nHits()) return tracks; + + // now fit + HelixFitOnGPU fitter(bfield,m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + } else { + fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + } + + + kernels.classifyTuples(hits_d, soa, dummyStream.id()); + + return tracks; + +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 169f591c48e45..e2f70fe8ed86b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -35,6 +35,11 @@ class CAHitNtupletGeneratorOnGPU { using HitContainer = pixelTrack::HitContainer; using Tuple = HitContainer; + + using QualityCuts = cAHitNtupletGenerator::QualityCuts; + using Params = cAHitNtupletGenerator::Params; + using Counters = cAHitNtupletGenerator::Counters; + public: CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC) : CAHitNtupletGeneratorOnGPU(cfg, 
iC) {} @@ -45,9 +50,14 @@ class CAHitNtupletGeneratorOnGPU { static void fillDescriptions(edm::ParameterSetDescription& desc); static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, + PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cuda::stream_t<>& stream) const; + + PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, + float bfield) const; + + private: @@ -58,15 +68,12 @@ class CAHitNtupletGeneratorOnGPU { bool useRiemannFit, cuda::stream_t<>& cudaStream); - void cleanup(cudaStream_t stream); - void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cuda::stream_t<>& cudaStream) const; - CAHitNtupletGeneratorKernels::Params m_params; + Params m_params; - using Counters = CAHitNtupletGeneratorKernels::Counters; - CAHitNtupletGeneratorKernels::Counters * m_counters = nullptr; + Counters * m_counters = nullptr; }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index f1709f7ae7063..ed4404345f777 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -39,7 +39,6 @@ class GPUCACell { static constexpr auto bad = trackQuality::bad; GPUCACell() = default; -#ifdef __CUDACC__ __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, @@ -98,8 +97,7 @@ class GPUCACell { __device__ void print_cell() const { printf( - "printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " - "%d, innerradius %f, outerRadius %f \n", + "printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d \n", theDoubletId, theLayerPairId, theInnerHitId, @@ -301,8 +299,6 @@ class GPUCACell { assert(tmpNtuplet.size() < 4); } -#endif // __CUDACC__ - private: CellNeighbors theOuterNeighbors; CellTracks theTracks; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index c9688fc43418c..4f4fdbf7d8299 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -34,8 +34,7 @@ namespace Rfit { class HelixFitOnGPU { public: - using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DCUDA; + using HitsView = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; @@ -46,15 +45,24 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } - void launchRiemannKernels(HitsOnCPU const &hh, + void launchRiemannKernels(HitsView const * hv, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); - void launchBrokenLineKernels(HitsOnCPU const &hh, + void launchBrokenLineKernels(HitsView const * hv, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); + void launchRiemannKernelsOnCPU(HitsView const * hv, + uint32_t nhits, + uint32_t maxNumberOfTuples); + void launchBrokenLineKernelsOnCPU(HitsView const * hv, + uint32_t nhits, + uint32_t maxNumberOfTuples); + + + void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA * outputSoA); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc new file mode 100644 index 0000000000000..825df0cc182c7 --- 
/dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -0,0 +1,151 @@ +#include "RiemannFitOnGPU.h" + +void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const * hv, + uint32_t nhits, + uint32_t maxNumberOfTuples) { + assert(tuples_d); + + + // Fit internals + auto hitsGPU_ = std::make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto fast_fit_resultsGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + auto circle_fit_resultsGPU_holder = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); + Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets + kernelFastFit<3>(tuples_d, + tupleMultiplicity_d, + 3, + hv, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + offset); + + + kernelCircleFit<3>(tupleMultiplicity_d, + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + + kernelLineFit<3>(tupleMultiplicity_d, + 3, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + + // quads + kernelFastFit<4>(tuples_d, + tupleMultiplicity_d, + 4, + hv, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + offset); + + + kernelCircleFit<4>(tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + + kernelLineFit<4>(tupleMultiplicity_d, + 4, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + + if (fit5as4_) { + // penta + kernelFastFit<4>(tuples_d, + tupleMultiplicity_d, + 5, + hv, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + offset); + + + kernelCircleFit<4>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + + kernelLineFit<4>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + } else { + // penta all 5 + kernelFastFit<5>(tuples_d, + tupleMultiplicity_d, + 5, + hv, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + offset); + + + kernelCircleFit<5>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + + kernelLineFit<5>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + + } + } +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 38f51e676a9ca..8a371b7806907 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -1,187 +1,6 @@ -// -// Author: Felice Pantaleo, CERN -// +#include "RiemannFitOnGPU.h" -#include - -#include - -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" -#include 
"FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" - -#include "HelixFitOnGPU.h" - -using HitsOnGPU = TrackingRecHit2DSOAView; -using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; - -using namespace Eigen; - -template -__global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, - CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - uint32_t nHits, - HitsOnGPU const *__restrict__ hhp, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t offset) { - constexpr uint32_t hitsInFit = N; - - assert(hitsInFit <= nHits); - - assert(pfast_fit); - assert(foundNtuplets); - assert(tupleMultiplicity); - - // look in bin for this hit multiplicity - auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - -#ifdef RIEMANN_DEBUG - if (0 == local_start) - printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); -#endif - - auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) - return; - - // get it from the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); - assert(tkid < foundNtuplets->nbins()); - - assert(foundNtuplets->size(tkid) == nHits); - - Rfit::Map3xNd hits(phits + local_start); - Rfit::Map4d fast_fit(pfast_fit + local_start); - Rfit::Map6xNf hits_ge(phits_ge + local_start); - - // Prepare data structure - auto const *hitId = foundNtuplets->begin(tkid); - for (unsigned int i = 0; i < hitsInFit; ++i) { - auto hit = hitId[i]; - // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); - float ge[6]; - hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); - // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); - - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); - hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; - } - Rfit::Fast_fit(hits, fast_fit); - - // no NaN here.... - assert(fast_fit(0) == fast_fit(0)); - assert(fast_fit(1) == fast_fit(1)); - assert(fast_fit(2) == fast_fit(2)); - assert(fast_fit(3) == fast_fit(3)); -} - -template -__global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - uint32_t nHits, - double B, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit_input, - Rfit::circle_fit *circle_fit, - uint32_t offset) { - assert(circle_fit); - assert(N <= nHits); - - // same as above... 
- - // look in bin for this hit multiplicity - auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) - return; - - // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); - - Rfit::Map3xNd hits(phits + local_start); - Rfit::Map4d fast_fit(pfast_fit_input + local_start); - Rfit::Map6xNf hits_ge(phits_ge + local_start); - - Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); - Rfit::loadCovariance2D(hits_ge, hits_cov); - - circle_fit[local_start] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); - -#ifdef RIEMANN_DEBUG -// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid, -// circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); -#endif -} - -template -__global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - uint32_t nHits, - double B, - OutputSoA *results, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit_input, - Rfit::circle_fit *__restrict__ circle_fit, - uint32_t offset) { - assert(results); - assert(circle_fit); - assert(N <= nHits); - - // same as above... - - // look in bin for this hit multiplicity - auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - auto tuple_start = local_start + offset; - if (tuple_start >= tupleMultiplicity->size(nHits)) - return; - - // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); - - Rfit::Map3xNd hits(phits + local_start); - Rfit::Map4d fast_fit(pfast_fit_input + local_start); - Rfit::Map6xNf hits_ge(phits_ge + local_start); - - auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_start], fast_fit, B, true); - - Rfit::fromCircleToPerigee(circle_fit[local_start]); - - results->stateAtBS.copyFromCircle(circle_fit[local_start].par,circle_fit[local_start].cov, - line_fit.par,line_fit.cov,1.f/float(B),tkid); - results->pt(tkid) = B/std::abs(circle_fit[local_start].par(2)); - results->eta(tkid) = asinhf(line_fit.par(0)); - results->chi2(tkid) = (circle_fit[local_start].chi2+line_fit.chi2)/(2*N-5); - -#ifdef RIEMANN_DEBUG - printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", - N, - nHits, - tkid, - circle_fit[local_start].par(0), - circle_fit[local_start].par(1), - circle_fit[local_start].par(2)); - printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); - printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", - circle_fit[local_start].chi2, - line_fit.chi2, - circle_fit[local_start].cov(0, 0), - circle_fit[local_start].cov(1, 1), - circle_fit[local_start].cov(2, 2), - line_fit.cov(0, 0), - line_fit.cov(1, 1)); -#endif -} - -void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, +void HelixFitOnGPU::launchRiemannKernels(HitsView const * hv, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &stream) { @@ -207,7 +26,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, kernelFastFit<3><<>>(tuples_d, tupleMultiplicity_d, 3, - hh.view(), + hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -236,17 +55,17 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, cudaCheck(cudaGetLastError()); // quads - kernelFastFit<4><<>>(tuples_d, + 
kernelFastFit<4><<>>(tuples_d, tupleMultiplicity_d, 4, - hh.view(), + hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>(tupleMultiplicity_d, + kernelCircleFit<4><<>>(tupleMultiplicity_d, 4, bField_, hitsGPU_.get(), @@ -256,7 +75,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, + kernelLineFit<4><<>>(tupleMultiplicity_d, 4, bField_, outputSoa_d, @@ -269,17 +88,17 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, if (fit5as4_) { // penta - kernelFastFit<4><<>>(tuples_d, + kernelFastFit<4><<>>(tuples_d, tupleMultiplicity_d, 5, - hh.view(), + hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>(tupleMultiplicity_d, + kernelCircleFit<4><<>>(tupleMultiplicity_d, 5, bField_, hitsGPU_.get(), @@ -289,7 +108,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, + kernelLineFit<4><<>>(tupleMultiplicity_d, 5, bField_, outputSoa_d, @@ -301,17 +120,17 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernelFastFit<5><<>>(tuples_d, + kernelFastFit<5><<>>(tuples_d, tupleMultiplicity_d, 5, - hh.view(), + hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<5><<>>(tupleMultiplicity_d, + kernelCircleFit<5><<>>(tupleMultiplicity_d, 5, bField_, hitsGPU_.get(), @@ -321,7 +140,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, offset); cudaCheck(cudaGetLastError()); - kernelLineFit<5><<>>(tupleMultiplicity_d, + kernelLineFit<5><<>>(tupleMultiplicity_d, 5, bField_, outputSoa_d, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h new file mode 100644 index 0000000000000..fbc82f888e722 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -0,0 +1,184 @@ +// +// Author: Felice Pantaleo, CERN +// + +#include + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" + +#include "HelixFitOnGPU.h" + +using HitsOnGPU = TrackingRecHit2DSOAView; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; + +using namespace Eigen; + +template +__global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, + CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t offset) { + constexpr uint32_t hitsInFit = N; + + assert(hitsInFit <= nHits); + + assert(pfast_fit); + assert(foundNtuplets); + assert(tupleMultiplicity); + + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + +#ifdef RIEMANN_DEBUG + if (0 == local_start) + 
printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); +#endif + + + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + assert(tkid < foundNtuplets->nbins()); + + assert(foundNtuplets->size(tkid) == nHits); + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + + // Prepare data structure + auto const *hitId = foundNtuplets->begin(tkid); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); + float ge[6]; + hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + Rfit::Fast_fit(hits, fast_fit); + + // no NaN here.... + assert(fast_fit(0) == fast_fit(0)); + assert(fast_fit(1) == fast_fit(1)); + assert(fast_fit(2) == fast_fit(2)); + assert(fast_fit(3) == fast_fit(3)); + } +} + +template +__global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double B, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + Rfit::circle_fit *circle_fit, + uint32_t offset) { + assert(circle_fit); + assert(N <= nHits); + + // same as above... + + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit_input + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + + Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); + Rfit::loadCovariance2D(hits_ge, hits_cov); + + circle_fit[local_idx] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); + +#ifdef RIEMANN_DEBUG +// auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); +// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid, +// circle_fit[local_idx].par(0), circle_fit[local_idx].par(1), circle_fit[local_idx].par(2)); +#endif + } +} + +template +__global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double B, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + Rfit::circle_fit *__restrict__ circle_fit, + uint32_t offset) { + assert(results); + assert(circle_fit); + assert(N <= nHits); + + // same as above... 
+ + // look in bin for this hit multiplicity + auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + + // get it for the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit_input + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + + auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_idx], fast_fit, B, true); + + Rfit::fromCircleToPerigee(circle_fit[local_idx]); + + results->stateAtBS.copyFromCircle(circle_fit[local_idx].par,circle_fit[local_idx].cov, + line_fit.par,line_fit.cov,1.f/float(B),tkid); + results->pt(tkid) = B/std::abs(circle_fit[local_idx].par(2)); + results->eta(tkid) = asinhf(line_fit.par(0)); + results->chi2(tkid) = (circle_fit[local_idx].chi2+line_fit.chi2)/(2*N-5); + +#ifdef RIEMANN_DEBUG + printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle_fit[local_idx].par(0), + circle_fit[local_idx].par(1), + circle_fit[local_idx].par(2)); + printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); + printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle_fit[local_idx].chi2, + line_fit.chi2, + circle_fit[local_idx].cov(0, 0), + circle_fit[local_idx].cov(1, 1), + circle_fit[local_idx].cov(2, 2), + line_fit.cov(0, 0), + line_fit.cov(1, 1)); +#endif + } +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index ba2c3d9b146a8..b4dff8c103d2d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -24,32 +24,32 @@ namespace gpuPixelDoublets { GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell, uint32_t nHits, bool checkTrack) { - constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; + constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; - auto const& hh = *hhp; - auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id); }; + auto const& hh = *hhp; + // auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id); }; - // x run faster... - auto idy = threadIdx.y + blockIdx.y * blockDim.y; - auto first = threadIdx.x; + // x run faster... + auto firstY = threadIdx.y + blockIdx.y * blockDim.y; + auto firstX = threadIdx.x; - if (idy >= nHits) - return; + float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit]; + uint16_t d[maxCellsPerHit]; // uint8_t l[maxCellsPerHit]; + uint32_t cc[maxCellsPerHit]; + + for (int idy = firstY, nt = nHits; idy= innerLayerCumulativeSize[pairLayerId++]) - ; + while (j >= innerLayerCumulativeSize[pairLayerId++]); --pairLayerId; // move to lower_bound ?? 
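The linear scan just above (flagged with the "move to lower_bound ??" comment) looks for the first cumulative size strictly greater than the doublet index j, which on the host is exactly std::upper_bound. A small self-contained sketch with invented layer-pair sizes:

// Host-side equivalent of the cumulative-size scan above; the sizes are
// invented for the example.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // innerLayerCumulativeSize[k] = total number of doublet slots of pairs 0..k
  const uint32_t cum[] = {40, 70, 100, 160};
  const uint32_t nPairs = sizeof(cum) / sizeof(cum[0]);

  for (uint32_t j : {0u, 39u, 40u, 159u}) {
    // first entry strictly greater than j, i.e. the pair that owns index j
    uint32_t pairLayerId = std::upper_bound(cum, cum + nPairs, j) - cum;
    std::printf("j = %3u -> pairLayerId = %u\n", j, pairLayerId);
  }
  return 0;
}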
assert(pairLayerId < nPairs); @@ -107,14 +103,17 @@ namespace gpuPixelDoubletsAlgos { assert(i < offsets[inner + 1]); // found hit corresponding to our cuda thread, now do the job + auto mi = hh.detectorIndex(i); + if (mi>2000) continue; // invalid + auto mez = hh.zGlobal(i); if (doZCut && (mez < minz[pairLayerId] || mez > maxz[pairLayerId])) continue; + int16_t mes=-1; // make compiler happy if (doClusterCut) { // if ideal treat inner ladder as outer - auto mi = hh.detectorIndex(i); if (inner == 0) assert(mi < 96); isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... @@ -194,17 +193,19 @@ namespace gpuPixelDoubletsAlgos { auto oi = __ldg(p); assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); + auto mo = hh.detectorIndex(oi); + if (mo>2000) continue; // invalid auto mop = hh.iphi(oi); if (std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))) > iphicut) continue; if (doPhiCut) { if (doClusterCut && zsizeCut(oi)) - continue; + continue; if (z0cutoff(oi) || ptcut(oi,mop)) continue; } auto ind = atomicAdd(nCells, 1); - if (ind >= MaxNumOfDoublets) { + if (ind >= maxNumOfDoublets) { atomicSub(nCells, 1); break; } // move to SimpleVector?? From a68be03fbdd4ad1f712569ecebc7739df30b46be Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 10 Sep 2019 16:03:58 -0500 Subject: [PATCH 056/102] Move event and stream caches, and caching allocators out from CUDAService (cms-patatrack#364) To reduce dependencies on edm::Service, and to make CUDAService less of a collection of everything, split off from it: - the CUDAEventCache - the CUDAStreamCache - the caching allocators Other changes: - clean up unnecessary use of CUDAService - fix maxCachedFraction, add debug printouts - add make_*_unique_uninitialized that avoid the static_assert --- CUDADataFormats/Track/BuildFile.xml | 4 +--- .../plugins/PixelTrackProducerFromSoA.cc | 3 --- .../PixelTriplets/plugins/BrokenLineFitOnGPU.cu | 8 ++++---- .../PixelTriplets/plugins/BrokenLineFitOnGPU.h | 2 -- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 3 --- .../plugins/CAHitNtupletGeneratorKernels.cc | 1 - .../plugins/CAHitNtupletGeneratorKernels.cu | 5 ++--- .../plugins/CAHitNtupletGeneratorKernelsAlloc.h | 10 +++------- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 2 -- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 3 +-- .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h | 1 - .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 10 +++++----- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 2 -- .../TkSeedGenerator/plugins/SeedProducerFromSoA.cc | 2 -- 14 files changed, 16 insertions(+), 40 deletions(-) diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml index 521ea8fe29753..2aa4baedb0bb2 100644 --- a/CUDADataFormats/Track/BuildFile.xml +++ b/CUDADataFormats/Track/BuildFile.xml @@ -1,8 +1,6 @@ - - - + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index b0987b4c4cc2f..7041a08413f81 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -18,12 +18,9 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/Utilities/interface/InputTag.h" #include "FWCore/PluginManager/interface/ModuleDef.h" -#include "FWCore/ServiceRegistry/interface/Service.h" #include 
"FWCore/Utilities/interface/EDGetToken.h" #include "Geometry/Records/interface/TrackerTopologyRcd.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" #include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 27d8b2022211f..4dc93f961b4a2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,4 +1,5 @@ #include "BrokenLineFitOnGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" void HelixFitOnGPU::launchBrokenLineKernels(HitsView const * hv, uint32_t hitsInFit, @@ -10,13 +11,12 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const * hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - edm::Service cs; - auto hitsGPU_ = cs->make_device_unique( + auto hitsGPU_ = cudautils::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); auto hits_geGPU_ = - cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); auto fast_fit_resultsGPU_ = - cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index e903700ebd91b..e79b55a69227d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -9,8 +9,6 @@ #include #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index af3a86815b479..88d9f7934275d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -13,12 +13,9 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/Utilities/interface/InputTag.h" #include "FWCore/PluginManager/interface/ModuleDef.h" -#include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include 
"HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 62d07ed38bc8b..d18ef856d2782 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -23,7 +23,6 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cuda::s // in principle we can use "nhits" to heuristically dimension the workspace... // overkill to use template here (std::make_unique would suffice) - //edm::Service cs; // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); device_isOuterHitOfCell_.reset((GPUCACell::OuterHitOfCell*)malloc(std::max(1U,nhits)*sizeof(GPUCACell::OuterHitOfCell))); assert(device_isOuterHitOfCell_.get()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index fdba20cf86b9a..4ca6d9988dcd6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -169,8 +169,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s #endif // in principle we can use "nhits" to heuristically dimension the workspace... - edm::Service cs; - device_isOuterHitOfCell_ = cs->make_device_unique(std::max(1U,nhits), stream); + device_isOuterHitOfCell_ = cudautils::make_device_unique(std::max(1U,nhits), stream); assert(device_isOuterHitOfCell_.get()); { int threadsPerBlock = 128; @@ -185,7 +184,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s cudaCheck(cudaGetLastError()); } - device_theCells_ = cs->make_device_unique(m_params.maxNumberOfDoublets_, stream); + device_theCells_ = cudautils::make_device_unique(m_params.maxNumberOfDoublets_, stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index b54a1e2e415f0..f16a850b2e70a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -1,7 +1,5 @@ #include "CAHitNtupletGeneratorKernels.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -15,8 +13,6 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) { // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - edm::Service cs; - /* not used at the moment cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); @@ -24,13 +20,13 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) { cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); */ - device_hitToTuple_ = Traits:: template make_unique(cs,stream); + 
device_hitToTuple_ = Traits:: template make_unique(stream); - device_tupleMultiplicity_ = Traits:: template make_unique(cs,stream); + device_tupleMultiplicity_ = Traits:: template make_unique(stream); auto storageSize = 3+(std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize())+sizeof(AtomicPairCounter::c_type))/sizeof(AtomicPairCounter::c_type); - device_storage_ = Traits:: template make_unique(cs, storageSize,stream); + device_storage_ = Traits:: template make_unique(storageSize,stream); device_hitTuple_apc_ = (AtomicPairCounter*)device_storage_.get(); device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get()+1; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 1a03a54a98074..1939eae6b86d9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -9,8 +9,6 @@ #include -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index bbfb96ce1fc2d..c0bea61537670 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -153,8 +153,7 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription & PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, float bfield, cuda::stream_t<>& stream) const { - edm::Service cs; - PixelTrackHeterogeneous tracks(cs->make_device_unique(stream)); + PixelTrackHeterogeneous tracks(cudautils::make_device_unique(stream)); auto * soa = tracks.get(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index e2f70fe8ed86b..67f75e21e2ef9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -9,7 +9,6 @@ #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Utilities/interface/EDGetToken.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "CAHitNtupletGeneratorKernels.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 8a371b7806907..042a9b4e6982d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -1,4 +1,5 @@ #include "RiemannFitOnGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" void HelixFitOnGPU::launchRiemannKernels(HitsView const * hv, uint32_t nhits, @@ -10,15 +11,14 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const * hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - 
edm::Service cs; - auto hitsGPU_ = cs->make_device_unique( + auto hitsGPU_ = cudautils::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); auto hits_geGPU_ = - cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); auto fast_fit_resultsGPU_ = - cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); auto circle_fit_resultsGPU_holder = - cs->make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); + cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index fbc82f888e722..3f4230085efe3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -7,8 +7,6 @@ #include #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index b10de55871185..fe78853d568ee 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -15,12 +15,10 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/Utilities/interface/InputTag.h" #include "FWCore/PluginManager/interface/ModuleDef.h" -#include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "Geometry/Records/interface/TrackerTopologyRcd.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "Geometry/CommonDetUnit/interface/GeomDet.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" From 2d6d811564820456d35c6808a89ef5b8f7312ebf Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 12 Sep 2019 00:22:05 +0200 Subject: [PATCH 057/102] Apply clang-format style formatting --- .../Track/interface/PixelTrackHeterogeneous.h | 33 +-- .../Track/interface/TrajectoryStateSoA.h | 78 +++-- CUDADataFormats/Track/src/classes.h | 2 +- .../Track/test/TrajectoryStateSOA_t.h | 60 ++-- .../PixelTrackFitting/interface/FitUtils.h | 46 ++- .../plugins/PixelTrackProducerFromSoA.cc | 120 ++++---- .../plugins/PixelTrackSoAFromCUDA.cc | 27 +- .../PixelTrackFitting/plugins/storeTracks.h | 45 ++- .../test/testEigenJacobian.cpp | 133 ++++----- .../PixelTrackFitting/test/test_common.h | 22 +- 
.../interface/CAHitQuadrupletGenerator.h | 193 ++++++------- .../PixelTriplets/interface/CircleEq.h | 103 +++---- .../plugins/BrokenLineFitOnGPU.cc | 106 +++---- .../plugins/BrokenLineFitOnGPU.cu | 80 ++---- .../plugins/BrokenLineFitOnGPU.h | 114 ++++---- .../PixelTriplets/plugins/CAConstants.h | 6 +- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 36 +-- .../plugins/CAHitNtupletGeneratorKernels.cc | 197 ++++++------- .../plugins/CAHitNtupletGeneratorKernels.cu | 122 ++++---- .../plugins/CAHitNtupletGeneratorKernels.h | 208 +++++++------- .../CAHitNtupletGeneratorKernelsAlloc.h | 35 +-- .../CAHitNtupletGeneratorKernelsImpl.h | 267 +++++++++--------- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 151 +++++----- .../plugins/CAHitNtupletGeneratorOnGPU.h | 21 +- .../PixelTriplets/plugins/GPUCACell.h | 67 +++-- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 4 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 20 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cc | 189 +++++-------- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 152 +++++----- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 73 ++--- .../PixelTriplets/plugins/gpuFishbone.h | 115 ++++---- .../PixelTriplets/plugins/gpuPixelDoublets.h | 96 ++++--- .../plugins/gpuPixelDoubletsAlgos.h | 32 ++- .../PixelTriplets/test/CircleEq_t.cpp | 104 +++---- .../PixelTriplets/test/fastDPHI_t.cpp | 148 ++++------ .../plugins/SeedProducerFromSoA.cc | 89 +++--- 36 files changed, 1487 insertions(+), 1807 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h index a576604b6e935..bd4ec059f6e9c 100644 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -7,13 +7,12 @@ #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" namespace trackQuality { - enum Quality : uint8_t { bad=0, dup, loose, strict, tight, highPurity }; + enum Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; } template class TrackSoAT { public: - static constexpr int32_t stride() { return S; } using Quality = trackQuality::Quality; @@ -23,23 +22,22 @@ class TrackSoAT { // Always check quality is at least loose! // CUDA does not support enums in __lgc ... 
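As the comment above notes, the quality flags live in the SoA as plain uint8_t and are reinterpreted as the trackQuality::Quality enum on access; since the enumerators are ordered bad < dup < loose < strict < tight < highPurity, a consumer only needs to test quality >= loose. A minimal usage sketch follows; the small array stands in for the SoA quality column and is not part of the patch.

// Illustration of the ordered quality enum and the "at least loose" check;
// the array below stands in for the SoA quality column.
#include <cstdint>
#include <cstdio>

namespace trackQuality {
  enum Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity };
}

int main() {
  uint8_t storage[4] = {trackQuality::bad, trackQuality::dup,
                        trackQuality::loose, trackQuality::highPurity};
  for (int i = 0; i < 4; ++i) {
    auto q = static_cast<trackQuality::Quality>(storage[i]);
    bool keep = q >= trackQuality::loose;  // "always check quality is at least loose"
    std::printf("track %d: quality %d -> %s\n", i, int(q), keep ? "keep" : "drop");
  }
  return 0;
}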
eigenSoA::ScalarSoA m_quality; - constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i));} - constexpr Quality & quality(int32_t i) { return (Quality&)(m_quality(i));} - constexpr Quality const * qualityData() const { return (Quality const *)(m_quality.data());} - constexpr Quality * qualityData() { return (Quality*)(m_quality.data());} - + constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i)); } + constexpr Quality &quality(int32_t i) { return (Quality &)(m_quality(i)); } + constexpr Quality const *qualityData() const { return (Quality const *)(m_quality.data()); } + constexpr Quality *qualityData() { return (Quality *)(m_quality.data()); } - // this is chi2/ndof as not necessarely all hits are used in the fit + // this is chi2/ndof as not necessarely all hits are used in the fit eigenSoA::ScalarSoA chi2; - constexpr int nHits(int i) const { return detIndices.size(i);} + constexpr int nHits(int i) const { return detIndices.size(i); } // State at the Beam spot // phi,tip,1/pt,cotan(theta),zip TrajectoryStateSoA stateAtBS; eigenSoA::ScalarSoA eta; eigenSoA::ScalarSoA pt; - constexpr float charge(int32_t i) const { return std::copysign(1.f,stateAtBS.state(i)(2)); } + constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } @@ -51,18 +49,17 @@ class TrackSoAT { HitContainer hitIndices; HitContainer detIndices; - + // total number of tracks (including those not fitted) uint32_t m_nTracks; - }; -namespace pixelTrack{ +namespace pixelTrack { #ifdef GPU_SMALL_EVENTS - constexpr uint32_t maxNumber() { return 2 * 1024;} + constexpr uint32_t maxNumber() { return 2 * 1024; } #else - constexpr uint32_t maxNumber() { return 32 * 1024;} + constexpr uint32_t maxNumber() { return 32 * 1024; } #endif using TrackSoA = TrackSoAT; @@ -70,10 +67,8 @@ namespace pixelTrack{ using HitContainer = TrackSoA::HitContainer; using Quality = trackQuality::Quality; -} +} // namespace pixelTrack using PixelTrackHeterogeneous = HeterogeneousSoA; - -#endif // CUDADataFormatsTrackTrackSoA_H - +#endif // CUDADataFormatsTrackTrackSoA_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h index 3aca896475ac0..7cd2e93fb914e 100644 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h @@ -6,60 +6,54 @@ template struct TrajectoryStateSoA { - using Vector5f = Eigen::Matrix; using Vector15f = Eigen::Matrix; using Vector5d = Eigen::Matrix; using Matrix5d = Eigen::Matrix; - static constexpr int32_t stride() { return S; } - eigenSoA::MatrixSoA state; - eigenSoA::MatrixSoA covariance; - - - template - __host__ __device__ inline - void copyFromCircle(V3 const & cp, M3 const & ccov, V2 const & lp, M2 const & lcov, float b, int32_t i) { - state(i) << cp.template cast(), lp.template cast(); - state(i)(2) *=b; - auto cov = covariance(i); - cov(0) = ccov(0,0); - cov(1) = ccov(0,1); - cov(2) = b*float(ccov(0,2)); - cov(4) = cov(3) = 0; - cov(5) = ccov(1,1); - cov(6) = b*float(ccov(1,2)); - cov(8) = cov(7) = 0; - cov(9) = b*b*float(ccov(2,2)); - cov(11) = cov(10) = 0; - cov(12) = lcov(0,0); - cov(13) = lcov(0,1); - cov(14) = lcov(1,1); + eigenSoA::MatrixSoA state; + eigenSoA::MatrixSoA covariance; + + template + __host__ 
__device__ inline void copyFromCircle( + V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { + state(i) << cp.template cast(), lp.template cast(); + state(i)(2) *= b; + auto cov = covariance(i); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); } - - template - __host__ __device__ inline - void copyFromDense(V5 const & v, M5 const & cov, int32_t i) { - state(i) = v.template cast(); - for(int j=0, ind=0; j<5; ++j) for (auto k=j;k<5;++k) covariance(i)(ind++) = cov(j,k); + template + __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { + state(i) = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + covariance(i)(ind++) = cov(j, k); } - template - __host__ __device__ inline - void copyToDense(V5 & v, M5 & cov, int32_t i) const { - v = state(i).template cast(); - for(int j=0, ind=0; j<5; ++j) { - cov(j,j) = covariance(i)(ind++); - for (auto k=j+1;k<5;++k) cov(k,j)=cov(j,k) = covariance(i)(ind++); - } + template + __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { + v = state(i).template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = covariance(i)(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = covariance(i)(ind++); + } } - }; -#endif // CUDADataFormatsTrackTrajectoryStateSOA_H - - +#endif // CUDADataFormatsTrackTrajectoryStateSOA_H diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 7f89096977e64..699e45ede05d4 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -7,4 +7,4 @@ #include "CUDADataFormats/Common/interface/ArrayShadow.h" #include "DataFormats/Common/interface/Wrapper.h" -#endif +#endif diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h index adefb57d7bbe5..03c51c39acdfb 100644 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -3,50 +3,50 @@ using Vector5d = Eigen::Matrix; using Matrix5d = Eigen::Matrix; -__host__ __device__ -Matrix5d loadCov(Vector5d const & e) { +__host__ __device__ Matrix5d loadCov(Vector5d const& e) { Matrix5d cov; - for (int i=0; i<5; ++i) cov(i,i) = e(i)*e(i); + for (int i = 0; i < 5; ++i) + cov(i, i) = e(i) * e(i); for (int i = 0; i < 5; ++i) { for (int j = 0; j < i; ++j) { - double v = 0.3*std::sqrt( cov(i,i) * cov(j,j) ); // this makes the matrix pos defined - cov(i,j) = (i+j)%2 ? -0.4*v : 0.1*v; - cov(j,i) = cov(i,j); + double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j)); // this makes the matrix pos defined + cov(i, j) = (i + j) % 2 ? 
-0.4 * v : 0.1 * v; + cov(j, i) = cov(i, j); } - } + } return cov; } - using TS = TrajectoryStateSoA<128>; -__global__ void testTSSoA(TS * pts, int n) { - - assert(n<=128); +__global__ void testTSSoA(TS* pts, int n) { + assert(n <= 128); - Vector5d par0; par0 << 0.2,0.1,3.5,0.8,0.1; - Vector5d e0; e0 << 0.01,0.01,0.035,-0.03,-0.01; + Vector5d par0; + par0 << 0.2, 0.1, 3.5, 0.8, 0.1; + Vector5d e0; + e0 << 0.01, 0.01, 0.035, -0.03, -0.01; auto cov0 = loadCov(e0); - TS & ts = *pts; + TS& ts = *pts; int first = threadIdx.x + blockIdx.x * blockDim.x; for (int i = first; i < n; i += blockDim.x * gridDim.x) { - ts.copyFromDense(par0,cov0,i); - Vector5d par1; Matrix5d cov1; - ts.copyToDense(par1,cov1,i); - Vector5d delV = par1-par0; - Matrix5d delM = cov1-cov0; - for(int j=0; j<5; ++j) { - assert(std::abs(delV(j))<1.e-5); - for (auto k=j;k<5;++k) { - assert(cov0(k,j)==cov0(j,k)); - assert(cov1(k,j)==cov1(j,k)); - assert(std::abs(delM(k,j))<1.e-5); + ts.copyFromDense(par0, cov0, i); + Vector5d par1; + Matrix5d cov1; + ts.copyToDense(par1, cov1, i); + Vector5d delV = par1 - par0; + Matrix5d delM = cov1 - cov0; + for (int j = 0; j < 5; ++j) { + assert(std::abs(delV(j)) < 1.e-5); + for (auto k = j; k < 5; ++k) { + assert(cov0(k, j) == cov0(j, k)); + assert(cov1(k, j) == cov1(j, k)); + assert(std::abs(delM(k, j)) < 1.e-5); } } - } } @@ -60,18 +60,16 @@ int main() { exitSansCUDADevices(); #endif - TS ts; #ifdef __CUDACC__ - TS * ts_d; + TS* ts_d; cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); - testTSSoA<<<1, 64>>>(ts_d,128); + testTSSoA<<<1, 64>>>(ts_d, 128); cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); cudaCheck(cudaDeviceSynchronize()); #else - testTSSoA(&ts,128); + testTSSoA(&ts, 128); #endif - } diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h index e44a58f676106..8710bdcf6c444 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h @@ -200,32 +200,29 @@ namespace Rfit { const double temp0 = circle.par.head(2).squaredNorm(); const double temp1 = sqrt(temp0); par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)), - circle.q/circle.par(2); - - const double temp2 = sqr(circle.par(0)) * 1. / temp0; - const double temp3 = 1. / temp1 * circle.q; - Matrix3d J4; - J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, - circle.par(1) * temp3, -circle.q, 0., 0., -circle.q/(circle.par(2)*circle.par(2)); - circle.cov = J4 * circle.cov * J4.transpose(); - - circle.par = par_pak; - } + circle.q / circle.par(2); + const double temp2 = sqr(circle.par(0)) * 1. / temp0; + const double temp3 = 1. / temp1 * circle.q; + Matrix3d J4; + J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.q, 0., 0., -circle.q / (circle.par(2) * circle.par(2)); + circle.cov = J4 * circle.cov * J4.transpose(); + circle.par = par_pak; + } // transformation between the "perigee" to cmssw localcoord frame // the plane of the latter is the perigee plane... 
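The 5x5 symmetric covariance in TrajectoryStateSoA above is stored as 15 floats: copyFromDense() packs the upper triangle row by row, and copyToDense() unpacks it again (the round trip is what the test above checks). A small sketch of that index mapping; the helper and its checks are illustrative only, not part of this patch:

#include <cassert>

// packed index of element (j, k), k >= j, of a 5x5 symmetric matrix stored as
// its 15 upper-triangle entries in row-major order, matching the loops in
// copyFromDense()/copyToDense() above
constexpr int packedIndex(int j, int k) {
  // rows 0..j-1 contribute 5 + 4 + ... + (5 - j + 1) = j*5 - j*(j-1)/2 entries
  return j * 5 - j * (j - 1) / 2 + (k - j);
}

int main() {
  assert(packedIndex(0, 0) == 0);   // first diagonal element
  assert(packedIndex(0, 4) == 4);   // end of the first row
  assert(packedIndex(1, 1) == 5);   // second row starts right after
  assert(packedIndex(2, 3) == 10);
  assert(packedIndex(4, 4) == 14);  // last of the 15 packed entries
  return 0;
}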
// from //!<(phi,Tip,q/pt,cotan(theta)),Zip) // to q/p,dx/dz,dy/dz,x,z - template - __host__ __device__ inline void transformToPerigeePlane(VI5 const & ip, MI5 const & icov, VO5 & op, MO5 & ocov) { - - auto sinTheta2 = 1./(1.+ip(3)*ip(3)); + template + __host__ __device__ inline void transformToPerigeePlane(VI5 const& ip, MI5 const& icov, VO5& op, MO5& ocov) { + auto sinTheta2 = 1. / (1. + ip(3) * ip(3)); auto sinTheta = std::sqrt(sinTheta2); - auto cosTheta = ip(3)*sinTheta; + auto cosTheta = ip(3) * sinTheta; - op(0) = sinTheta*ip(2); + op(0) = sinTheta * ip(2); op(1) = 0.; op(2) = -ip(3); op(3) = ip(1); @@ -233,15 +230,14 @@ namespace Rfit { Matrix5d J = Matrix5d::Zero(); - J(0,2) = sinTheta; - J(0,3) = -sinTheta2*cosTheta*ip(2); - J(1,0) = 1.; - J(2,3) = -1.; - J(3,1) = 1.; - J(4,4) = -1; - - ocov= J*icov*J.transpose(); + J(0, 2) = sinTheta; + J(0, 3) = -sinTheta2 * cosTheta * ip(2); + J(1, 0) = 1.; + J(2, 3) = -1.; + J(3, 1) = 1.; + J(4, 4) = -1; + ocov = J * icov * J.transpose(); } } // namespace Rfit diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 7041a08413f81..522678ce352f5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -34,14 +34,12 @@ #include "storeTracks.h" #include "CUDADataFormats/Common/interface/HostProduct.h" - /** * This class creates "leagcy" reco::Track * objects from the output of SoA CA. */ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { public: - using IndToEdm = std::vector; explicit PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig); @@ -49,12 +47,11 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); -// using HitModuleStart = std::array; + // using HitModuleStart = std::array; using HMSstorage = HostProduct; - private: - void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; edm::EDGetTokenT tBeamSpot_; edm::EDGetTokenT tokenTrack_; @@ -64,17 +61,16 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { int32_t const minNumberOfHits_; }; -PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) : - tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), +PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) + : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), - minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) -{ - produces(); - produces(); - produces(); - produces(); + minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) { + produces(); + produces(); + produces(); + produces(); } void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { @@ -87,12 +83,13 @@ void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions descriptions.addWithDefaultLabel(desc); } - -void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +void 
PixelTrackProducerFromSoA::produce(edm::StreamID streamID, + edm::Event &iEvent, + const edm::EventSetup &iSetup) const { // std::cout << "Converting gpu helix in reco tracks" << std::endl; auto indToEdmP = std::make_unique(); - auto & indToEdm = *indToEdmP; + auto &indToEdm = *indToEdmP; edm::ESHandle fieldESH; iSetup.get().get(fieldESH); @@ -101,7 +98,6 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEve edm::ESHandle httopo; iSetup.get().get(httopo); - edm::Handle bsHandle; iEvent.getByToken(tBeamSpot_, bsHandle); const auto &bsh = *bsHandle; @@ -111,92 +107,95 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEve edm::Handle gh; iEvent.getByToken(cpuHits_, gh); auto const &rechits = *gh; - std::vector hitmap; + std::vector hitmap; auto const &rcs = rechits.data(); auto nhits = rcs.size(); - hitmap.resize(nhits,nullptr); + hitmap.resize(nhits, nullptr); edm::Handle hhms; - iEvent.getByToken(hmsToken_,hhms); - auto const * hitsModuleStart = (*hhms).get(); + iEvent.getByToken(hmsToken_, hhms); + auto const *hitsModuleStart = (*hhms).get(); auto fc = hitsModuleStart; for (auto const &h : rcs) { - auto const &thit = static_cast(h); - auto detI = thit.det()->index(); - auto const &clus = thit.firstClusterRef(); - assert(clus.isPixel()); - auto i = fc[detI] + clus.pixelCluster().originalId(); - if(i >= hitmap.size()) hitmap.resize(i+256,nullptr); // only in case of hit overflow in one module - assert(nullptr==hitmap[i]); - hitmap[i] = &h; + auto const &thit = static_cast(h); + auto detI = thit.det()->index(); + auto const &clus = thit.firstClusterRef(); + assert(clus.isPixel()); + auto i = fc[detI] + clus.pixelCluster().originalId(); + if (i >= hitmap.size()) + hitmap.resize(i + 256, nullptr); // only in case of hit overflow in one module + assert(nullptr == hitmap[i]); + hitmap[i] = &h; } std::vector hits; hits.reserve(5); - const auto & tsoa = *iEvent.get(tokenTrack_); + const auto &tsoa = *iEvent.get(tokenTrack_); - auto const * quality = tsoa.qualityData(); - auto const & fit = tsoa.stateAtBS; - auto const & hitIndices = tsoa.hitIndices; - auto maxTracks =tsoa.stride(); + auto const *quality = tsoa.qualityData(); + auto const &fit = tsoa.stateAtBS; + auto const &hitIndices = tsoa.hitIndices; + auto maxTracks = tsoa.stride(); int32_t nt = 0; - + for (int32_t it = 0; it < maxTracks; ++it) { auto nHits = tsoa.nHits(it); - if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... indToEdm.push_back(-1); auto q = quality[it]; if (q != trackQuality::loose) - continue; // FIXME - if (nHits< minNumberOfHits_) continue; + continue; // FIXME + if (nHits < minNumberOfHits_) + continue; indToEdm.back() = nt; ++nt; hits.resize(nHits); auto b = hitIndices.begin(it); for (int iHit = 0; iHit < nHits; ++iHit) - hits[iHit] = hitmap[*(b+iHit)]; + hits[iHit] = hitmap[*(b + iHit)]; // mind: this values are respect the beamspot! 
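The hitmap filled above aligns the SoA hit numbering with the legacy rechits: each legacy hit's global index is its module's offset in hitsModuleStart plus the cluster's original index inside that module. A minimal sketch of that mapping, with names that are illustrative rather than taken from any CMSSW interface:

#include <cstddef>
#include <cstdint>

// hypothetical standalone version of the index computation used to fill hitmap above
inline std::size_t globalHitIndex(uint32_t const* hitsModuleStart,  // per-module offsets
                                  int detIndex,                     // module index of the hit
                                  uint32_t originalClusterId) {     // cluster index within the module
  return hitsModuleStart[detIndex] + originalClusterId;
}

// usage mirrors the loop above:
//   hitmap[globalHitIndex(fc, thit.det()->index(), clus.pixelCluster().originalId())] = &h;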
float chi2 = tsoa.chi2(it); float phi = tsoa.phi(it); - Rfit::Vector5d ipar,opar; - Rfit::Matrix5d icov,ocov; - fit.copyToDense(ipar,icov,it); - Rfit::transformToPerigeePlane(ipar,icov,opar,ocov); + Rfit::Vector5d ipar, opar; + Rfit::Matrix5d icov, ocov; + fit.copyToDense(ipar, icov, it); + Rfit::transformToPerigeePlane(ipar, icov, opar, ocov); - LocalTrajectoryParameters lpar(opar(0),opar(1),opar(2),opar(3),opar(4),1.); + LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); AlgebraicSymMatrix55 m; - for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = ocov(i,j); + for (int i = 0; i < 5; ++i) + for (int j = i; j < 5; ++j) + m(i, j) = ocov(i, j); float sp = std::sin(phi); float cp = std::cos(phi); - Surface::RotationType rot( - sp, -cp, 0, - 0, 0, -1.f, - cp, sp, 0); + Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0); - Plane impPointPlane(bs,rot); - GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), - impPointPlane.toGlobal(lpar.momentum()),lpar.charge(),fieldESH.product()); - JacobianLocalToCurvilinear jl2c(impPointPlane,lpar,*fieldESH.product()); + Plane impPointPlane(bs, rot); + GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), + impPointPlane.toGlobal(lpar.momentum()), + lpar.charge(), + fieldESH.product()); + JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, *fieldESH.product()); - AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m); + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m); - int ndof = 2*hits.size()-5; - chi2 = chi2*ndof; // FIXME + int ndof = 2 * hits.size() - 5; + chi2 = chi2 * ndof; // FIXME GlobalPoint vv = gp.position(); - math::XYZPoint pos( vv.x(), vv.y(), vv.z() ); + math::XYZPoint pos(vv.x(), vv.y(), vv.z()); GlobalVector pp = gp.momentum(); - math::XYZVector mom( pp.x(), pp.y(), pp.z() ); + math::XYZVector mom(pp.x(), pp.y(), pp.z()); - auto track = std::make_unique ( chi2, ndof, pos, mom, - gp.charge(), CurvilinearTrajectoryError(mo)); + auto track = std::make_unique(chi2, ndof, pos, mom, gp.charge(), CurvilinearTrajectoryError(mo)); // filter??? 
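The SoA stores chi2/ndof (see the TrackSoAT comment earlier in this patch), while reco::Track expects the total chi2, hence the multiplication by ndof here. A short illustrative sketch of that bookkeeping, not part of the patch:

// a pixel track has 5 fitted parameters and each hit contributes 2 measurements
constexpr int ndofFor(int nHits) { return 2 * nHits - 5; }

// convert the stored chi2/ndof back to a total chi2, as done above
constexpr float totalChi2(float chi2OverNdof, int nHits) { return chi2OverNdof * ndofFor(nHits); }

static_assert(ndofFor(4) == 3, "quadruplet: ndof = 2*4 - 5");
static_assert(ndofFor(5) == 5, "pentuplet: ndof = 2*5 - 5");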
tracks.emplace_back(track.release(), hits); } @@ -207,5 +206,4 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEve iEvent.put(std::move(indToEdmP)); } - DEFINE_FWK_MODULE(PixelTrackProducerFromSoA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index c8dc04633f832..2d6da6a631151 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -17,7 +17,6 @@ #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" - #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { @@ -33,43 +32,34 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; cudautils::host::unique_ptr m_soa; - }; -PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) : - tokenCUDA_(consumes>(iConfig.getParameter("src"))), - tokenSOA_(produces()) -{} - +PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag("caHitNtupletCUDA")); - descriptions.add("pixelTrackSoA", desc); - + desc.add("src", edm::InputTag("caHitNtupletCUDA")); + descriptions.add("pixelTrackSoA", desc); } - void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { CUDAProduct const& inputDataWrapped = iEvent.get(tokenCUDA_); CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); m_soa = inputData.toHostAsync(ctx.stream()); - } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { - /* auto const & tsoa = *m_soa; auto maxTracks = tsoa.stride(); @@ -86,10 +76,9 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i */ // DO NOT make a copy (actually TWO....) 
- iEvent.emplace(tokenSOA_,PixelTrackHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(m_soa))); assert(!m_soa); } - DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h index 13bdee8164780..59101b6ba5214 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h @@ -16,62 +16,57 @@ #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" #include "Geometry/Records/interface/TrackerTopologyRcd.h" -template -void storeTracks(Ev & ev, const TWH& tracksWithHits, const TrackerTopology& ttopo) -{ +template +void storeTracks(Ev& ev, const TWH& tracksWithHits, const TrackerTopology& ttopo) { auto tracks = std::make_unique(); auto recHits = std::make_unique(); auto trackExtras = std::make_unique(); int cc = 0, nTracks = tracksWithHits.size(); - for (int i = 0; i < nTracks; i++) - { - reco::Track* track = tracksWithHits[i].first; - const auto & hits = tracksWithHits[i].second; + for (int i = 0; i < nTracks; i++) { + reco::Track* track = tracksWithHits[i].first; + const auto& hits = tracksWithHits[i].second; - for (unsigned int k = 0; k < hits.size(); k++) - { - auto * hit = hits[k]->clone(); + for (unsigned int k = 0; k < hits.size(); k++) { + auto* hit = hits[k]->clone(); track->appendHitPattern(*hit, ttopo); recHits->push_back(hit); } tracks->push_back(*track); delete track; - } - LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event" << "\n"; - edm::OrphanHandle ohRH = ev.put(std::move(recHits)); + LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event" + << "\n"; + edm::OrphanHandle ohRH = ev.put(std::move(recHits)); edm::RefProd hitCollProd(ohRH); - for (int k = 0; k < nTracks; k++) - { + for (int k = 0; k < nTracks; k++) { reco::TrackExtra theTrackExtra{}; //fill the TrackExtra with TrackingRecHitRef unsigned int nHits = tracks->at(k).numberOfValidHits(); theTrackExtra.setHits(hitCollProd, cc, nHits); - cc +=nHits; - AlgebraicVector5 v = AlgebraicVector5(0,0,0,0,0); - reco::TrackExtra::TrajParams trajParams(nHits,LocalTrajectoryParameters(v,1.)); - reco::TrackExtra::Chi2sFive chi2s(nHits,0); - theTrackExtra.setTrajParams(std::move(trajParams),std::move(chi2s)); + cc += nHits; + AlgebraicVector5 v = AlgebraicVector5(0, 0, 0, 0, 0); + reco::TrackExtra::TrajParams trajParams(nHits, LocalTrajectoryParameters(v, 1.)); + reco::TrackExtra::Chi2sFive chi2s(nHits, 0); + theTrackExtra.setTrajParams(std::move(trajParams), std::move(chi2s)); trackExtras->push_back(theTrackExtra); } - LogDebug("TrackProducer") << "put the collection of TrackExtra in the event" << "\n"; + LogDebug("TrackProducer") << "put the collection of TrackExtra in the event" + << "\n"; edm::OrphanHandle ohTE = ev.put(std::move(trackExtras)); - for (int k = 0; k < nTracks; k++) - { - const reco::TrackExtraRef theTrackExtraRef(ohTE,k); + for (int k = 0; k < nTracks; k++) { + const reco::TrackExtraRef theTrackExtraRef(ohTE, k); (tracks->at(k)).setExtra(theTrackExtraRef); } ev.put(std::move(tracks)); - } #endif diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp index d294e4cc6c1d6..709757a803884 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp @@ -1,9 
+1,8 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" -#include +#include -using Rfit::Vector5d; using Rfit::Matrix5d; - +using Rfit::Vector5d; #include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" @@ -17,102 +16,93 @@ using Rfit::Matrix5d; namespace { - struct M5T : public MagneticField { - M5T() : mf(0.,0.,5.){} - virtual GlobalVector inTesla (const GlobalPoint&) const { - return mf; - } + struct M5T : public MagneticField { + M5T() : mf(0., 0., 5.) {} + virtual GlobalVector inTesla(const GlobalPoint&) const { return mf; } GlobalVector mf; }; -} +} // namespace // old pixeltrack version... -Matrix5d transfFast(Matrix5d cov, Vector5d const & p) { - auto sqr = [](auto x) { return x*x;}; - auto sinTheta = 1/std::sqrt(1+p(3)*p(3)); - auto cosTheta = p(3)*sinTheta; - cov(2,2) = sqr(sinTheta) * ( - cov(2,2)*sqr(1./(p(2)*p(2))) - + cov(3,3)*sqr(cosTheta*sinTheta/p(2)) - ); - cov(3,2) = cov(2,3) = cov(3,3) * cosTheta * sqr(sinTheta) / p(2); +Matrix5d transfFast(Matrix5d cov, Vector5d const& p) { + auto sqr = [](auto x) { return x * x; }; + auto sinTheta = 1 / std::sqrt(1 + p(3) * p(3)); + auto cosTheta = p(3) * sinTheta; + cov(2, 2) = sqr(sinTheta) * (cov(2, 2) * sqr(1. / (p(2) * p(2))) + cov(3, 3) * sqr(cosTheta * sinTheta / p(2))); + cov(3, 2) = cov(2, 3) = cov(3, 3) * cosTheta * sqr(sinTheta) / p(2); // for (int i=0; i<5; ++i) cov(i,2) *= -sinTheta/(p(2)*p(2)); // for (int i=0; i<5; ++i) cov(2,i) *= -sinTheta/(p(2)*p(2)); return cov; - - } -Matrix5d loadCov(Vector5d const & e) { - +Matrix5d loadCov(Vector5d const& e) { Matrix5d cov; - for (int i=0; i<5; ++i) cov(i,i) = e(i)*e(i); + for (int i = 0; i < 5; ++i) + cov(i, i) = e(i) * e(i); for (int i = 0; i < 5; ++i) { for (int j = 0; j < i; ++j) { - double v = 0.3*std::sqrt( cov(i,i) * cov(j,j) ); // this makes the matrix pos defined - cov(i,j) = (i+j)%2 ? -0.4*v : 0.1*v; - cov(j,i) = cov(i,j); + double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j)); // this makes the matrix pos defined + cov(i, j) = (i + j) % 2 ? 
-0.4 * v : 0.1 * v; + cov(j, i) = cov(i, j); } - } + } return cov; } - -#include +#include int main() { + M5T const mf; - M5T const mf; - - for (auto charge=-1; charge<2; charge+=2) - for (auto szip=-1; szip<2; szip+=2) - for (auto stip=-1; stip<2; stip+=2) - { - Vector5d par0; par0 << 0.2,0.1,3.5,0.8,0.1; - Vector5d del0; del0 << 0.01,0.01,0.035,-0.03,-0.01; - //!<(phi,Tip,pt,cotan(theta)),Zip) - par0(1) *= stip; - par0(4) *= szip; + for (auto charge = -1; charge < 2; charge += 2) + for (auto szip = -1; szip < 2; szip += 2) + for (auto stip = -1; stip < 2; stip += 2) { + Vector5d par0; + par0 << 0.2, 0.1, 3.5, 0.8, 0.1; + Vector5d del0; + del0 << 0.01, 0.01, 0.035, -0.03, -0.01; + //!<(phi,Tip,pt,cotan(theta)),Zip) + par0(1) *= stip; + par0(4) *= szip; - Matrix5d cov0 = loadCov(del0); + Matrix5d cov0 = loadCov(del0); - Vector5d par1; - Vector5d par2; + Vector5d par1; + Vector5d par2; - Matrix5d cov1; - Matrix5d cov2; + Matrix5d cov1; + Matrix5d cov2; - // Matrix5d covf = transfFast(cov0,par0); + // Matrix5d covf = transfFast(cov0,par0); - Rfit::transformToPerigeePlane(par0,cov0,par1,cov1); - - std::cout << "cov1\n" << cov1 << std::endl; + Rfit::transformToPerigeePlane(par0, cov0, par1, cov1); + std::cout << "cov1\n" << cov1 << std::endl; - LocalTrajectoryParameters lpar(par1(0),par1(1),par1(2),par1(3),par1(4),1.); - AlgebraicSymMatrix55 m; - for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = cov1(i,j); + LocalTrajectoryParameters lpar(par1(0), par1(1), par1(2), par1(3), par1(4), 1.); + AlgebraicSymMatrix55 m; + for (int i = 0; i < 5; ++i) + for (int j = i; j < 5; ++j) + m(i, j) = cov1(i, j); - float phi = par0(0); - float sp = std::sin(phi); - float cp = std::cos(phi); - Surface::RotationType rot( - sp, -cp, 0, - 0, 0, -1.f, - cp, sp, 0); + float phi = par0(0); + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0); - Surface::PositionType bs(0., 0., 0.); - Plane plane(bs,rot); - GlobalTrajectoryParameters gp(plane.toGlobal(lpar.position()), plane.toGlobal(lpar.momentum()),lpar.charge(),&mf); - std::cout << "global par " << gp.position() << ' ' << gp.momentum() << ' ' << gp.charge() << std::endl; - JacobianLocalToCurvilinear jl2c(plane,lpar,mf); - std::cout << "jac l2c" << jl2c.jacobian() << std::endl; + Surface::PositionType bs(0., 0., 0.); + Plane plane(bs, rot); + GlobalTrajectoryParameters gp( + plane.toGlobal(lpar.position()), plane.toGlobal(lpar.momentum()), lpar.charge(), &mf); + std::cout << "global par " << gp.position() << ' ' << gp.momentum() << ' ' << gp.charge() << std::endl; + JacobianLocalToCurvilinear jl2c(plane, lpar, mf); + std::cout << "jac l2c" << jl2c.jacobian() << std::endl; - AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m); - std::cout << "curv error\n" << mo << std::endl; + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m); + std::cout << "curv error\n" << mo << std::endl; - /* + /* // not accurate as the perigee plane move as well... 
Vector5d del1 = par2-par1; @@ -136,12 +126,9 @@ int main() { std::cout << "cov2\n" << cov2 << std::endl; */ - std::cout << std::endl << "----------" << std::endl; + std::cout << std::endl << "----------" << std::endl; - - } // lopp over signs + } // lopp over signs return 0; - - } diff --git a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h index 79bb128eeec8a..6377628b0eeca 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h +++ b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h @@ -5,37 +5,34 @@ #include #include -template -__host__ __device__ -void printIt(C * m) { +template +__host__ __device__ void printIt(C* m) { #ifdef TEST_DEBUG printf("\nMatrix %dx%d\n", (int)m->rows(), (int)m->cols()); for (u_int r = 0; r < m->rows(); ++r) { for (u_int c = 0; c < m->cols(); ++c) { - printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r,c)); + printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r, c)); } } #endif } -template +template bool isEqualFuzzy(C1 a, C2 b, double epsilon = 1e-6) { for (unsigned int i = 0; i < a.rows(); ++i) { for (unsigned int j = 0; j < a.cols(); ++j) { - assert(std::abs(a(i,j)-b(i,j)) - < std::min(std::abs(a(i,j)), std::abs(b(i,j)))*epsilon); + assert(std::abs(a(i, j) - b(i, j)) < std::min(std::abs(a(i, j)), std::abs(b(i, j))) * epsilon); } } return true; } -bool isEqualFuzzy(double a, double b, double epsilon=1e-6) { - return std::abs(a-b) < std::min(std::abs(a), std::abs(b))*epsilon; +bool isEqualFuzzy(double a, double b, double epsilon = 1e-6) { + return std::abs(a - b) < std::min(std::abs(a), std::abs(b)) * epsilon; } - -template -void fillMatrix(T & t) { +template +void fillMatrix(T& t) { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<> dis(0.0, 2.0); @@ -47,5 +44,4 @@ void fillMatrix(T & t) { return; } - #endif diff --git a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h index 721a5dfaef07e..065dcd0f3ecb1 100644 --- a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h +++ b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h @@ -21,113 +21,114 @@ class TrackingRegion; class SeedingLayerSetsHits; namespace edm { - class Event; - class EventSetup; - class ParameterSetDescription; -} + class Event; + class EventSetup; + class ParameterSetDescription; +} // namespace edm class CAHitQuadrupletGenerator { public: - typedef LayerHitMapCache LayerCacheType; + typedef LayerHitMapCache LayerCacheType; - static constexpr unsigned int minLayers = 4; - typedef OrderedHitSeeds ResultType; + static constexpr unsigned int minLayers = 4; + typedef OrderedHitSeeds ResultType; public: + CAHitQuadrupletGenerator(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC) + : CAHitQuadrupletGenerator(cfg, iC) {} + CAHitQuadrupletGenerator(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC); - CAHitQuadrupletGenerator(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC): CAHitQuadrupletGenerator(cfg, iC) {} - CAHitQuadrupletGenerator(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC); + ~CAHitQuadrupletGenerator() = default; - ~CAHitQuadrupletGenerator() = default; + static void fillDescriptions(edm::ParameterSetDescription& desc); + static const char* fillDescriptionsLabel() { return "caHitQuadrupletDefault"; } - static void fillDescriptions(edm::ParameterSetDescription& desc); - static const char *fillDescriptionsLabel() { return 
"caHitQuadrupletDefault"; } + void initEvent(const edm::Event& ev, const edm::EventSetup& es); - void initEvent(const edm::Event& ev, const edm::EventSetup& es); - - void hitNtuplets(const IntermediateHitDoublets& regionDoublets, - std::vector& result, - const edm::EventSetup& es, - const SeedingLayerSetsHits& layers); + void hitNtuplets(const IntermediateHitDoublets& regionDoublets, + std::vector& result, + const edm::EventSetup& es, + const SeedingLayerSetsHits& layers); private: - LayerCacheType theLayerCache; - - std::unique_ptr theComparitor; - - class QuantityDependsPtEval { - public: - - QuantityDependsPtEval(float v1, float v2, float c1, float c2) : - value1_(v1), value2_(v2), curvature1_(c1), curvature2_(c2) { - } - - float value(float curvature) const { - if (value1_ == value2_) // not enabled - return value1_; - - if (curvature1_ < curvature) - return value1_; - if (curvature2_ < curvature && curvature <= curvature1_) - return value2_ + (curvature - curvature2_) / (curvature1_ - curvature2_) * (value1_ - value2_); - return value2_; - } - - private: - const float value1_; - const float value2_; - const float curvature1_; - const float curvature2_; - }; - - // Linear interpolation (in curvature) between value1 at pt1 and - // value2 at pt2. If disabled, value2 is given (the point is to - // allow larger/smaller values of the quantity at low pt, so it - // makes more sense to have the high-pt value as the default). - - class QuantityDependsPt { - public: - - explicit QuantityDependsPt(const edm::ParameterSet& pset) : - value1_(pset.getParameter("value1")), - value2_(pset.getParameter("value2")), - pt1_(pset.getParameter("pt1")), - pt2_(pset.getParameter("pt2")), - enabled_(pset.getParameter("enabled")) { - if (enabled_ && pt1_ >= pt2_) - throw cms::Exception("Configuration") << "PixelQuadrupletGenerator::QuantityDependsPt: pt1 (" << pt1_ << ") needs to be smaller than pt2 (" << pt2_ << ")"; - if (pt1_ <= 0) - throw cms::Exception("Configuration") << "PixelQuadrupletGenerator::QuantityDependsPt: pt1 needs to be > 0; is " << pt1_; - if (pt2_ <= 0) - throw cms::Exception("Configuration") << "PixelQuadrupletGenerator::QuantityDependsPt: pt2 needs to be > 0; is " << pt2_; - } - - QuantityDependsPtEval evaluator(const edm::EventSetup& es) const { - if (enabled_) { - return QuantityDependsPtEval(value1_, value2_, - PixelRecoUtilities::curvature(1.f / pt1_, es), - PixelRecoUtilities::curvature(1.f / pt2_, es)); - } - return QuantityDependsPtEval(value2_, value2_, 0.f, 0.f); - } - - private: - const float value1_; - const float value2_; - const float pt1_; - const float pt2_; - const bool enabled_; - }; - - const float extraHitRPhitolerance; - - const QuantityDependsPt maxChi2; - const bool fitFastCircle; - const bool fitFastCircleChi2Cut; - const bool useBendingCorrection; - - const float caThetaCut = 0.00125f; - const float caPhiCut = 0.1f; - const float caHardPtCut = 0.f; + LayerCacheType theLayerCache; + + std::unique_ptr theComparitor; + + class QuantityDependsPtEval { + public: + QuantityDependsPtEval(float v1, float v2, float c1, float c2) + : value1_(v1), value2_(v2), curvature1_(c1), curvature2_(c2) {} + + float value(float curvature) const { + if (value1_ == value2_) // not enabled + return value1_; + + if (curvature1_ < curvature) + return value1_; + if (curvature2_ < curvature && curvature <= curvature1_) + return value2_ + (curvature - curvature2_) / (curvature1_ - curvature2_) * (value1_ - value2_); + return value2_; + } + + private: + const float value1_; + const float 
value2_; + const float curvature1_; + const float curvature2_; + }; + + // Linear interpolation (in curvature) between value1 at pt1 and + // value2 at pt2. If disabled, value2 is given (the point is to + // allow larger/smaller values of the quantity at low pt, so it + // makes more sense to have the high-pt value as the default). + + class QuantityDependsPt { + public: + explicit QuantityDependsPt(const edm::ParameterSet& pset) + : value1_(pset.getParameter("value1")), + value2_(pset.getParameter("value2")), + pt1_(pset.getParameter("pt1")), + pt2_(pset.getParameter("pt2")), + enabled_(pset.getParameter("enabled")) { + if (enabled_ && pt1_ >= pt2_) + throw cms::Exception("Configuration") << "PixelQuadrupletGenerator::QuantityDependsPt: pt1 (" << pt1_ + << ") needs to be smaller than pt2 (" << pt2_ << ")"; + if (pt1_ <= 0) + throw cms::Exception("Configuration") + << "PixelQuadrupletGenerator::QuantityDependsPt: pt1 needs to be > 0; is " << pt1_; + if (pt2_ <= 0) + throw cms::Exception("Configuration") + << "PixelQuadrupletGenerator::QuantityDependsPt: pt2 needs to be > 0; is " << pt2_; + } + + QuantityDependsPtEval evaluator(const edm::EventSetup& es) const { + if (enabled_) { + return QuantityDependsPtEval(value1_, + value2_, + PixelRecoUtilities::curvature(1.f / pt1_, es), + PixelRecoUtilities::curvature(1.f / pt2_, es)); + } + return QuantityDependsPtEval(value2_, value2_, 0.f, 0.f); + } + + private: + const float value1_; + const float value2_; + const float pt1_; + const float pt2_; + const bool enabled_; + }; + + const float extraHitRPhitolerance; + + const QuantityDependsPt maxChi2; + const bool fitFastCircle; + const bool fitFastCircleChi2Cut; + const bool useBendingCorrection; + + const float caThetaCut = 0.00125f; + const float caPhiCut = 0.1f; + const float caHardPtCut = 0.f; }; #endif diff --git a/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h index fa538256ed010..dfe7da010f99e 100644 --- a/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h +++ b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h @@ -18,86 +18,67 @@ | */ -#include +#include -template +template class CircleEq { - public: + CircleEq() {} - CircleEq(){} - - constexpr CircleEq(T x1, T y1, - T x2, T y2, - T x3, T y3) { - compute(x1,y1,x2,y2,x3,y3); - } + constexpr CircleEq(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); } - constexpr void compute(T x1, T y1, - T x2, T y2, - T x3, T y3); + constexpr void compute(T x1, T y1, T x2, T y2, T x3, T y3); // dca to origin divided by curvature constexpr T dca0() const { - auto x = m_c*m_xp + m_alpha; - auto y = m_c*m_yp + m_beta; - return std::sqrt(x*x+y*y) - T(1); + auto x = m_c * m_xp + m_alpha; + auto y = m_c * m_yp + m_beta; + return std::sqrt(x * x + y * y) - T(1); } - // dca to given point (divided by curvature) + // dca to given point (divided by curvature) constexpr T dca(T x, T y) const { - x = m_c*(m_xp-x) + m_alpha; - y = m_c*(m_yp-y) + m_beta; - return std::sqrt(x*x+y*y) - T(1); - + x = m_c * (m_xp - x) + m_alpha; + y = m_c * (m_yp - y) + m_beta; + return std::sqrt(x * x + y * y) - T(1); } // curvature - constexpr auto curvature() const { return m_c;} - + constexpr auto curvature() const { return m_c; } // alpha and beta - constexpr std::pair cosdir() const { - return std::make_pair(m_alpha, m_beta); - } - + constexpr std::pair cosdir() const { return std::make_pair(m_alpha, m_beta); } // alpha and beta af given point - constexpr std::pair cosdir(T x, T y) const { - 
return std::make_pair(m_alpha - m_c*(x-m_xp), m_beta - m_c*(y-m_yp)); + constexpr std::pair cosdir(T x, T y) const { + return std::make_pair(m_alpha - m_c * (x - m_xp), m_beta - m_c * (y - m_yp)); } // center - constexpr std::pair center() const { - return std::make_pair(m_xp + m_alpha/m_c, m_yp + m_beta/m_c); - } - - constexpr auto radius() const { return T(1)/m_c;} + constexpr std::pair center() const { return std::make_pair(m_xp + m_alpha / m_c, m_yp + m_beta / m_c); } - T m_xp=0; - T m_yp=0; - T m_c=0; - T m_alpha=0; - T m_beta=0; + constexpr auto radius() const { return T(1) / m_c; } + T m_xp = 0; + T m_yp = 0; + T m_c = 0; + T m_alpha = 0; + T m_beta = 0; }; +template +constexpr void CircleEq::compute(T x1, T y1, T x2, T y2, T x3, T y3) { + bool noflip = std::abs(x3 - x1) < std::abs(y3 - y1); -template -constexpr void CircleEq::compute(T x1, T y1, - T x2, T y2, - T x3, T y3) { - bool noflip = std::abs(x3-x1) < std::abs(y3-y1); - - auto x1p = noflip ? x1-x2 : y1-y2; - auto y1p = noflip ? y1-y2 : x1-x2; - auto d12 = x1p*x1p + y1p*y1p; - auto x3p = noflip ? x3-x2 : y3-y2; - auto y3p = noflip ? y3-y2 : x3-x2; - auto d32 = x3p*x3p + y3p*y3p; + auto x1p = noflip ? x1 - x2 : y1 - y2; + auto y1p = noflip ? y1 - y2 : x1 - x2; + auto d12 = x1p * x1p + y1p * y1p; + auto x3p = noflip ? x3 - x2 : y3 - y2; + auto y3p = noflip ? y3 - y2 : x3 - x2; + auto d32 = x3p * x3p + y3p * y3p; - auto num = x1p*y3p-y1p*x3p; // num also gives correct sign for CT - auto det = d12*y3p-d32*y1p; + auto num = x1p * y3p - y1p * x3p; // num also gives correct sign for CT + auto det = d12 * y3p - d32 * y1p; /* auto ct = num/det; @@ -109,20 +90,18 @@ constexpr void CircleEq::compute(T x1, T y1, ct *= T(2.)*al2; */ - auto st2 = (d12*x3p-d32*x1p); - auto seq = det*det +st2*st2; - auto al2 = T(1.)/std::sqrt(seq); - auto be2 = -st2*al2; - auto ct = T(2.)*num*al2; - al2 *=det; + auto st2 = (d12 * x3p - d32 * x1p); + auto seq = det * det + st2 * st2; + auto al2 = T(1.) / std::sqrt(seq); + auto be2 = -st2 * al2; + auto ct = T(2.) * num * al2; + al2 *= det; m_xp = x2; m_yp = y2; m_c = noflip ? ct : -ct; m_alpha = noflip ? al2 : -be2; - m_beta = noflip ? be2 : -al2; - + m_beta = noflip ? 
be2 : -al2; } #endif - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index 7f94fd05ece77..cc5865d97fd95 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,95 +1,67 @@ #include "BrokenLineFitOnGPU.h" -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const * hv, - uint32_t hitsInFit, - uint32_t maxNumberOfTuples) { +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { assert(tuples_d); // Fit internals - auto hitsGPU_ = std::make_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); auto fast_fit_resultsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernelBLFastFit<3>(tuples_d, - tupleMultiplicity_d, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); + kernelBLFastFit<3>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); kernelBLFit<3>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); // fit quads - kernelBLFastFit<4>(tuples_d, - tupleMultiplicity_d, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + kernelBLFastFit<4>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4>(tuples_d, - tupleMultiplicity_d, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernelBLFastFit<4>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); } else { // fit penta (all 5) - kernelBLFastFit<5>(tuples_d, - tupleMultiplicity_d, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernelBLFastFit<5>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); kernelBLFit<5>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + 
fast_fit_resultsGPU_.get(), + 5, + offset); } } // loop on concurrent fits diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 4dc93f961b4a2..3825c6d812cfb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,7 +1,7 @@ #include "BrokenLineFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const * hv, +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cuda::stream_t<> &stream) { @@ -13,21 +13,15 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const * hv, // Fit internals auto hitsGPU_ = cudautils::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto hits_geGPU_ = cudautils::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = cudautils::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernelBLFastFit<3><<>>(tuples_d, - tupleMultiplicity_d, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); + kernelBLFastFit<3><<>>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); kernelBLFit<3><<>>(tupleMultiplicity_d, @@ -41,9 +35,13 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const * hv, cudaCheck(cudaGetLastError()); // fit quads - kernelBLFastFit<4><<>>(tuples_d, - tupleMultiplicity_d, - hv, + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<4><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -51,58 +49,36 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const * hv, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); - cudaCheck(cudaGetLastError()); - if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4><<>>(tuples_d, - tupleMultiplicity_d, - hv, + kernelBLFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<4><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); } else { // fit penta (all 5) - kernelBLFastFit<5><<>>(tuples_d, - tupleMultiplicity_d, - hv, + kernelBLFastFit<5><<>>( + 
tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + cudaCheck(cudaGetLastError()); + + kernelBLFit<5><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - - kernelBLFit<5><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); } } // loop on concurrent fits diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index e79b55a69227d..82a5bee443f88 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -47,14 +47,16 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, #ifdef BROKENLINE_DEBUG if (0 == local_start) { - printf("%d total Ntuple\n",foundNtuplets->nbins()); + printf("%d total Ntuple\n", foundNtuplets->nbins()); printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); } #endif - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); @@ -67,10 +69,10 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, Rfit::Map6xNf hits_ge(phits_ge + local_idx); #ifdef BL_DUMP_HITS - __shared__ int done; - done = 0; - __syncthreads(); - bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1)); + __shared__ int done; + done = 0; + __syncthreads(); + bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1)); #endif // Prepare data structure @@ -78,27 +80,29 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; float ge[6]; - hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + hhp->cpeParams() + .detParams(hhp->detectorIndex(hit)) + .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); #ifdef BL_DUMP_HITS - if (dump) { - printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", - tkid, - hhp->detectorIndex(hit), - i, - hhp->xGlobal(hit), - hhp->yGlobal(hit), - hhp->zGlobal(hit)); - printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", - tkid, - hhp->detetectorIndex(hit), - i, - ge[0], - ge[1], - ge[2], - ge[3], - ge[4], - ge[5]); - } + if (dump) { + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", + tkid, + hhp->detectorIndex(hit), + i, + hhp->xGlobal(hit), + hhp->yGlobal(hit), + hhp->zGlobal(hit)); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", + tkid, + hhp->detetectorIndex(hit), + i, + ge[0], + ge[1], + ge[2], + ge[3], + ge[4], + ge[5]); + } #endif hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; @@ -131,9 +135,11 @@ __global__ void 
kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; // get it for the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); @@ -152,32 +158,30 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); - results->stateAtBS.copyFromCircle(circle.par,circle.cov,line.par,line.cov,1.f/float(B),tkid); - results->pt(tkid) = float(B)/float(std::abs(circle.par(2))); - results->eta(tkid) = asinhf(line.par(0)); - results->chi2(tkid) = (circle.chi2+line.chi2)/(2*N-5); + results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(B), tkid); + results->pt(tkid) = float(B) / float(std::abs(circle.par(2))); + results->eta(tkid) = asinhf(line.par(0)); + results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); #ifdef BROKENLINE_DEBUG - if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) - printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2); - printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", - N, - nHits, - tkid, - circle.par(0), - circle.par(1), - circle.par(2)); - printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); - printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", - circle.chi2, - line.chi2, - circle.cov(0, 0), - circle.cov(1, 1), - circle.cov(2, 2), - line.cov(0, 0), - line.cov(1, 1)); + if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) + printf("kernelBLFit failed! 
%f/%f\n", circle.chi2, line.chi2); + printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle.par(0), + circle.par(1), + circle.par(2)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); + printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle.chi2, + line.chi2, + circle.cov(0, 0), + circle.cov(1, 1), + circle.cov(2, 2), + line.cov(0, 0), + line.cov(1, 1)); #endif } } - - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 06d5fdf7dd898..d93d89965607f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -23,14 +23,14 @@ namespace CAConstants { constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT #ifndef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfDoublets() { return 448*1024; } + constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; } constexpr uint32_t maxCellsPerHit() { return 128; } #else - constexpr uint32_t maxNumberOfDoublets() { return 128*1024; } + constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } constexpr uint32_t maxCellsPerHit() { return 128 / 2; } #endif #else - constexpr uint32_t maxNumberOfDoublets() { return 448*1024; } + constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; } constexpr uint32_t maxCellsPerHit() { return 4 * 128; } #endif constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 88d9f7934275d..f70b61d406d1a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -22,7 +22,6 @@ #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" - class CAHitNtupletCUDA : public edm::global::EDProducer<> { public: explicit CAHitNtupletCUDA(const edm::ParameterSet& iConfig); @@ -41,27 +40,23 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { edm::EDPutTokenT tokenTrackCPU_; CAHitNtupletGeneratorOnGPU gpuAlgo_; - }; -CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) : - m_OnGPU(iConfig.getParameter("onGPU")), - gpuAlgo_(iConfig, consumesCollector()) { +CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) + : m_OnGPU(iConfig.getParameter("onGPU")), gpuAlgo_(iConfig, consumesCollector()) { if (m_OnGPU) { - tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); + tokenTrackGPU_ = produces>(); } else { - tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackCPU_ = produces(); + tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); + tokenTrackCPU_ = produces(); } - } - void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("onGPU",true); + desc.add("onGPU", true); desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); CAHitNtupletGeneratorOnGPU::fillDescriptions(desc); @@ -70,31 +65,20 @@ void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descript } void CAHitNtupletCUDA::produce(edm::StreamID streamID, 
edm::Event& iEvent, const edm::EventSetup& es) const { - - auto bf = 1./PixelRecoUtilities::fieldInInvGev(es); + auto bf = 1. / PixelRecoUtilities::fieldInInvGev(es); if (m_OnGPU) { - edm::Handle> hHits; + edm::Handle> hHits; iEvent.getByToken(tokenHitGPU_, hHits); CUDAScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); - ctx.emplace( - iEvent, - tokenTrackGPU_, - std::move(gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())) - ); + ctx.emplace(iEvent, tokenTrackGPU_, std::move(gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream()))); } else { - auto const& hits = iEvent.get(tokenHitCPU_); iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); - } - - } - - DEFINE_FWK_MODULE(CAHitNtupletCUDA); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index d18ef856d2782..d85e09d7e2df6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -1,19 +1,16 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template<> -void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const * counters) { - kernel_printCounters(counters); +template <> +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { + kernel_printCounters(counters); } - -template<> -void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const * hv, TkSoA * tracks_d, cudaStream_t) { +template <> +void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) { kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); } - - -template<> +template <> void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) { auto nhits = hh.nHits(); @@ -24,60 +21,54 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cuda::s // in principle we can use "nhits" to heuristically dimension the workspace... 
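These CPU specializations reuse the same kernel bodies as the CUDA versions: on the GPU a kernel is launched over a grid and iterates with a grid-stride loop, while on the CPU the identical function is called directly and the loop degenerates to a plain serial loop (a compatibility layer is assumed to supply trivial thread indices on the CPU; the toy sketch below makes that explicit with an #ifdef instead, and is not part of this patch):

#include <cstdio>

#ifdef __CUDACC__
#define TOY_KERNEL __global__
#else
#define TOY_KERNEL
#endif

TOY_KERNEL void kernel_fill(float* out, int n, float value) {
#ifdef __CUDACC__
  int first = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per element on GPU
  int stride = gridDim.x * blockDim.x;                // grid-stride loop
#else
  int first = 0, stride = 1;                          // plain serial loop on CPU
#endif
  for (int i = first; i < n; i += stride)
    out[i] = value;
}

int main() {
  float buf[8] = {};
#ifdef __CUDACC__
  // GPU build: one would launch kernel_fill<<<1, 32>>>(dev_buf, 8, 1.f);
  // (device allocation and memcpy omitted in this sketch)
#else
  kernel_fill(buf, 8, 1.f);  // CPU build: direct call, same body
#endif
  std::printf("%f\n", buf[0]);
  return 0;
}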
// overkill to use template here (std::make_unique would suffice) // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); - device_isOuterHitOfCell_.reset((GPUCACell::OuterHitOfCell*)malloc(std::max(1U,nhits)*sizeof(GPUCACell::OuterHitOfCell))); + device_isOuterHitOfCell_.reset( + (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); assert(device_isOuterHitOfCell_.get()); gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + nhits, + device_theCellNeighbors_, + device_theCellNeighborsContainer_.get(), + device_theCellTracks_, + device_theCellTracksContainer_.get()); // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); - device_theCells_.reset((GPUCACell*)malloc(sizeof(GPUCACell)*m_params.maxNumberOfDoublets_)); + device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); if (0 == nhits) return; // protect against empty events // FIXME avoid magic numbers - auto nActualPairs=gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) nActualPairs = 15; - if (m_params.minHitsPerNtuplet_>3) { + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (m_params.minHitsPerNtuplet_ > 3) { nActualPairs = 13; } - assert(nActualPairs<=gpuPixelDoublets::nPairs); + assert(nActualPairs <= gpuPixelDoublets::nPairs); gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZCut_, - m_params.doPhiCut_, - m_params.maxNumberOfDoublets_); - - + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZCut_, + m_params.doPhiCut_, + m_params.maxNumberOfDoublets_); } - -template<> -void CAHitNtupletGeneratorKernelsCPU::launchKernels( - HitsOnCPU const &hh, - TkSoA * tracks_d, - cudaStream_t cudaStream) { - - auto * tuples_d = &tracks_d->hitIndices; - auto * quality_d = (Quality*)(&tracks_d->m_quality); +template <> +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); assert(tuples_d && quality_d); // zero tuples cudautils::launchZero(tuples_d, cudaStream); - auto nhits = hh.nHits(); assert(nhits <= pixelGPUConstants::maxNumberOfHits); @@ -88,107 +79,86 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels( // applying conbinatoric cleaning such as fishbone at this stage is too expensive // - kernel_connect( - device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - - - if (nhits > 1 && m_params.earlyFishbone_) { - fishbone( - hh.view(), device_theCells_.get(), device_nCells_, 
device_isOuterHitOfCell_.get(), nhits, false); + kernel_connect(device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + + if (nhits > 1 && m_params.earlyFishbone_) { + fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); } - kernel_find_ntuplets(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_, - tuples_d, - device_hitTuple_apc_, - quality_d, - m_params.minHitsPerNtuplet_); + device_theCells_.get(), + device_nCells_, + device_theCellTracks_, + tuples_d, + device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); if (m_params.doStats_) - kernel_mark_used(hh.view(), - device_theCells_.get(), - device_nCells_); - + kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover( - device_theCells_.get(), device_nCells_, tuples_d, quality_d); + kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); if (nhits > 1 && m_params.lateFishbone_) { - fishbone( - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); } - - if (m_params.doStats_) { + if (m_params.doStats_) { kernel_checkOverflows(tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + device_isOuterHitOfCell_.get(), + nhits, + m_params.maxNumberOfDoublets_, + counters_); } - } - - -template<> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, - TkSoA * tracks_d, - cudaStream_t cudaStream) { - auto const * tuples_d = &tracks_d->hitIndices; - auto * quality_d = (Quality*)(&tracks_d->m_quality); +template <> +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); // classify tracks based on kinematics - kernel_classifyTracks( - tuples_d, tracks_d, m_params.cuts_, quality_d); + kernel_classifyTracks(tuples_d, tracks_d, m_params.cuts_, quality_d); if (m_params.lateFishbone_) { // apply fishbone cleaning to good tracks - kernel_fishboneCleaner( - device_theCells_.get(), device_nCells_, quality_d); + kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); } // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover( - device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + 
kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d); // fill hit->track "map" - kernel_countHitInTracks( - tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); - kernel_fillHitInTracks( - tuples_d, quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); // remove duplicates (tracks that share a hit) - kernel_tripletCleaner( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); if (m_params.doStats_) { // counters (add flag???) @@ -196,10 +166,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, kernel_doStatsForTracks(tuples_d, quality_d, counters_); } -#ifdef DUMP_GPU_TK_TUPLES +#ifdef DUMP_GPU_TK_TUPLES static std::atomic iev(0); ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100,iev); + kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); #endif - } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 4ca6d9988dcd6..93af3d43ff06e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,31 +1,27 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template<> -void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const * hv, TkSoA * tracks_d, cudaStream_t cudaStream) { - auto blockSize=128; +template <> +void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto blockSize = 128; auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; - kernel_fillHitDetIndices<<>>(&tracks_d->hitIndices, hv, &tracks_d->detIndices); + kernel_fillHitDetIndices<<>>( + &tracks_d->hitIndices, hv, &tracks_d->detIndices); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); #endif } - -template<> -void CAHitNtupletGeneratorKernelsGPU::launchKernels( - HitsOnCPU const &hh, - TkSoA * tracks_d, - cudaStream_t cudaStream) { - +template <> +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! 
- auto * tuples_d = &tracks_d->hitIndices; - auto * quality_d = (Quality*)(&tracks_d->m_quality); + auto *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); // zero tuples - cudautils::launchZero(tuples_d, cudaStream); + cudautils::launchZero(tuples_d, cudaStream); auto nhits = hh.nHits(); assert(nhits <= pixelGPUConstants::maxNumberOfHits); @@ -40,10 +36,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels( auto nthTot = 64; auto stride = 4; auto blockSize = nthTot / stride; - auto numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; + auto numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; auto rescale = numberOfBlocks / 65536; blockSize *= (rescale + 1); - numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; assert(numberOfBlocks < 65536); assert(blockSize > 0 && 0 == blockSize % 16); dim3 blks(1, numberOfBlocks, 1); @@ -65,7 +61,6 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels( m_params.dcaCutOuterTriplet_); cudaCheck(cudaGetLastError()); - if (nhits > 1 && m_params.earlyFishbone_) { auto nthTot = 128; auto stride = 16; @@ -78,9 +73,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels( cudaCheck(cudaGetLastError()); } - blockSize = 64; - numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; kernel_find_ntuplets<<>>(hh.view(), device_theCells_.get(), device_nCells_, @@ -92,32 +86,31 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels( cudaCheck(cudaGetLastError()); if (m_params.doStats_) - kernel_mark_used<<>>(hh.view(), - device_theCells_.get(), - device_nCells_); + kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); cudaCheck(cudaGetLastError()); - + #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - blockSize = 128; numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; kernel_earlyDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, quality_d); cudaCheck(cudaGetLastError()); blockSize = 128; - numberOfBlocks = (3*CAConstants::maxTuples()/4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>(tuples_d, quality_d, device_tupleMultiplicity_.get()); + numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; + kernel_countMultiplicity<<>>( + tuples_d, quality_d, device_tupleMultiplicity_.get()); cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - kernel_fillMultiplicity<<>>(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_fillMultiplicity<<>>( + tuples_d, quality_d, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); if (nhits > 1 && m_params.lateFishbone_) { @@ -151,11 +144,9 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels( cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - } - -template<> +template <> void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) { auto nhits = hh.nHits(); @@ -169,12 +160,12 @@ void 
CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s #endif // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cudautils::make_device_unique(std::max(1U,nhits), stream); + device_isOuterHitOfCell_ = cudautils::make_device_unique(std::max(1U, nhits), stream); assert(device_isOuterHitOfCell_.get()); { int threadsPerBlock = 128; // at least one block! - int blocks = ( std::max(1U,nhits) + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, device_theCellNeighbors_, @@ -195,13 +186,14 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s return; // protect against empty events // FIXME avoid magic numbers - auto nActualPairs=gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) nActualPairs = 15; - if (m_params.minHitsPerNtuplet_>3) { + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (m_params.minHitsPerNtuplet_ > 3) { nActualPairs = 13; } - assert(nActualPairs<=gpuPixelDoublets::nPairs); + assert(nActualPairs <= gpuPixelDoublets::nPairs); int stride = 1; int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock; @@ -225,54 +217,47 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - } - -template<> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, - TkSoA * tracks_d, - cudaStream_t cudaStream) { +template <> +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! 
- auto const * tuples_d = &tracks_d->hitIndices; - auto * quality_d = (Quality*)(&tracks_d->m_quality); + auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); auto blockSize = 64; // classify tracks based on kinematics - auto numberOfBlocks = (3*CAConstants::maxNumberOfQuadruplets()/4 + blockSize - 1) / blockSize; - kernel_classifyTracks<<>>( - tuples_d, tracks_d, m_params.cuts_, quality_d); + auto numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + kernel_classifyTracks<<>>(tuples_d, tracks_d, m_params.cuts_, quality_d); cudaCheck(cudaGetLastError()); if (m_params.lateFishbone_) { // apply fishbone cleaning to good tracks - numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; kernel_fishboneCleaner<<>>( device_theCells_.get(), device_nCells_, quality_d); cudaCheck(cudaGetLastError()); } // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3*m_params.maxNumberOfDoublets_/4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; kernel_fastDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, tracks_d); cudaCheck(cudaGetLastError()); - - if (m_params.minHitsPerNtuplet_<4 || m_params.doStats_) { + if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { // fill hit->track "map" - numberOfBlocks = (3*CAConstants::maxNumberOfQuadruplets()/4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); } - if (m_params.minHitsPerNtuplet_<4) { + if (m_params.minHitsPerNtuplet_ < 4) { // remove duplicates (tracks that share a hit) numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; kernel_tripletCleaner<<>>( @@ -280,32 +265,29 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, cudaCheck(cudaGetLastError()); } - if (m_params.doStats_) { // counters (add flag???) 
numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (3*CAConstants::maxNumberOfQuadruplets()/4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); #endif -#ifdef DUMP_GPU_TK_TUPLES +#ifdef DUMP_GPU_TK_TUPLES static std::atomic iev(0); ++iev; - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100,iev); + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); #endif - } -template<> -void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const * counters) { - kernel_printCounters<<<1, 1>>>(counters); +template <> +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { + kernel_printCounters<<<1, 1>>>(counters); } - - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index a3a1f45213576..2cbea06e66c55 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -1,7 +1,6 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h - #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "GPUCACell.h" @@ -34,127 +33,120 @@ namespace cAHitNtupletGenerator { using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; - struct QualityCuts { // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) float chi2Coeff[4]; - float chi2MaxPt; // GeV + float chi2MaxPt; // GeV float chi2Scale; struct region { - float maxTip; // cm - float minPt; // GeV - float maxZip; // cm + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm }; region triplet; region quadruplet; }; - // params struct Params { - Params(bool onGPU, + Params(bool onGPU, uint32_t minHitsPerNtuplet, uint32_t maxNumberOfDoublets, - bool useRiemannFit, - bool fit5as4, - bool includeJumpingForwardDoublets, - bool earlyFishbone, - bool lateFishbone, - bool idealConditions, - bool doStats, - bool doClusterCut, - bool doZCut, - bool doPhiCut, - float ptmin, - float CAThetaCutBarrel, - float CAThetaCutForward, - float hardCurvCut, - float dcaCutInnerTriplet, - float dcaCutOuterTriplet, - QualityCuts const& cuts) - : onGPU_(onGPU), - minHitsPerNtuplet_(minHitsPerNtuplet), - maxNumberOfDoublets_(maxNumberOfDoublets), - useRiemannFit_(useRiemannFit), - fit5as4_(fit5as4), - includeJumpingForwardDoublets_(includeJumpingForwardDoublets), - earlyFishbone_(earlyFishbone), - lateFishbone_(lateFishbone), - idealConditions_(idealConditions), - doStats_(doStats), - doClusterCut_(doClusterCut), - doZCut_(doZCut), - doPhiCut_(doPhiCut), - ptmin_(ptmin), - CAThetaCutBarrel_(CAThetaCutBarrel), - CAThetaCutForward_(CAThetaCutForward), - hardCurvCut_(hardCurvCut), - dcaCutInnerTriplet_(dcaCutInnerTriplet), - 
dcaCutOuterTriplet_(dcaCutOuterTriplet), - cuts_(cuts) { } - - const bool onGPU_; - const uint32_t minHitsPerNtuplet_; - const uint32_t maxNumberOfDoublets_; - const bool useRiemannFit_; - const bool fit5as4_; - const bool includeJumpingForwardDoublets_; - const bool earlyFishbone_; - const bool lateFishbone_; - const bool idealConditions_; - const bool doStats_; - const bool doClusterCut_; - const bool doZCut_; - const bool doPhiCut_; - const float ptmin_; - const float CAThetaCutBarrel_; - const float CAThetaCutForward_; - const float hardCurvCut_; - const float dcaCutInnerTriplet_; - const float dcaCutOuterTriplet_; - - // quality cuts - QualityCuts cuts_ - { - // polynomial coefficients for the pT-dependent chi2 cut - { 0.68177776, 0.74609577, -0.08035491, 0.00315399 }, - // max pT used to determine the chi2 cut - 10., - // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit - 30., - // regional cuts for triplets - { - 0.3, // |Tip| < 0.3 cm - 0.5, // pT > 0.5 GeV - 12.0 // |Zip| < 12.0 cm - }, - // regional cuts for quadruplets - { - 0.5, // |Tip| < 0.5 cm - 0.3, // pT > 0.3 GeV - 12.0 // |Zip| < 12.0 cm - } - }; - - }; // Params - -} - -template + bool useRiemannFit, + bool fit5as4, + bool includeJumpingForwardDoublets, + bool earlyFishbone, + bool lateFishbone, + bool idealConditions, + bool doStats, + bool doClusterCut, + bool doZCut, + bool doPhiCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float hardCurvCut, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet, + QualityCuts const& cuts) + : onGPU_(onGPU), + minHitsPerNtuplet_(minHitsPerNtuplet), + maxNumberOfDoublets_(maxNumberOfDoublets), + useRiemannFit_(useRiemannFit), + fit5as4_(fit5as4), + includeJumpingForwardDoublets_(includeJumpingForwardDoublets), + earlyFishbone_(earlyFishbone), + lateFishbone_(lateFishbone), + idealConditions_(idealConditions), + doStats_(doStats), + doClusterCut_(doClusterCut), + doZCut_(doZCut), + doPhiCut_(doPhiCut), + ptmin_(ptmin), + CAThetaCutBarrel_(CAThetaCutBarrel), + CAThetaCutForward_(CAThetaCutForward), + hardCurvCut_(hardCurvCut), + dcaCutInnerTriplet_(dcaCutInnerTriplet), + dcaCutOuterTriplet_(dcaCutOuterTriplet), + cuts_(cuts) {} + + const bool onGPU_; + const uint32_t minHitsPerNtuplet_; + const uint32_t maxNumberOfDoublets_; + const bool useRiemannFit_; + const bool fit5as4_; + const bool includeJumpingForwardDoublets_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool idealConditions_; + const bool doStats_; + const bool doClusterCut_; + const bool doZCut_; + const bool doPhiCut_; + const float ptmin_; + const float CAThetaCutBarrel_; + const float CAThetaCutForward_; + const float hardCurvCut_; + const float dcaCutInnerTriplet_; + const float dcaCutOuterTriplet_; + + // quality cuts + QualityCuts cuts_{// polynomial coefficients for the pT-dependent chi2 cut + {0.68177776, 0.74609577, -0.08035491, 0.00315399}, + // max pT used to determine the chi2 cut + 10., + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + 30., + // regional cuts for triplets + { + 0.3, // |Tip| < 0.3 cm + 0.5, // pT > 0.5 GeV + 12.0 // |Zip| < 12.0 cm + }, + // regional cuts for quadruplets + { + 0.5, // |Tip| < 0.5 cm + 0.3, // pT > 0.3 GeV + 12.0 // |Zip| < 12.0 cm + }}; + + }; // Params + +} // namespace cAHitNtupletGenerator + +template class CAHitNtupletGeneratorKernels { public: - using Traits = TTraits; using QualityCuts = cAHitNtupletGenerator::QualityCuts; using Params = cAHitNtupletGenerator::Params; using Counters = 
cAHitNtupletGenerator::Counters; - template - using unique_ptr = typename Traits:: template unique_ptr; - + template + using unique_ptr = typename Traits::template unique_ptr; using HitsView = TrackingRecHit2DSOAView; using HitsOnGPU = TrackingRecHit2DSOAView; @@ -167,29 +159,25 @@ class CAHitNtupletGeneratorKernels { using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; - - - CAHitNtupletGeneratorKernels(Params const & params) : m_params(params){} + CAHitNtupletGeneratorKernels(Params const& params) : m_params(params) {} ~CAHitNtupletGeneratorKernels() = default; TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } - void launchKernels(HitsOnCPU const& hh, TkSoA * tuples_d, cudaStream_t cudaStream); + void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA * tuples_d, cudaStream_t cudaStream); + void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void fillHitDetIndices(HitsView const * hv, TkSoA * tuples_d, cudaStream_t cudaStream); + void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream); void allocateOnGPU(cuda::stream_t<>& stream); void cleanup(cudaStream_t cudaStream); - static void printCounters(Counters const * counters); + static void printCounters(Counters const* counters); Counters* counters_ = nullptr; - private: - // workspace CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr; unique_ptr device_theCellNeighborsContainer_; @@ -206,12 +194,12 @@ class CAHitNtupletGeneratorKernels { AtomicPairCounter* device_hitTuple_apc_ = nullptr; unique_ptr device_tupleMultiplicity_; - - uint8_t * device_tmws_; + + uint8_t* device_tmws_; unique_ptr device_storage_; // params - Params const & m_params; + Params const& m_params; }; using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index f16a850b2e70a..42a89a13ff78e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -2,8 +2,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -template<> +template <> #ifdef __CUDACC__ void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cuda::stream_t<>& stream) { #else @@ -20,31 +19,33 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) { cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); */ - device_hitToTuple_ = Traits:: template make_unique(stream); + device_hitToTuple_ = Traits::template make_unique(stream); + + device_tupleMultiplicity_ = Traits::template make_unique(stream); - device_tupleMultiplicity_ = Traits:: template make_unique(stream); + auto storageSize = + 3 + (std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) + sizeof(AtomicPairCounter::c_type)) / + sizeof(AtomicPairCounter::c_type); - auto storageSize = 3+(std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize())+sizeof(AtomicPairCounter::c_type))/sizeof(AtomicPairCounter::c_type); + device_storage_ = Traits::template make_unique(storageSize, stream); - device_storage_ = Traits:: template make_unique(storageSize,stream); - device_hitTuple_apc_ = 
(AtomicPairCounter*)device_storage_.get(); - device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get()+1; - device_nCells_ = (uint32_t *)(device_storage_.get()+2); - device_tmws_ = (uint8_t*)(device_storage_.get()+3); + device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get() + 1; + device_nCells_ = (uint32_t*)(device_storage_.get() + 2); + device_tmws_ = (uint8_t*)(device_storage_.get() + 3); - assert(device_tmws_+std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= (uint8_t*)(device_storage_.get()+storageSize)); + assert(device_tmws_ + std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= + (uint8_t*)(device_storage_.get() + storageSize)); if #ifndef __CUDACC__ - constexpr + constexpr #endif - (std::is_same::value) { + (std::is_same::value) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream.id())); - }else { - *device_nCells_ = 0; - } + } else { + *device_nCells_ = 0; + } cudautils::launchZero(device_tupleMultiplicity_.get(), stream.id()); cudautils::launchZero(device_hitToTuple_.get(), stream.id()); // we may wish to keep it in the edm... } - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 1939eae6b86d9..c180ca25bfa61 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -21,18 +21,18 @@ using namespace gpuPixelDoublets; - using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DCUDA; +using HitsOnGPU = TrackingRecHit2DSOAView; +using HitsOnCPU = TrackingRecHit2DCUDA; - using HitToTuple = CAConstants::HitToTuple; - using TupleMultiplicity = CAConstants::TupleMultiplicity; +using HitToTuple = CAConstants::HitToTuple; +using TupleMultiplicity = CAConstants::TupleMultiplicity; - using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; - using HitContainer = pixelTrack::HitContainer; +using Quality = pixelTrack::Quality; +using TkSoA = pixelTrack::TrackSoA; +using HitContainer = pixelTrack::HitContainer; -__global__ void kernel_checkOverflows(HitContainer const * foundNtuplets, - CAConstants::TupleMultiplicity * tupleMultiplicity, +__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, + CAConstants::TupleMultiplicity *tupleMultiplicity, AtomicPairCounter *apc, GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, @@ -42,7 +42,6 @@ __global__ void kernel_checkOverflows(HitContainer const * foundNtuplets, uint32_t nHits, uint32_t maxNumberOfDoublets, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { - auto first = threadIdx.x + blockIdx.x * blockDim.x; auto &c = *counters; @@ -52,7 +51,7 @@ __global__ void kernel_checkOverflows(HitContainer const * foundNtuplets, atomicAdd(&c.nHits, nHits); atomicAdd(&c.nCells, *nCells); atomicAdd(&c.nTuples, apc->get().m); - atomicAdd(&c.nFitTracks,tupleMultiplicity->size()); + atomicAdd(&c.nFitTracks, tupleMultiplicity->size()); } #ifdef NTUPLE_DEBUG @@ -68,7 +67,7 @@ __global__ void kernel_checkOverflows(HitContainer const * foundNtuplets, } } - for (int idx = first, nt = foundNtuplets->nbins(); idxnbins(); idx < nt; idx += gridDim.x * blockDim.x) { if (foundNtuplets->size(idx) > 5) printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); assert(foundNtuplets->size(idx) < 6); @@ -84,56 +83,55 @@ __global__ void kernel_checkOverflows(HitContainer const * foundNtuplets, 
printf("Cells overflow\n"); } - for (int idx = first, nt = (*nCells); idx= 0) continue; + if (thisCell.theDoubletId >= 0) + continue; for (auto it : thisCell.tracks()) - quality[it] = bad; + quality[it] = bad; } } __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, - uint32_t const *__restrict__ nCells, - HitContainer *foundNtuplets, - Quality *quality) { + uint32_t const *__restrict__ nCells, + HitContainer *foundNtuplets, + Quality *quality) { // constexpr auto bad = trackQuality::bad; constexpr auto dup = trackQuality::dup; // constexpr auto loose = trackQuality::loose; assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idxsize(it) != maxNh) quality[it] = dup; //no race: simple assignment of the same constant } - } } -__global__ void kernel_fastDuplicateRemover(GPUCACell const * __restrict__ cells, +__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, - HitContainer const * __restrict__ foundNtuplets, - TkSoA * __restrict__ tracks) { + HitContainer const *__restrict__ foundNtuplets, + TkSoA *__restrict__ tracks) { constexpr auto bad = trackQuality::bad; constexpr auto dup = trackQuality::dup; constexpr auto loose = trackQuality::loose; @@ -164,9 +161,10 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const * __restrict__ cells assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idxquality(it) != bad && it != im) - tracks->quality(it) = dup; //no race: simple assignment of the same constant + tracks->quality(it) = dup; //no race: simple assignment of the same constant } } } @@ -216,9 +214,9 @@ __global__ void kernel_connect(AtomicPairCounter *apc1, (*apc2) = 0; } // ready for next kernel - for (int idx = firstCellIndex, nt = (*nCells); idx1) // continue; auto innerHitId = thisCell.get_inner_hit_id(); @@ -236,32 +234,33 @@ __global__ void kernel_connect(AtomicPairCounter *apc1, for (int j = first; j < numberOfPossibleNeighbors; j += stride) { auto otherCell = __ldg(vi + j); - auto & oc = cells[otherCell]; + auto &oc = cells[otherCell]; // if (cells[otherCell].theDoubletId < 0 || // cells[otherCell].theUsed>1 ) // continue; auto r1 = oc.get_inner_r(hh); auto z1 = oc.get_inner_z(hh); // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; - bool aligned = GPUCACell::areAlignedRZ(r1, - z1, - ri, - zi, - ro, - zo, - ptmin, - isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - if(aligned && - thisCell.dcaCut(hh,oc, - oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, - hardCurvCut) - ) { // FIXME tune cuts + bool aligned = GPUCACell::areAlignedRZ( + r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && + thisCell.dcaCut(hh, + oc, + oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? 
dcaCutInnerTriplet : dcaCutOuterTriplet, + hardCurvCut)) { // FIXME tune cuts oc.addOuterNeighbor(cellIndex, *cellNeighbors); thisCell.theUsed |= 1; oc.theUsed |= 1; } - } // loop on inner cells - } // loop on outer cells + } // loop on inner cells + } // loop on outer cells } __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, @@ -270,100 +269,96 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, CellTracksVector *cellTracks, HitContainer *foundNtuplets, AtomicPairCounter *apc, - Quality * __restrict__ quality, + Quality *__restrict__ quality, unsigned int minHitsPerNtuplet) { // recursive: not obvious to widen auto const &hh = *hhp; auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idx3 ? pid<3 : pid <8 || pid > 12; + auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12; if (doit) { GPUCACell::TmpTuple stack; stack.reset(); - thisCell.find_ntuplets(hh, - cells, - *cellTracks, - *foundNtuplets, - *apc, - quality, - stack, - minHitsPerNtuplet, - pid<3); + thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); assert(stack.size() == 0); // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); } } } - __global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *__restrict__ cells, - uint32_t const *nCells) { - + GPUCACell *__restrict__ cells, + uint32_t const *nCells) { // auto const &hh = *hhp; auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idxnbins(); itnbins(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); - if (nhits < 3) continue; - if (quality[it] == trackQuality::dup) continue; + if (nhits < 3) + continue; + if (quality[it] == trackQuality::dup) + continue; assert(quality[it] == trackQuality::bad); - if (nhits>5) printf("wrong mult %d %d\n",it,nhits); - assert(nhits<8); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + assert(nhits < 8); tupleMultiplicity->countDirect(nhits); } } - __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const * __restrict__ quality, + Quality const *__restrict__ quality, CAConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nbins(); itnbins(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); - if (nhits < 3) continue; - if (quality[it] == trackQuality::dup) continue; + if (nhits < 3) + continue; + if (quality[it] == trackQuality::dup) + continue; assert(quality[it] == trackQuality::bad); - if (nhits>5) printf("wrong mult %d %d\n",it,nhits); - assert(nhits<8); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + assert(nhits < 8); tupleMultiplicity->fillDirect(nhits, it); } } - __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const * __restrict__ tracks, + TkSoA const *__restrict__ tracks, CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, Quality *__restrict__ quality) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nbins(); itnbins(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = tuples->size(it); - if (nhits == 0) break; // guard + if (nhits == 0) + break; // guard // if duplicate: not even fit - if (quality[it] == trackQuality::dup) continue; + if (quality[it] == trackQuality::dup) + continue; 
assert(quality[it] == trackQuality::bad); // mark doublets as bad - if (nhits < 3) continue; + if (nhits < 3) + continue; // if the fit has any invalid parameters, mark it as bad bool isNaN = false; @@ -372,11 +367,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, } if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", - it, - tuples->size(it), - tracks->chi2(it) - ); + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); #endif continue; } @@ -389,17 +380,16 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, // (see CAHitNtupletGeneratorGPU.cc) float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); float chi2Cut = cuts.chi2Scale * - (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); + (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) - if (3.f*tracks->chi2(it) >= chi2Cut) { + if (3.f * tracks->chi2(it) >= chi2Cut) { #ifdef NTUPLE_DEBUG printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", - it, - tuples->size(it), - tracks->pt(it), - tracks->eta(it), - 3.f*tracks->chi2(it) - ); + it, + tuples->size(it), + tracks->pt(it), + tracks->eta(it), + 3.f * tracks->chi2(it)); #endif continue; } @@ -411,9 +401,10 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, // (see CAHitNtupletGeneratorGPU.cc) auto const ®ion = (nhits > 3) ? cuts.quadruplet : cuts.triplet; bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); + (std::abs(tracks->zip(it)) < region.maxZip); - if (isOk) quality[it] = trackQuality::loose; + if (isOk) + quality[it] = trackQuality::loose; } } @@ -422,20 +413,21 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) break; //guard + if (tuples->size(idx) == 0) + break; //guard if (quality[idx] != trackQuality::loose) continue; atomicAdd(&(counters->nGoodTracks), 1); } } - __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) break; // guard + if (tuples->size(idx) == 0) + break; // guard if (quality[idx] != trackQuality::loose) continue; for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) @@ -448,7 +440,8 @@ __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) break; // guard + if (tuples->size(idx) == 0) + break; // guard if (quality[idx] != trackQuality::loose) continue; for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) @@ -459,38 +452,36 @@ __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ 
tuples, __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, TrackingRecHit2DSOAView const *__restrict__ hhp, HitContainer *__restrict__ hitDetIndices) { - int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) { hitDetIndices->off[idx] = tuples->off[idx]; } // fill hit indices - auto const & hh = *hhp; + auto const &hh = *hhp; auto nhits = hh.nHits(); for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->bins[idx]bins[idx] < nhits); hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); } } - __global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { auto &c = *counters; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple->size(idx) == 0) continue; // SHALL NOT BE break + if (hitToTuple->size(idx) == 0) + continue; // SHALL NOT BE break atomicAdd(&c.nUsedHits, 1); if (hitToTuple->size(idx) > 1) atomicAdd(&c.nDupHits, 1); } } - __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, HitContainer const *__restrict__ ptuples, - TkSoA const * __restrict__ ptracks, + TkSoA const *__restrict__ ptracks, Quality *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { constexpr auto bad = trackQuality::bad; @@ -499,7 +490,7 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ auto &hitToTuple = *phitToTuple; auto const &foundNtuplets = *ptuples; - auto const & tracks = *ptracks; + auto const &tracks = *ptracks; // auto const & hh = *hhp; // auto l1end = hh.hitsLayerStart_d[1]; @@ -546,19 +537,21 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ } __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const * __restrict__ ptracks, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, - uint32_t maxPrint, int iev) { - auto const & foundNtuplets = *ptuples; - auto const & tracks = *ptracks; + HitContainer const *__restrict__ ptuples, + TkSoA const *__restrict__ ptracks, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, + uint32_t maxPrint, + int iev) { + auto const &foundNtuplets = *ptuples; + auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(maxPrint, foundNtuplets.nbins()); i3 ? int(*(foundNtuplets.begin(i) + 3)):-1, - nh>4 ? int(*(foundNtuplets.begin(i) + 4)):-1 - ); + nh > 3 ? int(*(foundNtuplets.begin(i) + 3)) : -1, + nh > 4 ? 
int(*(foundNtuplets.begin(i) + 4)) : -1); } } __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) { auto const &c = *counters; printf( - "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nGoodTracks | nUsedHits | nDupHits | nKilledCells | " + "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nGoodTracks | nUsedHits | nDupHits | " + "nKilledCells | " "nEmptyCells | nZeroTrackCells ||\n"); printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", c.nEvents, @@ -608,5 +601,3 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun c.nEmptyCells / double(c.nCells), c.nZeroTrackCells / double(c.nCells)); } - - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index c0bea61537670..0114795db49d6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -28,60 +28,67 @@ namespace { cAHitNtupletGenerator::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) { auto coeff = pset.getParameter>("chi2Coeff"); if (coeff.size() != 4) { - throw edm::Exception(edm::errors::Configuration, "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 4 elements"); + throw edm::Exception(edm::errors::Configuration, + "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 4 elements"); } - return cAHitNtupletGenerator::QualityCuts { - // polynomial coefficients for the pT-dependent chi2 cut - { (float) coeff[0], (float) coeff[1], (float) coeff[2], (float) coeff[3] }, - // max pT used to determine the chi2 cut - (float) pset.getParameter("chi2MaxPt"), - // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit - (float) pset.getParameter("chi2Scale"), - // regional cuts for triplets - { - (float) pset.getParameter("tripletMaxTip"), - (float) pset.getParameter("tripletMinPt"), - (float) pset.getParameter("tripletMaxZip") - }, - // regional cuts for quadruplets - { - (float) pset.getParameter("quadrupletMaxTip"), - (float) pset.getParameter("quadrupletMinPt"), - (float) pset.getParameter("quadrupletMaxZip") - } - }; + return cAHitNtupletGenerator::QualityCuts{// polynomial coefficients for the pT-dependent chi2 cut + {(float)coeff[0], (float)coeff[1], (float)coeff[2], (float)coeff[3]}, + // max pT used to determine the chi2 cut + (float)pset.getParameter("chi2MaxPt"), + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + (float)pset.getParameter("chi2Scale"), + // regional cuts for triplets + {(float)pset.getParameter("tripletMaxTip"), + (float)pset.getParameter("tripletMinPt"), + (float)pset.getParameter("tripletMaxZip")}, + // regional cuts for quadruplets + {(float)pset.getParameter("quadrupletMaxTip"), + (float)pset.getParameter("quadrupletMinPt"), + (float)pset.getParameter("quadrupletMaxZip")}}; } } // namespace using namespace std; -CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet &cfg, edm::ConsumesCollector &iC) +CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC) : m_params(cfg.getParameter("onGPU"), - cfg.getParameter("minHitsPerNtuplet"), - cfg.getParameter("maxNumberOfDoublets"), - cfg.getParameter("useRiemannFit"), - cfg.getParameter("fit5as4"), - cfg.getParameter("includeJumpingForwardDoublets"), - cfg.getParameter("earlyFishbone"), - 
cfg.getParameter("lateFishbone"), - cfg.getParameter("idealConditions"), - cfg.getParameter("fillStatistics"), - cfg.getParameter("doClusterCut"), - cfg.getParameter("doZCut"), - cfg.getParameter("doPhiCut"), - cfg.getParameter("ptmin"), - cfg.getParameter("CAThetaCutBarrel"), - cfg.getParameter("CAThetaCutForward"), - cfg.getParameter("hardCurvCut"), - cfg.getParameter("dcaCutInnerTriplet"), - cfg.getParameter("dcaCutOuterTriplet"), - makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) { - -#ifdef DUMP_GPU_TK_TUPLES - printf("TK: %s %s % %s %s %s %s %s %s %s %s %s %s %s %s %s\n", - "tid", "qual", "nh","charge","pt","eta","phi","tip","zip","chi2", - "h1","h2","h3","h4","h5"); + cfg.getParameter("minHitsPerNtuplet"), + cfg.getParameter("maxNumberOfDoublets"), + cfg.getParameter("useRiemannFit"), + cfg.getParameter("fit5as4"), + cfg.getParameter("includeJumpingForwardDoublets"), + cfg.getParameter("earlyFishbone"), + cfg.getParameter("lateFishbone"), + cfg.getParameter("idealConditions"), + cfg.getParameter("fillStatistics"), + cfg.getParameter("doClusterCut"), + cfg.getParameter("doZCut"), + cfg.getParameter("doPhiCut"), + cfg.getParameter("ptmin"), + cfg.getParameter("CAThetaCutBarrel"), + cfg.getParameter("CAThetaCutForward"), + cfg.getParameter("hardCurvCut"), + cfg.getParameter("dcaCutInnerTriplet"), + cfg.getParameter("dcaCutOuterTriplet"), + makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) { +#ifdef DUMP_GPU_TK_TUPLES + printf("TK: %s %s % %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "tid", + "qual", + "nh", + "charge", + "pt", + "eta", + "phi", + "tip", + "zip", + "chi2", + "h1", + "h2", + "h3", + "h4", + "h5"); #endif if (m_params.onGPU_) { @@ -91,11 +98,10 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet & m_counters = new Counters(); memset(m_counters, 0, sizeof(Counters)); } - } -CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU(){ - if (m_params.doStats_) { +CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU() { + if (m_params.doStats_) { // crash on multi-gpu processes if (m_params.onGPU_) { CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); @@ -105,13 +111,12 @@ CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU(){ } if (m_params.onGPU_) { cudaFree(m_counters); - }else { + } else { delete m_counters; } } - -void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription &desc) { +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { // 87 cm/GeV = 1/(3.8T * 0.3) // take less than radius given by the hardPtCut and reject everything below // auto hardCurvCut = 1.f/(0.35 * 87.f); @@ -136,9 +141,12 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription & edm::ParameterSetDescription trackQualityCuts; trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); - trackQualityCuts.add>("chi2Coeff", { 0.68177776, 0.74609577, -0.08035491, 0.00315399 }) - ->setComment("Polynomial coefficients to derive the pT-dependent chi2 cut"); - trackQualityCuts.add("chi2Scale", 30.)->setComment("Factor to multiply the pT-dependent chi2 cut (currently: 30 for the broken line fit, 45 for the Riemann fit)"); + trackQualityCuts.add>("chi2Coeff", {0.68177776, 0.74609577, -0.08035491, 0.00315399}) + ->setComment("Polynomial coefficients to derive the pT-dependent chi2 cut"); + trackQualityCuts.add("chi2Scale", 30.) 
+ ->setComment( + "Factor to multiply the pT-dependent chi2 cut (currently: 30 for the broken line fit, 45 for the Riemann " + "fit)"); trackQualityCuts.add("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV"); trackQualityCuts.add("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm"); trackQualityCuts.add("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm"); @@ -146,20 +154,21 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription & trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); trackQualityCuts.add("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm"); desc.add("trackQualityCuts", trackQualityCuts) - ->setComment("Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply \"region cuts\" based on the fit results (pT, Tip, Zip)."); + ->setComment( + "Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply \"region " + "cuts\" based on the fit results (pT, Tip, Zip)."); } - PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, - float bfield, - cuda::stream_t<>& stream) const { + float bfield, + cuda::stream_t<>& stream) const { PixelTrackHeterogeneous tracks(cudautils::make_device_unique(stream)); - auto * soa = tracks.get(); - + auto* soa = tracks.get(); + CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield,m_params.fit5as4_); + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); kernels.allocateOnGPU(stream); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); @@ -177,14 +186,11 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH return tracks; } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, - float bfield) const { - - +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { PixelTrackHeterogeneous tracks(std::make_unique()); - auto dummyStream = cuda::stream::wrap(0,0,false); + auto dummyStream = cuda::stream::wrap(0, 0, false); - auto * soa = tracks.get(); + auto* soa = tracks.get(); assert(soa); CAHitNtupletGeneratorKernelsCPU kernels(m_params); @@ -195,21 +201,20 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC kernels.launchKernels(hits_d, soa, dummyStream.id()); kernels.fillHitDetIndices(hits_d.view(), soa, dummyStream.id()); // in principle needed only if Hits not "available" - if (0==hits_d.nHits()) return tracks; + if (0 == hits_d.nHits()) + return tracks; // now fit - HelixFitOnGPU fitter(bfield,m_params.fit5as4_); + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); - + if (m_params.useRiemannFit_) { fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); } else { fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); } - kernels.classifyTuples(hits_d, soa, dummyStream.id()); return tracks; - } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 67f75e21e2ef9..1cb8bb31fb0b6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -5,7 +5,6 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" - #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Utilities/interface/EDGetToken.h" @@ -34,7 +33,6 @@ class CAHitNtupletGeneratorOnGPU { using HitContainer = pixelTrack::HitContainer; using Tuple = HitContainer; - using QualityCuts = cAHitNtupletGenerator::QualityCuts; using Params = cAHitNtupletGenerator::Params; using Counters = cAHitNtupletGenerator::Counters; @@ -50,30 +48,21 @@ class CAHitNtupletGeneratorOnGPU { static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cuda::stream_t<>& stream) const; - - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, - float bfield) const; + float bfield, + cuda::stream_t<>& stream) const; + PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; - private: - void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream) const; - void hitNtuplets(HitsOnCPU const& hh, - const edm::EventSetup& es, - bool useRiemannFit, - cuda::stream_t<>& cudaStream); + void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cuda::stream_t<>& cudaStream); void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cuda::stream_t<>& cudaStream) const; - Params m_params; - Counters * m_counters = nullptr; - + Counters* m_counters = nullptr; }; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index ed4404345f777..6527c9f2bfbea 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -5,7 +5,6 @@ // Author: Felice Pantaleo, CERN // - // #define ONLY_TRIPLETS_IN_HOLE #include @@ -80,10 +79,10 @@ class GPUCACell { __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } __device__ __forceinline__ float get_outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } __device__ __forceinline__ float get_inner_z(Hits const& hh) const { return theInnerZ; } - // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } + // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } __device__ __forceinline__ float get_outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } __device__ __forceinline__ float get_inner_r(Hits const& hh) const { return theInnerR; } - // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } + // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } __device__ __forceinline__ float get_outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } __device__ __forceinline__ auto get_inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } @@ -96,12 +95,11 @@ class GPUCACell { constexpr unsigned int get_outer_hit_id() const { return theOuterHitId; } __device__ void print_cell() const { - printf( - "printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d \n", - theDoubletId, - theLayerPairId, - theInnerHitId, - theOuterHitId); + printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d 
\n", + theDoubletId, + theLayerPairId, + theInnerHitId, + theOuterHitId); } __device__ bool check_alignment(Hits const& hh, @@ -174,11 +172,14 @@ class GPUCACell { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - - __device__ __forceinline__ static bool dcaCutH(float x1,float y1, float x2,float y2, float x3,float y3, - const float region_origin_radius_plus_tolerance, - const float maxCurv) { - + __device__ __forceinline__ static bool dcaCutH(float x1, + float y1, + float x2, + float y2, + float x3, + float y3, + const float region_origin_radius_plus_tolerance, + const float maxCurv) { CircleEq eq(x1, y1, x2, y2, x3, y3); if (eq.curvature() > maxCurv) @@ -187,32 +188,29 @@ class GPUCACell { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - - __device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const { constexpr uint32_t max_ladder_bpx0 = 12; constexpr uint32_t first_ladder_bpx0 = 0; constexpr float module_length = 6.7f; - constexpr float module_tolerance = 0.4f; // projection to cylinder is inaccurate on BPIX1 + constexpr float module_tolerance = 0.4f; // projection to cylinder is inaccurate on BPIX1 int p = innerCell.get_inner_iphi(hh); if (p < 0) p += std::numeric_limits::max(); p = (max_ladder_bpx0 * p) / std::numeric_limits::max(); p %= max_ladder_bpx0; - auto il = first_ladder_bpx0+p; + auto il = first_ladder_bpx0 + p; auto r0 = hh.averageGeometry().ladderR[il]; auto ri = innerCell.get_inner_r(hh); auto zi = innerCell.get_inner_z(hh); auto ro = get_outer_r(hh); auto zo = get_outer_z(hh); auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri); - auto z_in_ladder = std::abs(z0-hh.averageGeometry().ladderZ[il]); + auto z_in_ladder = std::abs(z0 - hh.averageGeometry().ladderZ[il]); auto z_in_module = z_in_ladder - module_length * int(z_in_ladder / module_length); auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); return gap; } - __device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const { constexpr uint32_t max_ladder_bpx4 = 64; constexpr uint32_t first_ladder_bpx4 = 84; @@ -227,14 +225,14 @@ class GPUCACell { p += std::numeric_limits::max(); p = (max_ladder_bpx4 * p) / std::numeric_limits::max(); p %= max_ladder_bpx4; - auto il = first_ladder_bpx4+p; + auto il = first_ladder_bpx4 + p; auto r4 = hh.averageGeometry().ladderR[il]; auto ri = innerCell.get_inner_r(hh); auto zi = innerCell.get_inner_z(hh); auto ro = get_outer_r(hh); auto zo = get_outer_z(hh); auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri); - auto z_in_ladder = std::abs(z4-hh.averageGeometry().ladderZ[il]); + auto z_in_ladder = std::abs(z4 - hh.averageGeometry().ladderZ[il]); auto z_in_module = z_in_ladder - module_length * int(z_in_ladder / module_length); auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0]; @@ -247,7 +245,7 @@ class GPUCACell { __device__ inline void find_ntuplets(Hits const& hh, GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, - HitContainer & foundNtuplets, + HitContainer& foundNtuplets, AtomicPairCounter& apc, Quality* __restrict__ quality, TmpTuple& tmpNtuplet, @@ -262,22 +260,21 @@ class GPUCACell { tmpNtuplet.push_back_unsafe(theDoubletId); assert(tmpNtuplet.size() <= 4); - bool last=true; + bool last = true; for (int j = 0; j < outerNeighbors().size(); ++j) { auto otherCell = 
outerNeighbors()[j]; - if (cells[otherCell].theDoubletId<0) continue; // killed by earlyFishbone - last = false; - cells[otherCell].find_ntuplets( - hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet,startAt0); + if (cells[otherCell].theDoubletId < 0) + continue; // killed by earlyFishbone + last = false; + cells[otherCell].find_ntuplets( + hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0); } - if(last) { // if long enough save... + if (last) { // if long enough save... if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { #ifdef ONLY_TRIPLETS_IN_HOLE // triplets accepted only pointing to the hole - if (tmpNtuplet.size() >= 3 || - ( startAt0&&hole4(hh, cells[tmpNtuplet[0]]) ) || - ( (!startAt0)&&hole0(hh, cells[tmpNtuplet[0]]) ) - ) + if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) || + ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) #endif { hindex_type hits[6]; @@ -290,7 +287,7 @@ class GPUCACell { if (it >= 0) { // if negative is overflow.... for (auto c : tmpNtuplet) cells[c].addTrack(it, cellTracks); - quality[it] = bad; // initialize to bad + quality[it] = bad; // initialize to bad } } } @@ -306,7 +303,7 @@ class GPUCACell { public: int32_t theDoubletId; int16_t theLayerPairId; - uint16_t theUsed; // tbd + uint16_t theUsed; // tbd private: float theInnerZ; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index c071cdd347808..becbd0a1a8540 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -2,8 +2,8 @@ #include "HelixFitOnGPU.h" void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, - TupleMultiplicity const *tupleMultiplicity, - OutputSoA *helix_fit_results) { + TupleMultiplicity const *tupleMultiplicity, + OutputSoA *helix_fit_results) { tuples_d = tuples; tupleMultiplicity_d = tupleMultiplicity; outputSoa_d = helix_fit_results; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 4f4fdbf7d8299..50f4d0580c2f4 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -45,27 +45,19 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } - void launchRiemannKernels(HitsView const * hv, + void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); - void launchBrokenLineKernels(HitsView const * hv, + void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); - void launchRiemannKernelsOnCPU(HitsView const * hv, - uint32_t nhits, - uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitsView const * hv, - uint32_t nhits, - uint32_t maxNumberOfTuples); - - + void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, - TupleMultiplicity const *tupleMultiplicity, - OutputSoA * outputSoA); + void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); void deallocateOnGPU(); private: @@ -74,7 +66,7 @@ class HelixFitOnGPU { // fowarded 
Tuples const *tuples_d = nullptr; TupleMultiplicity const *tupleMultiplicity_d = nullptr; - OutputSoA * outputSoa_d; + OutputSoA *outputSoa_d; float bField_; const bool fit5as4_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index 825df0cc182c7..3476362864a79 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,151 +1,110 @@ #include "RiemannFitOnGPU.h" -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const * hv, - uint32_t nhits, - uint32_t maxNumberOfTuples) { +void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { assert(tuples_d); - // Fit internals - auto hitsGPU_ = std::make_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - auto circle_fit_resultsGPU_holder = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + auto circle_fit_resultsGPU_holder = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernelFastFit<3>(tuples_d, - tupleMultiplicity_d, - 3, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - offset); - + kernelFastFit<3>( + tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); kernelCircleFit<3>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); kernelLineFit<3>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - + 3, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); // quads - kernelFastFit<4>(tuples_d, - tupleMultiplicity_d, - 4, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - offset); - + kernelFastFit<4>( + tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); kernelCircleFit<4>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); kernelLineFit<4>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - + 4, + bField_, + outputSoa_d, + 
hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); if (fit5as4_) { // penta - kernelFastFit<4>(tuples_d, - tupleMultiplicity_d, - 5, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - offset); - + kernelFastFit<4>( + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); kernelCircleFit<4>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); kernelLineFit<4>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + } else { // penta all 5 - kernelFastFit<5>(tuples_d, - tupleMultiplicity_d, - 5, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - offset); - + kernelFastFit<5>( + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); kernelCircleFit<5>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); kernelLineFit<5>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); } } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 042a9b4e6982d..690bce4edcf8f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -1,7 +1,7 @@ #include "RiemannFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchRiemannKernels(HitsView const * hv, +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cuda::stream_t<> &stream) { @@ -13,24 +13,18 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const * hv, // Fit internals auto hitsGPU_ = cudautils::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto hits_geGPU_ = cudautils::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = cudautils::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); auto circle_fit_resultsGPU_holder = cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); for 
(uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernelFastFit<3><<>>(tuples_d, - tupleMultiplicity_d, - 3, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - offset); + kernelFastFit<3><<>>( + tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); kernelCircleFit<3><<>>(tupleMultiplicity_d, @@ -55,100 +49,82 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const * hv, cudaCheck(cudaGetLastError()); // quads - kernelFastFit<4><<>>(tuples_d, - tupleMultiplicity_d, - 4, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelCircleFit<4><<>>(tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - if (fit5as4_) { - // penta - kernelFastFit<4><<>>(tuples_d, - tupleMultiplicity_d, - 5, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 5, + kernelLineFit<4><<>>(tupleMultiplicity_d, + 4, bField_, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), circle_fit_resultsGPU_, offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // penta + kernelFastFit<4><<>>( + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelCircleFit<4><<>>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernelLineFit<4><<>>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernelFastFit<5><<>>(tuples_d, - tupleMultiplicity_d, - 5, - hv, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - offset); + kernelFastFit<5><<>>( + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelCircleFit<5><<>>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - 
hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelLineFit<5><<>>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 3f4230085efe3..b46de519034d5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -45,10 +45,11 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); #endif - - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); @@ -66,7 +67,9 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, auto hit = hitId[i]; // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); float ge[6]; - hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + hhp->cpeParams() + .detParams(hhp->detectorIndex(hit)) + .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); @@ -98,9 +101,11 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; Rfit::Map3xNd hits(phits + local_idx); Rfit::Map4d fast_fit(pfast_fit_input + local_idx); @@ -139,11 +144,13 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ // look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; local_idx+=gridDim.x*blockDim.x) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) break; + for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; - // get it for the ntuple container (one to one to helix) + // get it for the ntuple container (one to one to helix) auto tkid = 
*(tupleMultiplicity->begin(nHits) + tuple_idx); Rfit::Map3xNd hits(phits + local_idx); @@ -154,29 +161,29 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ Rfit::fromCircleToPerigee(circle_fit[local_idx]); - results->stateAtBS.copyFromCircle(circle_fit[local_idx].par,circle_fit[local_idx].cov, - line_fit.par,line_fit.cov,1.f/float(B),tkid); - results->pt(tkid) = B/std::abs(circle_fit[local_idx].par(2)); - results->eta(tkid) = asinhf(line_fit.par(0)); - results->chi2(tkid) = (circle_fit[local_idx].chi2+line_fit.chi2)/(2*N-5); + results->stateAtBS.copyFromCircle( + circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(B), tkid); + results->pt(tkid) = B / std::abs(circle_fit[local_idx].par(2)); + results->eta(tkid) = asinhf(line_fit.par(0)); + results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); #ifdef RIEMANN_DEBUG - printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", - N, - nHits, - tkid, - circle_fit[local_idx].par(0), - circle_fit[local_idx].par(1), - circle_fit[local_idx].par(2)); - printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); - printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", - circle_fit[local_idx].chi2, - line_fit.chi2, - circle_fit[local_idx].cov(0, 0), - circle_fit[local_idx].cov(1, 1), - circle_fit[local_idx].cov(2, 2), - line_fit.cov(0, 0), - line_fit.cov(1, 1)); + printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle_fit[local_idx].par(0), + circle_fit[local_idx].par(1), + circle_fit[local_idx].par(2)); + printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); + printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle_fit[local_idx].chi2, + line_fit.chi2, + circle_fit[local_idx].cov(0, 0), + circle_fit[local_idx].cov(1, 1), + circle_fit[local_idx].cov(2, 2), + line_fit.cov(0, 0), + line_fit.cov(1, 1)); #endif } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index b4dff8c103d2d..f761bc3d811f1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -24,69 +24,70 @@ namespace gpuPixelDoublets { GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell, uint32_t nHits, bool checkTrack) { - constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; + constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; - auto const& hh = *hhp; - // auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id); }; + auto const& hh = *hhp; + // auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id); }; - // x run faster... - auto firstY = threadIdx.y + blockIdx.y * blockDim.y; - auto firstX = threadIdx.x; + // x run faster... 
+ auto firstY = threadIdx.y + blockIdx.y * blockDim.y; + auto firstX = threadIdx.x; - float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit]; - uint16_t d[maxCellsPerHit]; // uint8_t l[maxCellsPerHit]; - uint32_t cc[maxCellsPerHit]; + float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit]; + uint16_t d[maxCellsPerHit]; // uint8_t l[maxCellsPerHit]; + uint32_t cc[maxCellsPerHit]; - for (int idy = firstY, nt = nHits; idy= 0.99999f * n[ic] * n[jc]) { - // alligned: kill farthest (prefer consecutive layers) - if (n[ic] > n[jc]) { - ci.theDoubletId = -1; - break; - } else { - cj.theDoubletId = -1; + // if alligned kill one of the two. + // in principle one could try to relax the cut (only in r-z?) for jumping-doublets + auto const& c0 = cells[vc[0]]; + auto xo = c0.get_outer_x(hh); + auto yo = c0.get_outer_y(hh); + auto zo = c0.get_outer_z(hh); + auto sg = 0; + for (int32_t ic = 0; ic < s; ++ic) { + auto& ci = cells[vc[ic]]; + if (0 == ci.theUsed) + continue; // for triplets equivalent to next + if (checkTrack && ci.tracks().empty()) + continue; + cc[sg] = vc[ic]; + d[sg] = ci.get_inner_detIndex(hh); + // l[sg] = layer(d[sg]); + x[sg] = ci.get_inner_x(hh) - xo; + y[sg] = ci.get_inner_y(hh) - yo; + z[sg] = ci.get_inner_z(hh) - zo; + n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg]; + ++sg; + } + if (sg < 2) + continue; + // here we parallelize + for (int32_t ic = firstX; ic < sg - 1; ic += blockDim.x) { + auto& ci = cells[cc[ic]]; + for (auto jc = ic + 1; jc < sg; ++jc) { + auto& cj = cells[cc[jc]]; + // must be different detectors (in the same layer) + // if (d[ic]==d[jc]) continue; + // || l[ic]!=l[jc]) continue; + auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc]; + if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) { + // alligned: kill farthest (prefer consecutive layers) + if (n[ic] > n[jc]) { + ci.theDoubletId = -1; + break; + } else { + cj.theDoubletId = -1; + } } - } - } //cj - } // ci - } // hits - } + } //cj + } // ci + } // hits + } } // namespace gpuPixelDoublets #endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index f77ecb01cd416..dbe3e3f8c964e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -9,40 +9,56 @@ namespace gpuPixelDoublets { using namespace gpuPixelDoubletsAlgos; - constexpr int nPairs = 13+2+4; - static_assert(nPairs<=CAConstants::maxNumberOfLayerPairs()); + constexpr int nPairs = 13 + 2 + 4; + static_assert(nPairs <= CAConstants::maxNumberOfLayerPairs()); -// start constants -// clang-format off + // start constants + // clang-format off CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = { - 0,1, 0,4, 0,7, // BPIX1 (3) - 1,2, 1,4, 1,7, // BPIX2 (5) - 4,5, 7,8, // FPIX1 (8) - 2,3, 2,4, 2,7, 5,6, 8,9, // BPIX3 & FPIX2 (13) - 0,2, 1,3, // Jumping Barrel (15) - 0,5, 0,8, // Jumping Forward (BPIX1,FPIX2) - 4,6, 7,9 // Jumping Forward (19) + 0, 1, 0, 4, 0, 7, // BPIX1 (3) + 1, 2, 1, 4, 1, 7, // BPIX2 (5) + 4, 5, 7, 8, // FPIX1 (8) + 2, 3, 2, 4, 2, 7, 5, 6, 8, 9, // BPIX3 & FPIX2 (13) + 0, 2, 1, 3, // Jumping Barrel (15) + 0, 5, 0, 8, // Jumping Forward (BPIX1,FPIX2) + 4, 6, 7, 9 // Jumping Forward (19) }; constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); constexpr int16_t phi0p06 = 626; // round(625.82270...) 
= phi2short(0.06); constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); - CONSTANT_VAR const int16_t phicuts[nPairs]{ - phi0p05, phi0p07, phi0p07, - phi0p05, phi0p06, phi0p06, - phi0p05, phi0p05, - phi0p06, phi0p06, phi0p06, phi0p05, phi0p05, - phi0p05, phi0p05, phi0p05,phi0p05, phi0p05,phi0p05}; -// phi0p07, phi0p07, phi0p06,phi0p06, phi0p06,phi0p06}; // relaxed cuts - - CONSTANT_VAR float const minz[nPairs] = {-20.,0.,-30., -22.,10.,-30., -70.,-70., -22.,15.,-30, -70.,-70., -20.,-22., 0,-30., -70.,-70.}; - CONSTANT_VAR float const maxz[nPairs] = { 20.,30.,0., 22.,30.,-10., 70.,70., 22.,30.,-15., 70., 70., 20.,22., 30.,0., 70.,70.}; - CONSTANT_VAR float const maxr[nPairs] = {20.,9.,9., 20.,7.,7., 5.,5., 20.,6.,6., 5., 5., 20.,20.,9.,9., 9.,9.}; - -// end constants -// clang-format on + CONSTANT_VAR const int16_t phicuts[nPairs]{phi0p05, + phi0p07, + phi0p07, + phi0p05, + phi0p06, + phi0p06, + phi0p05, + phi0p05, + phi0p06, + phi0p06, + phi0p06, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05}; + // phi0p07, phi0p07, phi0p06,phi0p06, phi0p06,phi0p06}; // relaxed cuts + + CONSTANT_VAR float const minz[nPairs] = { + -20., 0., -30., -22., 10., -30., -70., -70., -22., 15., -30, -70., -70., -20., -22., 0, -30., -70., -70.}; + CONSTANT_VAR float const maxz[nPairs] = { + 20., 30., 0., 22., 30., -10., 70., 70., 22., 30., -15., 70., 70., 20., 22., 30., 0., 70., 70.}; + CONSTANT_VAR float const maxr[nPairs] = { + 20., 9., 9., 20., 7., 7., 5., 5., 20., 6., 6., 5., 5., 20., 20., 9., 9., 9., 9.}; + + // end constants + // clang-format on using CellNeighbors = CAConstants::CellNeighbors; using CellTracks = CAConstants::CellTracks; @@ -64,24 +80,22 @@ namespace gpuPixelDoublets { constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; - __global__ + __global__ #ifdef __CUDACC__ - __launch_bounds__( - getDoubletsFromHistoMaxBlockSize, - getDoubletsFromHistoMinBlocksPerMP) + __launch_bounds__(getDoubletsFromHistoMaxBlockSize, getDoubletsFromHistoMinBlocksPerMP) #endif - void getDoubletsFromHisto(GPUCACell* cells, - uint32_t* nCells, - CellNeighborsVector* cellNeighbors, - CellTracksVector* cellTracks, - TrackingRecHit2DSOAView const* __restrict__ hhp, - GPUCACell::OuterHitOfCell* isOuterHitOfCell, - int nActualPairs, - bool ideal_cond, - bool doClusterCut, - bool doZCut, - bool doPhiCut, - uint32_t maxNumOfDoublets) { + void getDoubletsFromHisto(GPUCACell* cells, + uint32_t* nCells, + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + TrackingRecHit2DSOAView const* __restrict__ hhp, + GPUCACell::OuterHitOfCell* isOuterHitOfCell, + int nActualPairs, + bool ideal_cond, + bool doClusterCut, + bool doZCut, + bool doPhiCut, + uint32_t maxNumOfDoublets) { auto const& __restrict__ hh = *hhp; doubletsFromHisto(layerPairs, nActualPairs, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 52b0cde5f7e91..be668dfbb2e04 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -46,7 +46,7 @@ namespace gpuPixelDoubletsAlgos { constexpr int maxDYsize12 = 28; constexpr int maxDYsize = 20; constexpr int maxDYPred = 20; - constexpr float dzdrFact = 8*0.0285/0.015; // from dz/dr to "DY" + constexpr float dzdrFact = 8 * 0.0285 / 0.015; // from dz/dr to "DY" bool isOuterLadder = 
ideal_cond; @@ -78,10 +78,11 @@ namespace gpuPixelDoubletsAlgos { auto idy = blockIdx.y * blockDim.y + threadIdx.y; auto first = threadIdx.x; auto stride = blockDim.x; - - uint32_t pairLayerId = 0; // cannot go backward + + uint32_t pairLayerId = 0; // cannot go backward for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y) { - while (j >= innerLayerCumulativeSize[pairLayerId++]); + while (j >= innerLayerCumulativeSize[pairLayerId++]) + ; --pairLayerId; // move to lower_bound ?? assert(pairLayerId < nPairs); @@ -104,14 +105,15 @@ namespace gpuPixelDoubletsAlgos { // found hit corresponding to our cuda thread, now do the job auto mi = hh.detectorIndex(i); - if (mi>2000) continue; // invalid + if (mi > 2000) + continue; // invalid auto mez = hh.zGlobal(i); if (doZCut && (mez < minz[pairLayerId] || mez > maxz[pairLayerId])) continue; - int16_t mes=-1; // make compiler happy + int16_t mes = -1; // make compiler happy if (doClusterCut) { // if ideal treat inner ladder as outer if (inner == 0) @@ -121,7 +123,7 @@ namespace gpuPixelDoubletsAlgos { // in any case we always test mes>0 ... mes = inner > 0 || isOuterLadder ? hh.clusterSizeY(i) : -1; - if (inner == 0 && outer > 3 ) // B1 and F1 + if (inner == 0 && outer > 3) // B1 and F1 if (mes > 0 && mes < minYsizeB1) continue; // only long cluster (5*8) if (inner == 1 && outer > 3) // B2 and F1 @@ -155,16 +157,15 @@ namespace gpuPixelDoubletsAlgos { auto zsizeCut = [&](int j) { auto onlyBarrel = outer < 4; auto so = hh.clusterSizeY(j); - auto dy = inner == 0 ? maxDYsize12 : maxDYsize; + auto dy = inner == 0 ? maxDYsize12 : maxDYsize; // in the barrel cut on difference in size // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well) // FIXME move pred cut to z0cutoff to optmize loading of and computaiton ... auto zo = hh.zGlobal(j); auto ro = hh.rGlobal(j); - return onlyBarrel ? - mes > 0 && so > 0 && std::abs(so - mes) > dy : - (inner<4) && mes>0 - && std::abs(mes - int(std::abs((mez-zo)/(mer-ro))*dzdrFact+0.5f)) > maxDYPred; + return onlyBarrel ? 
mes > 0 && so > 0 && std::abs(so - mes) > dy + : (inner < 4) && mes > 0 && + std::abs(mes - int(std::abs((mez - zo) / (mer - ro)) * dzdrFact + 0.5f)) > maxDYPred; }; auto iphicut = phicuts[pairLayerId]; @@ -194,14 +195,15 @@ namespace gpuPixelDoubletsAlgos { assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); auto mo = hh.detectorIndex(oi); - if (mo>2000) continue; // invalid + if (mo > 2000) + continue; // invalid auto mop = hh.iphi(oi); if (std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))) > iphicut) continue; if (doPhiCut) { if (doClusterCut && zsizeCut(oi)) - continue; - if (z0cutoff(oi) || ptcut(oi,mop)) + continue; + if (z0cutoff(oi) || ptcut(oi, mop)) continue; } auto ind = atomicAdd(nCells, 1); diff --git a/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp index cbbcea96d1ee8..504f9c144b284 100644 --- a/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp +++ b/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp @@ -1,99 +1,77 @@ #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" #include - struct OriCircle { + using T = float; - using T = float; - - float radius=0; - float x_center=0; - float y_center=0; + float radius = 0; + float x_center = 0; + float y_center = 0; - - constexpr OriCircle(T x1, T y1, - T x2, T y2, - T x3, T y3) { - compute(x1,y1,x2,y2,x3,y3); - } + constexpr OriCircle(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); } // dca to origin - constexpr T dca0() const { - return std::sqrt(x_center*x_center + y_center*y_center) - radius; - } + constexpr T dca0() const { return std::sqrt(x_center * x_center + y_center * y_center) - radius; } - // dca to given point + // dca to given point constexpr T dca(T x, T y) const { - x-=x_center; - y-=y_center; - return std::sqrt(x*x+y*y)-radius; + x -= x_center; + y -= y_center; + return std::sqrt(x * x + y * y) - radius; } + constexpr void compute(T x1, T y1, T x2, T y2, T x3, T y3) { + auto det = (x1 - x2) * (y2 - y3) - (x2 - x3) * (y1 - y2); - constexpr void compute(T x1, T y1, - T x2, T y2, - T x3, T y3) { - - auto det = (x1 - x2) * (y2 - y3) - (x2 - x3) * (y1 - y2); + auto offset = x2 * x2 + y2 * y2; - auto offset = x2 * x2 + y2 * y2; + auto bc = (x1 * x1 + y1 * y1 - offset) * 0.5f; - auto bc = (x1 * x1 + y1 * y1 - offset) * 0.5f; - - auto cd = (offset - x3 * x3 - y3 * y3) * 0.5f; + auto cd = (offset - x3 * x3 - y3 * y3) * 0.5f; - auto idet = 1.f / det; + auto idet = 1.f / det; - x_center = (bc * (y2 - y3) - cd * (y1 - y2)) * idet; - y_center = (cd * (x1 - x2) - bc * (x2 - x3)) * idet; + x_center = (bc * (y2 - y3) - cd * (y1 - y2)) * idet; + y_center = (cd * (x1 - x2) - bc * (x2 - x3)) * idet; - radius = std::sqrt((x2 - x_center) * (x2 - x_center) + - (y2 - y_center) * (y2 - y_center)); - - } + radius = std::sqrt((x2 - x_center) * (x2 - x_center) + (y2 - y_center) * (y2 - y_center)); + } }; +#include -#include - -template +template bool equal(T a, T b) { // return float(a-b)==0; - return std::abs(float(a-b)) < std::abs(0.01f*a); + return std::abs(float(a - b)) < std::abs(0.01f * a); } - - int main() { - - float r1=4, r2=8, r3=15; - for(float phi=-3; phi<3.1; phi+=0.5) { - float x1=r1*cos(phi); - float x2=r2*cos(phi); - float y1=r1*sin(phi); - float y2=r2*sin(phi); - for(float phi3=phi-0.31; phi3 eq(x1,y1,x2,y2,x3,y3); + float r1 = 4, r2 = 8, r3 = 15; + for (float phi = -3; phi < 3.1; phi += 0.5) { + float x1 = r1 * cos(phi); + float x2 = r2 * cos(phi); + float y1 = r1 * sin(phi); + float y2 = r2 * 
sin(phi); + for (float phi3 = phi - 0.31; phi3 < phi + 0.31; phi3 += 0.05) { + float x3 = r3 * cos(phi3); + float y3 = r3 * sin(phi3); + + OriCircle ori(x1, y1, x2, y2, x3, y3); + CircleEq eq(x1, y1, x2, y2, x3, y3); // std::cout << "r " << ori.radius <<' '<< eq.radius() << std::endl; - assert( equal(ori.radius, std::abs(eq.radius())) ); + assert(equal(ori.radius, std::abs(eq.radius()))); auto c = eq.center(); auto dir = eq.cosdir(); - assert (equal(1.f,dir.first*dir.first+dir.second*dir.second)); - assert( equal(ori.x_center,c.first) ); - assert( equal(ori.y_center,c.second) ); + assert(equal(1.f, dir.first * dir.first + dir.second * dir.second)); + assert(equal(ori.x_center, c.first)); + assert(equal(ori.y_center, c.second)); // std::cout << "dca " << ori.dca0() <<' '<< eq.radius()*eq.dca0() << std::endl; - assert( equal( std::abs(ori.dca0()), std::abs(eq.radius()*eq.dca0())) ); + assert(equal(std::abs(ori.dca0()), std::abs(eq.radius() * eq.dca0()))); // std::cout << "dca " << ori.dca(1.,1.) <<' '<< eq.radius()*eq.dca(1.,1.) << std::endl; - assert( equal( std::abs(ori.dca(1.,1.)), std::abs(eq.radius()*eq.dca(1.,1.))) ); - + assert(equal(std::abs(ori.dca(1., 1.)), std::abs(eq.radius() * eq.dca(1., 1.)))); } } - - return 0; } diff --git a/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp b/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp index 58c7f832627fb..8538970a196ff 100644 --- a/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp +++ b/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp @@ -2,10 +2,10 @@ // // // -#include -#include -#include -#include +#include +#include +#include +#include /** | 1) circle is parameterized as: | @@ -25,84 +25,67 @@ | */ -template +template class FastCircle { - public: + FastCircle() {} + FastCircle(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); } - FastCircle(){} - FastCircle(T x1, T y1, - T x2, T y2, - T x3, T y3) { - compute(x1,y1,x2,y2,x3,y3); - } - - void compute(T x1, T y1, - T x2, T y2, - T x3, T y3); - + void compute(T x1, T y1, T x2, T y2, T x3, T y3); T m_xp; T m_yp; T m_c; T m_alpha; T m_beta; - }; +template +void FastCircle::compute(T x1, T y1, T x2, T y2, T x3, T y3) { + bool flip = std::abs(x3 - x1) > std::abs(y3 - y1); -template -void FastCircle::compute(T x1, T y1, - T x2, T y2, - T x3, T y3) { - bool flip = std::abs(x3-x1) > std::abs(y3-y1); - - auto x1p = x1-x2; - auto y1p = y1-y2; - auto d12 = x1p*x1p + y1p*y1p; - auto x3p = x3-x2; - auto y3p = y3-y2; - auto d32 = x3p*x3p + y3p*y3p; + auto x1p = x1 - x2; + auto y1p = y1 - y2; + auto d12 = x1p * x1p + y1p * y1p; + auto x3p = x3 - x2; + auto y3p = y3 - y2; + auto d32 = x3p * x3p + y3p * y3p; if (flip) { - std::swap(x1p,y1p); - std::swap(x3p,y3p); + std::swap(x1p, y1p); + std::swap(x3p, y3p); } - auto num = x1p*y3p-y1p*x3p; // num also gives correct sign for CT - auto det = d12*y3p-d32*y1p; - if( std::abs(det)==0 ) { + auto num = x1p * y3p - y1p * x3p; // num also gives correct sign for CT + auto det = d12 * y3p - d32 * y1p; + if (std::abs(det) == 0) { // and why we flip???? } - auto ct = num/det; - auto sn = det>0 ? T(1.) : T(-1.); - auto st2 = (d12*x3p-d32*x1p)/det; - auto seq = T(1.) +st2*st2; - auto al2 = sn/std::sqrt(seq); - auto be2 = -st2*al2; - ct *= T(2.)*al2; - + auto ct = num / det; + auto sn = det > 0 ? T(1.) : T(-1.); + auto st2 = (d12 * x3p - d32 * x1p) / det; + auto seq = T(1.) + st2 * st2; + auto al2 = sn / std::sqrt(seq); + auto be2 = -st2 * al2; + ct *= T(2.) 
* al2; + if (flip) { - std::swap(x1p,y1p); - std::swap(al2,be2); + std::swap(x1p, y1p); + std::swap(al2, be2); al2 = -al2; be2 = -be2; ct = -ct; } - + m_xp = x1; m_yp = y1; - m_c= ct; - m_alpha = al2 - ct*x1p; - m_beta = be2 - ct*y1p; - + m_c = ct; + m_alpha = al2 - ct * x1p; + m_beta = be2 - ct * y1p; } - - // compute curvature given two points (and origin) float fastDPHI(float ri, float ro, float dphi) { - /* x3=0 y1=0 x1=0; y3=ro @@ -111,7 +94,6 @@ float fastDPHI(float ri, float ro, float dphi) { // auto x2 = ri*dphi; // auto y2 = ri*(1.f-0.5f*dphi*dphi); - /* auto x1p = x1-x2; auto y1p = y1-y2; @@ -120,7 +102,7 @@ float fastDPHI(float ri, float ro, float dphi) { auto y3p = y3-y2; auto d32 = x3p*x3p + y3p*y3p; */ - + /* auto x1p = -x2; auto y1p = -y2; @@ -129,7 +111,6 @@ float fastDPHI(float ri, float ro, float dphi) { auto y3p = ro-y2; auto d32 = ri*ri + ro*ro - 2.f*ro*y2; */ - // auto rat = (ro -2.f*y2); // auto det = ro - ri - (ro - 2.f*ri -0.5f*ro)*dphi*dphi; @@ -138,60 +119,47 @@ float fastDPHI(float ri, float ro, float dphi) { // auto seq = det2 + dphi*dphi*(ro-2.f*ri)*(ro-2.f*ri); // *rat2; // auto seq = (ro-ri)*(ro-ri) + dphi*dphi*ri*ro; - // and little by little simplifing and removing higher over terms + // and little by little simplifing and removing higher over terms // we get - auto r2 = (ro-ri)*(ro-ri)/(dphi*dphi) + ri*ro; + auto r2 = (ro - ri) * (ro - ri) / (dphi * dphi) + ri * ro; - - // d2 = (ro-ri)*(ro-ri)/(4.f*r2 -ri*ro); + // d2 = (ro-ri)*(ro-ri)/(4.f*r2 -ri*ro); // return -2.f*dphi/std::sqrt(seq); - return -1.f/std::sqrt(r2/4.f); - + return -1.f / std::sqrt(r2 / 4.f); } +#include - -#include - -template +template bool equal(T a, T b) { // return float(a-b)==0; - return std::abs(float(a-b)) < std::abs(0.01f*a); + return std::abs(float(a - b)) < std::abs(0.01f * a); } - - -int n=0; -void go(float ri, float ro, float dphi, bool print=false) { +int n = 0; +void go(float ri, float ro, float dphi, bool print = false) { ++n; float x3 = 0.f, y3 = ro; - float x2 = ri*sin(dphi); - float y2 = ri*cos(dphi); - - - FastCircle c(0,0, - x2,y2, - x3,y3); + float x2 = ri * sin(dphi); + float y2 = ri * cos(dphi); - auto cc = fastDPHI(ri,ro,dphi); - if (print) std::cout << c.m_c << ' ' << cc << std::endl; - assert(equal(c.m_c,cc)); + FastCircle c(0, 0, x2, y2, x3, y3); - + auto cc = fastDPHI(ri, ro, dphi); + if (print) + std::cout << c.m_c << ' ' << cc << std::endl; + assert(equal(c.m_c, cc)); } int main() { + go(4., 7., 0.1, true); - - go(4.,7.,0.1, true); - - for (float r1=2; r1<15; r1+=1) - for (float dr=0.5; dr<10; dr+=0.5) - for (float dphi=0.02; dphi<0.2; dphi+=0.2) - go(r1,r1+dr,dphi); + for (float r1 = 2; r1 < 15; r1 += 1) + for (float dr = 0.5; dr < 10; dr += 0.5) + for (float dphi = 0.02; dphi < 0.2; dphi += 0.2) + go(r1, r1 + dr, dphi); std::cout << "done " << n << std::endl; return 0; }; - diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index fe78853d568ee..29a67c255ee1b 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -37,11 +37,10 @@ */ class SeedProducerFromSoA : public edm::global::EDProducer<> { public: - - explicit SeedProducerFromSoA(const edm::ParameterSet &iConfig); + explicit SeedProducerFromSoA(const edm::ParameterSet& iConfig); ~SeedProducerFromSoA() override = default; - static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + static void 
fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; @@ -52,16 +51,16 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { int32_t minNumberOfHits_; }; -SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet &iConfig) : - tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), +SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig) + : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), tokenTrack_(consumes(iConfig.getParameter("src"))), minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) { - produces(); + produces(); } -void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { +void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); desc.add("src", edm::InputTag("pixelTrackSoA")); @@ -71,102 +70,100 @@ void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descr } void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - // std::cout << "Converting gpu helix to trajectory seed" << std::endl; auto result = std::make_unique(); - edm::ESHandle fieldESH; iSetup.get().get(fieldESH); edm::ESHandle tracker; iSetup.get().get(tracker); - auto const & dus = tracker->detUnits(); + auto const& dus = tracker->detUnits(); - edm::ESHandle propagatorHandle; - iSetup.get().get("PropagatorWithMaterial",propagatorHandle); - const Propagator* propagator = &(*propagatorHandle); + edm::ESHandle propagatorHandle; + iSetup.get().get("PropagatorWithMaterial", propagatorHandle); + const Propagator* propagator = &(*propagatorHandle); edm::ESHandle httopo; iSetup.get().get(httopo); - - const auto &bsh = iEvent.get(tBeamSpot_); + const auto& bsh = iEvent.get(tBeamSpot_); // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); - const auto & tsoa = *(iEvent.get(tokenTrack_)); + const auto& tsoa = *(iEvent.get(tokenTrack_)); - auto const * quality = tsoa.qualityData(); - auto const & fit = tsoa.stateAtBS; - auto const & detIndices = tsoa.detIndices; + auto const* quality = tsoa.qualityData(); + auto const& fit = tsoa.stateAtBS; + auto const& detIndices = tsoa.detIndices; auto maxTracks = tsoa.stride(); int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { auto nHits = tsoa.nHits(it); - if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... auto q = quality[it]; if (q != trackQuality::loose) - continue; // FIXME - if (nHits< minNumberOfHits_) continue; + continue; // FIXME + if (nHits < minNumberOfHits_) + continue; ++nt; // fill hits with invalid just to hold the detId auto b = detIndices.begin(it); edm::OwnVector hits; for (int iHit = 0; iHit < nHits; ++iHit) { - auto const * det = dus[*(b+iHit)]; + auto const* det = dus[*(b + iHit)]; // FIXME at some point get a proper type ... - hits.push_back(new InvalidTrackingRecHit(*det,TrackingRecHit::bad)); + hits.push_back(new InvalidTrackingRecHit(*det, TrackingRecHit::bad)); } - - // mind: this values are respect the beamspot! + // mind: this values are respect the beamspot! 
float phi = tsoa.phi(it); - Rfit::Vector5d ipar,opar; - Rfit::Matrix5d icov,ocov; - fit.copyToDense(ipar,icov,it); - Rfit::transformToPerigeePlane(ipar,icov,opar,ocov); + Rfit::Vector5d ipar, opar; + Rfit::Matrix5d icov, ocov; + fit.copyToDense(ipar, icov, it); + Rfit::transformToPerigeePlane(ipar, icov, opar, ocov); - LocalTrajectoryParameters lpar(opar(0),opar(1),opar(2),opar(3),opar(4),1.); + LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); AlgebraicSymMatrix55 m; - for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = ocov(i,j); + for (int i = 0; i < 5; ++i) + for (int j = i; j < 5; ++j) + m(i, j) = ocov(i, j); float sp = std::sin(phi); float cp = std::cos(phi); - Surface::RotationType rot( - sp, -cp, 0, - 0, 0, -1.f, - cp, sp, 0); + Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0); - Plane impPointPlane(bs,rot); + Plane impPointPlane(bs, rot); GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), - impPointPlane.toGlobal(lpar.momentum()),lpar.charge(),fieldESH.product()); + impPointPlane.toGlobal(lpar.momentum()), + lpar.charge(), + fieldESH.product()); - JacobianLocalToCurvilinear jl2c(impPointPlane,lpar,*fieldESH.product()); + JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, *fieldESH.product()); - AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m); + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m); FreeTrajectoryState fts(gp, CurvilinearTrajectoryError(mo)); - auto const & lastHit = hits.back(); + auto const& lastHit = hits.back(); TrajectoryStateOnSurface outerState = propagator->propagate(fts, *lastHit.surface()); - if (!outerState.isValid()){ - edm::LogError("SeedFromGPU")<<" was trying to create a seed from:\n"<emplace_back(pTraj, hits, alongMomentum); - } iEvent.put(std::move(result)); From 468e9ac1493138c4be196b8551eef5db4fff9df5 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 12 Sep 2019 05:45:55 +0200 Subject: [PATCH 058/102] Synchronise with CMSSW_11_0_0_pre7 --- .../python/PostProcessorTracker_cfi.py | 70 +++++++++++++++---- .../RecoTrack/python/TrackValidation_cff.py | 40 ++++++++++- Validation/RecoTrack/python/plotting/html.py | 7 +- .../python/plotting/trackingPlots.py | 4 +- 4 files changed, 103 insertions(+), 18 deletions(-) diff --git a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py index 9cd28e6512bf0..91adbee0f9bba 100644 --- a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py +++ b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py @@ -1,11 +1,26 @@ import FWCore.ParameterSet.Config as cms from DQMServices.Core.DQMEDHarvester import DQMEDHarvester +def _addNoFlow(module): + _noflowSeen = set() + for eff in module.efficiency.value(): + tmp = eff.split(" ") + if "cut" in tmp[0]: + continue + ind = -1 + if tmp[ind] == "fake" or tmp[ind] == "simpleratio": + ind = -2 + if not tmp[ind] in _noflowSeen: + module.noFlowDists.append(tmp[ind]) + if not tmp[ind-1] in _noflowSeen: + module.noFlowDists.append(tmp[ind-1]) + postProcessorTrack = DQMEDHarvester("DQMGenericClient", subDirs = cms.untracked.vstring("Tracking/Track/*", "Tracking/TrackTPPtLess09/*", "Tracking/TrackFromPV/*", "Tracking/TrackFromPVAllTP/*", "Tracking/TrackAllTPEffic/*", "Tracking/TrackBuilding/*", "Tracking/TrackConversion/*", "Tracking/TrackGsf/*", "Tracking/TrackBHadron/*"), efficiency = cms.vstring( "effic 'Efficiency vs #eta' num_assoc(simToReco)_eta num_simul_eta", "efficPt 'Efficiency vs p_{T}' 
num_assoc(simToReco)_pT num_simul_pT", +# "efficPtvseta 'Efficiency in p_{T}-#eta plane' num_assoc(simToReco)_pTvseta num_simul_pTvseta", "effic_vs_hit 'Efficiency vs hit' num_assoc(simToReco)_hit num_simul_hit", "effic_vs_layer 'Efficiency vs layer' num_assoc(simToReco)_layer num_simul_layer", "effic_vs_pixellayer 'Efficiency vs pixel layer' num_assoc(simToReco)_pixellayer num_simul_pixellayer", @@ -20,6 +35,7 @@ "effic_vs_dzpv_zoomed 'Efficiency vs Dz(PV)' num_assoc(simToReco)_dzpv_zoomed num_simul_dzpv_zoomed", "duplicatesRate 'Duplicates Rate vs #eta' num_duplicate_eta num_reco_eta", "duplicatesRate_Pt 'Duplicates Rate vs p_{T}' num_duplicate_pT num_reco_pT", +# "duplicatesRate_Ptvseta 'Duplicates Rate in (p_{T}-#eta) plane' num_duplicate_pTvseta num_reco_pTvseta", "duplicatesRate_hit 'Duplicates Rate vs hit' num_duplicate_hit num_reco_hit", "duplicatesRate_layer 'Duplicates Rate vs layer' num_duplicate_layer num_reco_layer", "duplicatesRate_pixellayer 'Duplicates Rate vs pixel layer' num_duplicate_pixellayer num_reco_pixellayer", @@ -39,7 +55,7 @@ "duplicatesRate_chi2 'Duplicates Rate vs normalized #chi^{2}' num_duplicate_chi2 num_reco_chi2", "duplicatesRate_seedingLayerSet 'Duplicates rate vs. seedingLayerSet' num_duplicate_seedingLayerSet num_reco_seedingLayerSet", "chargeMisIdRate 'Charge MisID Rate vs #eta' num_chargemisid_eta num_reco_eta", - "chargeMisIdRate_Pt 'Charge MisID Rate vs p_{T}' num_chargemisid_pT num_reco_pT", +# "chargeMisIdRate_Ptvseta 'Charge MisID Rate in (p_{T}-#eta) plane' num_chargemisid_pTvseta num_reco_pTvseta", "chargeMisIdRate_hit 'Charge MisID Rate vs hit' num_chargemisid_hit num_reco_hit", "chargeMisIdRate_layer 'Charge MisID Rate vs layer' num_chargemisid_hit num_reco_layer", "chargeMisIdRate_pixellayer 'Charge MisID Rate vs pixel layer' num_chargemisid_hit num_reco_pixellayer", @@ -65,6 +81,7 @@ "effic_vertz_fwdneg 'efficiency in endcap(-) vs z of primary interaction vertex' num_assoc(simToReco)_vertz_fwdneg num_simul_vertz_fwdneg", "pileuprate 'Pileup Rate vs #eta' num_pileup_eta num_reco_eta", "pileuprate_Pt 'Pileup rate vs p_{T}' num_pileup_pT num_reco_pT", +# "pileuprate_Ptvseta 'Pileup rate in (p_{T}-#eta) plane' num_pileup_pTvseta num_reco_pTvseta", "pileuprate_hit 'Pileup rate vs hit' num_pileup_hit num_reco_hit", "pileuprate_layer 'Pileup rate vs layer' num_pileup_layer num_reco_layer", "pileuprate_pixellayer 'Pileup rate vs layer' num_pileup_pixellayer num_reco_pixellayer", @@ -85,6 +102,7 @@ "pileuprate_seedingLayerSet 'Pileup rate vs. 
seedingLayerSet' num_pileup_seedingLayerSet num_reco_seedingLayerSet", "fakerate 'Fake rate vs #eta' num_assoc(recoToSim)_eta num_reco_eta fake", "fakeratePt 'Fake rate vs p_{T}' num_assoc(recoToSim)_pT num_reco_pT fake", +# "fakeratePtvseta 'Fake rate in (p_{T}-#eta) plane' num_assoc(recoToSim)_pTvseta num_reco_pTvseta fake", "fakerate_vs_hit 'Fake rate vs hit' num_assoc(recoToSim)_hit num_reco_hit fake", "fakerate_vs_layer 'Fake rate vs layer' num_assoc(recoToSim)_layer num_reco_layer fake", "fakerate_vs_pixellayer 'Fake rate vs layer' num_assoc(recoToSim)_pixellayer num_reco_pixellayer fake", @@ -223,21 +241,24 @@ noFlowDists = cms.untracked.vstring(), outputFileName = cms.untracked.string("") ) -def _addNoFlow(module): - _noflowSeen = set() - for eff in module.efficiency.value(): - tmp = eff.split(" ") - if "cut" in tmp[0]: - continue - ind = -1 - if tmp[ind] == "fake" or tmp[ind] == "simpleratio": - ind = -2 - if not tmp[ind] in _noflowSeen: - module.noFlowDists.append(tmp[ind]) - if not tmp[ind-1] in _noflowSeen: - module.noFlowDists.append(tmp[ind-1]) _addNoFlow(postProcessorTrack) +postProcessorTrack2D = DQMEDHarvester("DQMGenericClient", + makeGlobalEffienciesPlot = cms.untracked.bool(False), + subDirs = cms.untracked.vstring("Tracking/Track/*", "Tracking/TrackTPPtLess09/*", "Tracking/TrackFromPV/*", "Tracking/TrackFromPVAllTP/*", "Tracking/TrackAllTPEffic/*", "Tracking/TrackBuilding/*", "Tracking/TrackConversion/*", "Tracking/TrackGsf/*", "Tracking/TrackBHadron/*"), + efficiency = cms.vstring( + "efficPtvseta 'Efficiency in p_{T}-#eta plane' num_assoc(simToReco)_pTvseta num_simul_pTvseta", + "duplicatesRate_Ptvseta 'Duplicates Rate in (p_{T}-#eta) plane' num_duplicate_pTvseta num_reco_pTvseta", + "chargeMisIdRate_Ptvseta 'Charge MisID Rate in (p_{T}-#eta) plane' num_chargemisid_pTvseta num_reco_pTvseta", + "pileuprate_Ptvseta 'Pileup rate in (p_{T}-#eta) plane' num_pileup_pTvseta num_reco_pTvseta", + "fakeratePtvseta 'Fake rate in (p_{T}-#eta) plane' num_assoc(recoToSim)_pTvseta num_reco_pTvseta fake", + ), + resolution = cms.vstring(), + noFlowDists = cms.untracked.vstring(), + outputFileName = cms.untracked.string("") +) +_addNoFlow(postProcessorTrack2D) + # nrec/nsim makes sense only for # - all tracks vs. all in-time TrackingParticles # - PV tracks vs. 
signal TrackingParticles @@ -246,12 +267,24 @@ def _addNoFlow(module): efficiency = cms.vstring( "nrecPerNsim 'Tracks/TrackingParticles vs #eta' num_reco2_eta num_simul_eta simpleratio", "nrecPerNsimPt 'Tracks/TrackingParticles vs p_{T}' num_reco2_pT num_simul_pT simpleratio", +# "nrecPerNsimPtvseta 'Tracks/TrackingParticles in (p_{T}-#eta) plane' num_reco2_pTvseta num_simul_pTvseta simpleratio", "nrecPerNsim_vs_pu 'Tracks/TrackingParticles vs pu' num_reco2_pu num_simul_pu simpleratio", ), resolution = cms.vstring(), noFlowDists = cms.untracked.vstring(), ) _addNoFlow(postProcessorTrackNrecVsNsim) +postProcessorTrackNrecVsNsim2D = DQMEDHarvester("DQMGenericClient", + makeGlobalEffienciesPlot = cms.untracked.bool(False), + subDirs = cms.untracked.vstring("Tracking/TrackFromPV/*", "Tracking/TrackAllTPEffic/*"), + efficiency = cms.vstring( + "nrecPerNsimPtvseta 'Tracks/TrackingParticles in (p_{T}-#eta) plane' num_reco2_pTvseta num_simul_pTvseta simpleratio", + ), + resolution = cms.vstring(), + noFlowDists = cms.untracked.vstring(), +) +_addNoFlow(postProcessorTrackNrecVsNsim2D) + postProcessorTrackSummary = DQMEDHarvester("DQMGenericClient", subDirs = cms.untracked.vstring("Tracking/Track", "Tracking/TrackTPPtLess09", "Tracking/TrackFromPV", "Tracking/TrackFromPVAllTP", "Tracking/TrackAllTPEffic", "Tracking/TrackBuilding", "Tracking/TrackConversion", "Tracking/TrackGsf", "Tracking/TrackBHadron"), @@ -273,6 +306,15 @@ def _addNoFlow(module): postProcessorTrackSummary ) +postProcessorTrackPhase2 = postProcessorTrack.clone() +postProcessorTrackPhase2.subDirs.extend(["Tracking/TrackTPEtaGreater2p7/*"]) +postProcessorTrackSummaryPhase2 = postProcessorTrackSummary.clone() +postProcessorTrackSummaryPhase2.subDirs.extend(["Tracking/TrackTPEtaGreater2p7/*"]) + +from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker +phase2_tracker.toReplaceWith(postProcessorTrack,postProcessorTrackPhase2) +phase2_tracker.toReplaceWith(postProcessorTrackSummary,postProcessorTrackSummaryPhase2) + postProcessorTrackTrackingOnly = postProcessorTrack.clone() postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackSeeding/*", "Tracking/PixelTrack/*", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*", "Tracking/PixelTrackBHadron/*"]) postProcessorTrackSummaryTrackingOnly = postProcessorTrackSummary.clone() diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index 52bb93d4ee858..ed460d0d8c3f7 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -9,7 +9,8 @@ from SimTracker.TrackAssociation.LhcParametersDefinerForTP_cfi import * from SimTracker.TrackAssociation.CosmicParametersDefinerForTP_cfi import * from Validation.RecoTrack.PostProcessorTracker_cfi import * -from . import cutsRecoTracks_cfi +import Validation.RecoTrack.cutsRecoTracks_cfi as cutsRecoTracks_cfi +#from . 
import cutsRecoTracks_cfi from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import * from SimTracker.VertexAssociation.VertexAssociatorByPositionAndTracks_cfi import * @@ -403,6 +404,34 @@ def _getMVASelectors(postfix): doResolutionPlotsForLabels = ["disabled"], # resolutions are same as in trackValidator, no need to repeat here ) +# for high-eta (phase2 : |eta| > 2.7) +trackValidatorTPEtaGreater2p7 = trackValidator.clone( + dirName = "Tracking/TrackTPEtaGreater2p7/", + label = [x for x in trackValidator.label.value() if ("Pt09" not in x) and ("BtvLike" not in x) and ("AK4PFJets" not in x)], + dodEdxPlots = False, +# doPVAssociationPlots = False, + minRapidityTP = -2.7, + maxRapidityTP = 2.7, + invertRapidityCutTP = True, +# ptMaxTP = 0.9, # set maximum pT globally + histoProducerAlgoBlock = dict( + TpSelectorForEfficiencyVsPt = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), # enough to set min pT here + TpSelectorForEfficiencyVsEta = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), # enough to set min pT here + TpSelectorForEfficiencyVsPhi = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), + TpSelectorForEfficiencyVsVTXR = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), + TpSelectorForEfficiencyVsVTXZ = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), + generalTpSelector = dict(ptMin=0.005,minRapidity=-2.7,maxRapidity=2.7,invertRapidityCut=True), +# minEta = -4.5, +# maxEta = 4.5, +# nintEta = 90, + # minPt = 0.01, + ), + doSimPlots = False, # same as in trackValidator, no need to repeat here + doRecoTrackPlots = False, # fake rates are same as in trackValidator, no need to repeat here + doResolutionPlotsForLabels = ["disabled"] # resolutions are same as in trackValidator, no need to repeat here +) + + # For efficiency of signal TPs vs. signal tracks, and fake rate of # signal tracks vs. 
signal TPs trackValidatorFromPV = trackValidator.clone( @@ -604,6 +633,14 @@ def _uniqueFirstLayers(layerList): trackValidatorGsfTracks, tracksPreValidation ) + +from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker +#tracksValidationPhase2 = cms.Sequence(tracksValidation+trackValidatorTPEtaGreater2p7) # it does not work +tracksValidationPhase2 = tracksValidation.copy() +tracksValidationPhase2+=trackValidatorTPEtaGreater2p7 +phase2_tracker.toReplaceWith(tracksValidation, tracksValidationPhase2) + + fastSim.toReplaceWith(tracksValidation, tracksValidation.copyAndExclude([ trackValidatorBuildingPreSplitting, trackValidatorConversion, @@ -661,7 +698,6 @@ def _uniqueFirstLayers(layerList): trackValidatorAllTPEfficStandalone = trackValidatorAllTPEffic.clone( label = [ x for x in trackValidator.label.value() if x not in ["cutsRecoTracksBtvLike", "cutsRecoTracksAK4PFJets"] and "Pt09" not in x], cores = "highPtJets" - ) trackValidatorConversionStandalone = trackValidatorConversion.clone( diff --git a/Validation/RecoTrack/python/plotting/html.py b/Validation/RecoTrack/python/plotting/html.py index b884c4682c4d5..d4f83995b8bcb 100644 --- a/Validation/RecoTrack/python/plotting/html.py +++ b/Validation/RecoTrack/python/plotting/html.py @@ -184,6 +184,8 @@ def _toPixel(s): "miniaod": "MiniAOD", "timing": "Timing", "hlt": "HLT", + "pixel": "Pixel tracks", + "pf": "PF", } _sectionNameMapOrder = collections.OrderedDict([ @@ -329,6 +331,7 @@ class MiniAOD: pass class Timing: pass class HLT: pass class Pixel: pass + class PF: pass class Page(object): def __init__(self, title, sampleName): @@ -712,6 +715,7 @@ def __init__(self, sample, title, fastVsFull, pileupComparison): self._vertexPage = PageSet(*params) self._miniaodPage = PageSet(*params) self._timingPage = PageSet(*params) + self._pfPages = PageSet(*params) self._hltPages = PageSet(*params, dqmSubFolderTranslatedToSectionName=lambda algoQuality: algoQuality[0]) self._pixelPages = TrackingPageSet(*params) self._otherPages = PageSet(*params) @@ -722,6 +726,7 @@ def __init__(self, sample, title, fastVsFull, pileupComparison): PlotPurpose.Vertexing: self._vertexPage, PlotPurpose.MiniAOD: self._miniaodPage, PlotPurpose.Timing: self._timingPage, + PlotPurpose.PF: self._pfPages, PlotPurpose.HLT: self._hltPages, PlotPurpose.Pixel: self._pixelPages, } @@ -745,7 +750,7 @@ def write(self, baseDir): "
      ", ] - for pages in [self._summaryPage, self._iterationPages, self._pixelPages, self._vertexPage, self._miniaodPage, self._timingPage, self._hltPages, self._otherPages]: + for pages in [self._summaryPage, self._iterationPages, self._pixelPages, self._vertexPage, self._miniaodPage, self._timingPage, self._hltPages, self._pfPages, self._otherPages]: labelFiles = pages.write(baseDir) for label, fname in labelFiles: ret.append('
<li><a href="%s">%s</a></li>
    • ' % (fname, label)) diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index 2250b736bc00a..36aff86208761 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -371,7 +371,9 @@ def _makeMVAPlots(num, hp=False): Plot("chi2_prob", stat=True, normalizeToUnitArea=True, drawStyle="hist", xtitle="Prob(#chi^{2})"), Plot("chi2mean", title="", xtitle="#eta", ytitle="< #chi^{2} / ndf >", ymin=[0, 0.5], ymax=[2, 2.5, 3, 5], fallback={"name": "chi2_vs_eta", "profileX": True}), - Plot("ptres_vs_eta_Mean", scale=100, title="", xtitle="TP #eta (PCA to beamline)", ytitle="< #delta p_{T} / p_{T} > (%)", ymin=_minResidualPt, ymax=_maxResidualPt) + Plot("ptres_vs_eta_Mean", scale=100, title="", xtitle="TP #eta (PCA to beamline)", ytitle="< #delta p_{T} / p_{T} > (%)", ymin=_minResidualPt, ymax=_maxResidualPt), + Plot("chi2mean_vs_pt", title="", xtitle="p_{T}", ytitle="< #chi^{2} / ndf >", ymin=[0, 0.5], ymax=[2, 2.5, 3, 5], xlog=True, fallback={"name": "chi2_vs_pt", "profileX": True}), + Plot("ptres_vs_pt_Mean", title="", xtitle="p_{T}", ytitle="< #delta p_{T}/p_{T} > (%)", scale=100, ymin=_minResidualPt, ymax=_maxResidualPt,xlog=True) ]) _common = {"stat": True, "fit": True, "normalizeToUnitArea": True, "drawStyle": "hist", "drawCommand": "", "xmin": -10, "xmax": 10, "ylog": True, "ymin": 5e-5, "ymax": [0.01, 0.05, 0.1, 0.2, 0.5, 0.8, 1.025], "ratioUncertainty": False} _pulls = PlotGroup("pulls", [ From 604a797342ff6258a09d251414e0ba7aa0e154c2 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 23 Oct 2019 02:13:49 -0500 Subject: [PATCH 059/102] Fix clang warnings (cms-patatrack#387) --- RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc | 2 +- RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h | 2 +- .../PixelTriplets/plugins/gpuPixelDoubletsAlgos.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index f70b61d406d1a..11b644d466768 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -74,7 +74,7 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const CUDAScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, std::move(gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream()))); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } else { auto const& hits = iEvent.get(tokenHitCPU_); iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index dbe3e3f8c964e..6f3a15ad84d90 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -1,5 +1,5 @@ #ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h -#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h +#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h #include "RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 
be668dfbb2e04..b14434d42bed9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -1,5 +1,5 @@ #ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoubletsAlgos_h -#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h +#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoubletsAlgos_h #include #include From 80ec6eb0744a03e0ada55c49f515c40fafdf8563 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Sat, 26 Oct 2019 13:57:43 -0500 Subject: [PATCH 060/102] Replace use of API wrapper stream and event with plain CUDA, part 1 (cms-patatrack#389) Replace cuda::stream_t<> with cudaStream_t in client code Replace cuda::event_t with cudaEvent_t in the client code Clean up BuildFiles --- CUDADataFormats/Track/BuildFile.xml | 2 +- .../PixelTrackFitting/test/BuildFile.xml | 3 - .../plugins/BrokenLineFitOnGPU.cu | 74 ++++----- .../PixelTriplets/plugins/BuildFile.xml | 1 - .../plugins/CAHitNtupletGeneratorKernels.cc | 2 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 38 ++--- .../plugins/CAHitNtupletGeneratorKernels.h | 4 +- .../CAHitNtupletGeneratorKernelsAlloc.h | 10 +- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 19 ++- .../plugins/CAHitNtupletGeneratorOnGPU.h | 10 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 10 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 146 +++++++++--------- 12 files changed, 153 insertions(+), 166 deletions(-) diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml index 2aa4baedb0bb2..bf606ba2330e1 100644 --- a/CUDADataFormats/Track/BuildFile.xml +++ b/CUDADataFormats/Track/BuildFile.xml @@ -1,4 +1,4 @@ - + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index 03c2713760fe6..68349ca4f45a4 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -36,7 +36,6 @@ - @@ -44,7 +43,6 @@ - @@ -52,7 +50,6 @@ - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 3825c6d812cfb..660cf75e1f460 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -4,7 +4,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, - cuda::stream_t<> &stream) { + cudaStream_t stream) { assert(tuples_d); auto blockSize = 64; @@ -20,64 +20,64 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernelBLFastFit<3><<>>( + kernelBLFastFit<3><<>>( tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<3><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); + kernelBLFit<3><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); cudaCheck(cudaGetLastError()); // fit quads - kernelBLFastFit<4><<>>( + kernelBLFastFit<4><<>>( tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); - 
kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + kernelBLFit<4><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4><<>>( + kernelBLFastFit<4><<>>( tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernelBLFit<4><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); } else { // fit penta (all 5) - kernelBLFastFit<5><<>>( + kernelBLFastFit<5><<>>( tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<5><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernelBLFit<5><<>>(tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index da08ce124941c..1554e515ad437 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,5 +1,4 @@ - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index d85e09d7e2df6..83dab9c5b9c28 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -11,7 +11,7 @@ void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSo } template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) { +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { auto nhits = hh.nHits(); #ifdef NTUPLE_DEBUG diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 93af3d43ff06e..a9436aeb32d23 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -147,7 +147,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * } template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) { +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { auto nhits = hh.nHits(); #ifdef NTUPLE_DEBUG @@ -166,12 +166,12 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s int threadsPerBlock = 128; // at least one block! 
int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; - gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_, + device_theCellNeighborsContainer_.get(), + device_theCellTracks_, + device_theCellTracksContainer_.get()); cudaCheck(cudaGetLastError()); } @@ -199,18 +199,18 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cuda::s int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock; dim3 blks(1, blocks, 1); dim3 thrs(stride, threadsPerBlock, 1); - gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZCut_, - m_params.doPhiCut_, - m_params.maxNumberOfDoublets_); + gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZCut_, + m_params.doPhiCut_, + m_params.maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 2cbea06e66c55..dfccaf33e904d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -170,8 +170,8 @@ class CAHitNtupletGeneratorKernels { void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream); - void allocateOnGPU(cuda::stream_t<>& stream); + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void allocateOnGPU(cudaStream_t stream); void cleanup(cudaStream_t cudaStream); static void printCounters(Counters const* counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index 42a89a13ff78e..b91911c66924e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -4,9 +4,9 @@ template <> #ifdef __CUDACC__ -void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cuda::stream_t<>& stream) { +void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) { #else -void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) { +void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { #endif ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) @@ -42,10 +42,10 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cuda::stream_t<>& stream) { constexpr #endif (std::is_same::value) { - cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; } - 
cudautils::launchZero(device_tupleMultiplicity_.get(), stream.id()); - cudautils::launchZero(device_hitToTuple_.get(), stream.id()); // we may wish to keep it in the edm... + cudautils::launchZero(device_tupleMultiplicity_.get(), stream); + cudautils::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 0114795db49d6..f276b0ccf77c8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -161,7 +161,7 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, float bfield, - cuda::stream_t<>& stream) const { + cudaStream_t stream) const { PixelTrackHeterogeneous tracks(cudautils::make_device_unique(stream)); auto* soa = tracks.get(); @@ -174,32 +174,31 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); kernels.buildDoublets(hits_d, stream); - kernels.launchKernels(hits_d, soa, stream.id()); - kernels.fillHitDetIndices(hits_d.view(), soa, stream.id()); // in principle needed only if Hits not "available" + kernels.launchKernels(hits_d, soa, stream); + kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } else { fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } - kernels.classifyTuples(hits_d, soa, stream.id()); + kernels.classifyTuples(hits_d, soa, stream); return tracks; } PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { PixelTrackHeterogeneous tracks(std::make_unique()); - auto dummyStream = cuda::stream::wrap(0, 0, false); auto* soa = tracks.get(); assert(soa); CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.counters_ = m_counters; - kernels.allocateOnGPU(dummyStream); + kernels.allocateOnGPU(nullptr); - kernels.buildDoublets(hits_d, dummyStream); - kernels.launchKernels(hits_d, soa, dummyStream.id()); - kernels.fillHitDetIndices(hits_d.view(), soa, dummyStream.id()); // in principle needed only if Hits not "available" + kernels.buildDoublets(hits_d, nullptr); + kernels.launchKernels(hits_d, soa, nullptr); + kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" if (0 == hits_d.nHits()) return tracks; @@ -214,7 +213,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); } - kernels.classifyTuples(hits_d, soa, dummyStream.id()); + kernels.classifyTuples(hits_d, soa, nullptr); return tracks; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 1cb8bb31fb0b6..de2e1913dd18b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ 
-47,18 +47,16 @@ class CAHitNtupletGeneratorOnGPU { static void fillDescriptions(edm::ParameterSetDescription& desc); static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cuda::stream_t<>& stream) const; + PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; private: - void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream) const; + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; - void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cuda::stream_t<>& cudaStream); + void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); - void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cuda::stream_t<>& cudaStream) const; + void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; Params m_params; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 50f4d0580c2f4..3dc8beb65d9bb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -45,14 +45,8 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } - void launchRiemannKernels(HitsView const *hv, - uint32_t nhits, - uint32_t maxNumberOfTuples, - cuda::stream_t<> &cudaStream); - void launchBrokenLineKernels(HitsView const *hv, - uint32_t nhits, - uint32_t maxNumberOfTuples, - cuda::stream_t<> &cudaStream); + void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 690bce4edcf8f..cb5d32b47aea3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -4,7 +4,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, - cuda::stream_t<> &stream) { + cudaStream_t stream) { assert(tuples_d); auto blockSize = 64; @@ -23,108 +23,108 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernelFastFit<3><<>>( + kernelFastFit<3><<>>( tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelCircleFit<3><<>>(tupleMultiplicity_d, + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - 
kernelLineFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelLineFit<3><<>>(tupleMultiplicity_d, + 3, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); // quads - kernelFastFit<4><<>>( + kernelFastFit<4><<>>( tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelCircleFit<4><<>>(tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelLineFit<4><<>>(tupleMultiplicity_d, + 4, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // penta - kernelFastFit<4><<>>( + kernelFastFit<4><<>>( tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelCircleFit<4><<>>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelLineFit<4><<>>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernelFastFit<5><<>>( + kernelFastFit<5><<>>( tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelCircleFit<5><<>>(tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernelLineFit<5><<>>(tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } } From 418100730c8b719de7151535446cb300c7243708 Mon Sep 17 00:00:00 2001 From: waredjeb <39335169+waredjeb@users.noreply.github.com> Date: Tue, 29 Oct 2019 07:09:04 +0100 Subject: [PATCH 061/102] Replace CUDA API wrapper memory operations with native CUDA 
calls (cms-patatrack#395) --- .../PixelTrackFitting/test/testEigenGPU.cu | 13 ++--- .../test/testEigenGPUNoFit.cu | 52 ++++++++++--------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index a206feca83b52..7b02a23c41dca 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -261,10 +261,10 @@ void testFit() { kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); cudaDeviceSynchronize(); - cudaMemcpy(fast_fit_resultsGPUret, - fast_fit_resultsGPU, - Rfit::maxNumberOfTracks() * sizeof(Vector4d), - cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(fast_fit_resultsGPUret, + fast_fit_resultsGPU, + Rfit::maxNumberOfTracks() * sizeof(Vector4d), + cudaMemcpyDeviceToHost)); Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4); std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl; assert(isEqualFuzzy(fast_fit_results, fast_fit)); @@ -311,13 +311,14 @@ void testFit() { std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; - cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); + cudaCheck( + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; // LINE_FIT GPU - cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost)); std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 
1e-4 : 1e-6)); // requires fma on CPU diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index ebaea2037eb2a..e16ac3dbbcbc3 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -73,18 +73,19 @@ void testMultiply() { Eigen::Matrix *multiply_resultGPU = nullptr; Eigen::Matrix *multiply_resultGPUret = new Eigen::Matrix(); - cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix)); - cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix)); - cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix)); - cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); - cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); - cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy( + multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU); cudaDeviceSynchronize(); - cudaMemcpy( - multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy( + multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost)); printIt(multiply_resultGPUret); assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret))); } @@ -104,14 +105,14 @@ void testInverse3x3() { std::cout << "Here is the matrix m:" << std::endl << m << std::endl; std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; #endif - cudaMalloc((void **)&mGPU, sizeof(Matrix3d)); - cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)); - cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix3d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); - cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; #endif @@ -133,14 +134,14 @@ void testInverse4x4() { std::cout << "Here is the matrix m:" << std::endl << m << std::endl; std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; #endif - cudaMalloc((void **)&mGPU, sizeof(Matrix4d)); - cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)); - cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix4d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix4d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice)); kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); - cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) 
<< std::endl; #endif @@ -162,14 +163,14 @@ void testInverse5x5() { std::cout << "Here is the matrix m:" << std::endl << m << std::endl; std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; #endif - cudaMalloc((void **)&mGPU, sizeof(Matrix5d)); - cudaMalloc((void **)&mGPUret, sizeof(Matrix5d)); - cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix5d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix5d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice)); kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); - cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; #endif @@ -195,15 +196,16 @@ void testEigenvalues() { std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl; std::cout << "*************************\n\n" << std::endl; #endif - cudaMalloc((void **)&m_gpu, sizeof(Matrix3d)); - cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType)); - cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&m_gpu, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType))); + cudaCheck(cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); kernel<<<1, 1>>>(m_gpu, ret_gpu); cudaDeviceSynchronize(); - cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost); - cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy( + ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; From 37567862ad499170bf3f62a7cd20e2c2446faf4f Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 29 Oct 2019 05:10:07 -0500 Subject: [PATCH 062/102] Synchronize event in the CUDAProductBase destructor (cms-patatrack#391) Otherwise there are possibilities for weird races, e.g. combination of non-ExternalWork producers, consumed-but-not-read CUDAProducts, CUDA streams executing work later than expected (= on the next event). 
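As a rough illustration of the idea (not the actual CUDAProductBase code; the member name and cleanup policy below are assumptions), the destructor now has to wait on the event recorded for the product's asynchronous work before the product goes away:

    #include <cuda_runtime.h>

    // Sketch only: a product that owns the CUDA event recorded at the end of the
    // asynchronous work that filled it.
    class ProductWithEvent {
    public:
      ~ProductWithEvent() {
        if (event_ != nullptr) {
          // If the product was consumed but never read, this event may not have
          // occurred yet; waiting here keeps the CUDA stream from still running
          // this product's work while the framework has moved on to the next event.
          cudaEventSynchronize(event_);
          cudaEventDestroy(event_);
        }
      }

    private:
      cudaEvent_t event_ = nullptr;
    };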
--- CUDADataFormats/Track/BuildFile.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml index bf606ba2330e1..e3f9a0910bbd8 100644 --- a/CUDADataFormats/Track/BuildFile.xml +++ b/CUDADataFormats/Track/BuildFile.xml @@ -1,5 +1,6 @@ + From 5a135cb909acef9bdadde9a80105aceb628cfaaa Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 4 Nov 2019 11:48:34 +0100 Subject: [PATCH 063/102] Synchronise with CMSSW_11_0_0_pre11 --- Validation/RecoTrack/python/plotting/trackingPlots.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index 36aff86208761..0a2ea8a35ee1b 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -1482,6 +1482,9 @@ def modules(self): "initialStepClassifier3", "initialStep", "initialStepSelector"], + building=["initialStepTrackCandidatesMkFitInput", + "initialStepTrackCandidatesMkFit", + "initialStepTrackCandidates"], other=["firstStepPrimaryVerticesUnsorted", "initialStepTrackRefsForJets", "caloTowerForTrk", From f53290113eb6827564a9e221d0b6be13aa1d7fc6 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 26 Nov 2019 18:50:17 +0100 Subject: [PATCH 064/102] Optimize doublet reconstruction and cuts (cms-patatrack#411) Reorder cuts and some factorize code to speed up doublets. Increase various buffers size not to overflow in case of very relaxed cuts. Rename some parameters to better reflect their actual action in code. --- .../PixelTriplets/plugins/CAConstants.h | 13 +++++-- .../plugins/CAHitNtupletGeneratorKernels.cc | 4 +-- .../plugins/CAHitNtupletGeneratorKernels.cu | 8 ++--- .../plugins/CAHitNtupletGeneratorKernels.h | 12 +++---- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 8 ++--- .../PixelTriplets/plugins/gpuPixelDoublets.h | 8 ++--- .../plugins/gpuPixelDoubletsAlgos.h | 35 +++++++++++-------- 7 files changed, 52 insertions(+), 36 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index d93d89965607f..1170550032068 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -15,10 +15,14 @@ namespace CAConstants { // constants +#ifndef ONLY_PHICUT #ifdef GPU_SMALL_EVENTS constexpr uint32_t maxNumberOfTuples() { return 3 * 1024; } #else constexpr uint32_t maxNumberOfTuples() { return 24 * 1024; } +#endif +#else + constexpr uint32_t maxNumberOfTuples() { return 48 * 1024; } #endif constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT @@ -30,8 +34,8 @@ namespace CAConstants { constexpr uint32_t maxCellsPerHit() { return 128 / 2; } #endif #else - constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; } - constexpr uint32_t maxCellsPerHit() { return 4 * 128; } + constexpr uint32_t maxNumberOfDoublets() { return 2*1024 * 1024; } + constexpr uint32_t maxCellsPerHit() { return 8 * 128; } #endif constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } @@ -43,8 +47,13 @@ namespace CAConstants { using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct using tindex_type = uint16_t; // for tuples +#ifndef ONLY_PHICUT using CellNeighbors = GPU::VecArray; using CellTracks = GPU::VecArray; +#else + using CellNeighbors = GPU::VecArray; + using CellTracks = GPU::VecArray; 
+#endif using CellNeighborsVector = GPU::SimpleVector; using CellTracksVector = GPU::SimpleVector; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 83dab9c5b9c28..75066458dc170 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -54,8 +54,8 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr nActualPairs, m_params.idealConditions_, m_params.doClusterCut_, - m_params.doZCut_, - m_params.doPhiCut_, + m_params.doZ0Cut_, + m_params.doPtCut_, m_params.maxNumberOfDoublets_); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index a9436aeb32d23..aaf882633f17d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -194,9 +194,9 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr } assert(nActualPairs <= gpuPixelDoublets::nPairs); - int stride = 1; + int stride = 4; int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; - int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; dim3 blks(1, blocks, 1); dim3 thrs(stride, threadsPerBlock, 1); gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), @@ -208,8 +208,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr nActualPairs, m_params.idealConditions_, m_params.doClusterCut_, - m_params.doZCut_, - m_params.doPhiCut_, + m_params.doZ0Cut_, + m_params.doPtCut_, m_params.maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index dfccaf33e904d..0140958aa9ef2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -62,8 +62,8 @@ namespace cAHitNtupletGenerator { bool idealConditions, bool doStats, bool doClusterCut, - bool doZCut, - bool doPhiCut, + bool doZ0Cut, + bool doPtCut, float ptmin, float CAThetaCutBarrel, float CAThetaCutForward, @@ -82,8 +82,8 @@ namespace cAHitNtupletGenerator { idealConditions_(idealConditions), doStats_(doStats), doClusterCut_(doClusterCut), - doZCut_(doZCut), - doPhiCut_(doPhiCut), + doZ0Cut_(doZ0Cut), + doPtCut_(doPtCut), ptmin_(ptmin), CAThetaCutBarrel_(CAThetaCutBarrel), CAThetaCutForward_(CAThetaCutForward), @@ -103,8 +103,8 @@ namespace cAHitNtupletGenerator { const bool idealConditions_; const bool doStats_; const bool doClusterCut_; - const bool doZCut_; - const bool doPhiCut_; + const bool doZ0Cut_; + const bool doPtCut_; const float ptmin_; const float CAThetaCutBarrel_; const float CAThetaCutForward_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index f276b0ccf77c8..2e875caba7130 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -63,8 +63,8 @@ 
CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg.getParameter("idealConditions"), cfg.getParameter("fillStatistics"), cfg.getParameter("doClusterCut"), - cfg.getParameter("doZCut"), - cfg.getParameter("doPhiCut"), + cfg.getParameter("doZ0Cut"), + cfg.getParameter("doPtCut"), cfg.getParameter("ptmin"), cfg.getParameter("CAThetaCutBarrel"), cfg.getParameter("CAThetaCutForward"), @@ -135,8 +135,8 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc.add("includeJumpingForwardDoublets", false); desc.add("fit5as4", true); desc.add("doClusterCut", true); - desc.add("doZCut", true); - desc.add("doPhiCut", true); + desc.add("doZ0Cut", true); + desc.add("doPtCut", true); desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); edm::ParameterSetDescription trackQualityCuts; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 6f3a15ad84d90..5f9cb4f79aa63 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -93,8 +93,8 @@ namespace gpuPixelDoublets { int nActualPairs, bool ideal_cond, bool doClusterCut, - bool doZCut, - bool doPhiCut, + bool doZ0Cut, + bool doPtCut, uint32_t maxNumOfDoublets) { auto const& __restrict__ hh = *hhp; doubletsFromHisto(layerPairs, @@ -111,8 +111,8 @@ namespace gpuPixelDoublets { maxr, ideal_cond, doClusterCut, - doZCut, - doPhiCut, + doZ0Cut, + doPtCut, maxNumOfDoublets); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index b14434d42bed9..ae722471e3ab7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -36,8 +36,8 @@ namespace gpuPixelDoubletsAlgos { float const* __restrict__ maxr, bool ideal_cond, bool doClusterCut, - bool doZCut, - bool doPhiCut, + bool doZ0Cut, + bool doPtCut, uint32_t maxNumOfDoublets) { // ysize cuts (z in the barrel) times 8 // these are used if doClusterCut is true @@ -108,9 +108,15 @@ namespace gpuPixelDoubletsAlgos { if (mi > 2000) continue; // invalid + /* maybe clever, not effective when zoCut is on + auto bpos = (mi%8)/4; // if barrel is 1 for z>0 + auto fpos = (outer>3) & (outer<7); + if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue; + */ + auto mez = hh.zGlobal(i); - if (doZCut && (mez < minz[pairLayerId] || mez > maxz[pairLayerId])) + if (mez < minz[pairLayerId] || mez > maxz[pairLayerId]) continue; int16_t mes = -1; // make compiler happy @@ -139,12 +145,11 @@ namespace gpuPixelDoubletsAlgos { constexpr float minRadius = hardPtCut * 87.78f; // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) constexpr float minRadius2T4 = 4.f * minRadius * minRadius; - auto ptcut = [&](int j, int16_t mop) { + auto ptcut = [&](int j, int16_t idphi) { auto r2t4 = minRadius2T4; auto ri = mer; auto ro = hh.rGlobal(j); - // auto mop = hh.iphi(j); - auto dphi = short2phi(std::min(std::abs(int16_t(mep - mop)), std::abs(int16_t(mop - mep)))); + auto dphi = short2phi(idphi); return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); }; auto z0cutoff = [&](int j) { @@ -173,6 +178,7 @@ namespace gpuPixelDoubletsAlgos { auto kl = Hist::bin(int16_t(mep - iphicut)); auto kh = Hist::bin(int16_t(mep + iphicut)); auto incr = [](auto& k) { return k = (k + 1) % 
Hist::nbins(); }; + // bool piWrap = std::abs(kh-kl) > Hist::nbins()/2; #ifdef GPU_DEBUG int tot = 0; @@ -197,15 +203,16 @@ namespace gpuPixelDoubletsAlgos { auto mo = hh.detectorIndex(oi); if (mo > 2000) continue; // invalid + + if (doZ0Cut && z0cutoff(oi)) continue; + auto mop = hh.iphi(oi); - if (std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))) > iphicut) - continue; - if (doPhiCut) { - if (doClusterCut && zsizeCut(oi)) - continue; - if (z0cutoff(oi) || ptcut(oi, mop)) - continue; - } + uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); + if (idphi > iphicut) continue; + + if (doClusterCut && zsizeCut(oi)) continue; + if (doPtCut && ptcut(oi, idphi)) continue; + auto ind = atomicAdd(nCells, 1); if (ind >= maxNumOfDoublets) { atomicSub(nCells, 1); From 2b7e4cb40d2a342db2ecfe1989c4b6d6942deee3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 19 Nov 2019 10:16:18 +0100 Subject: [PATCH 065/102] Migrate cluster track associator (cms-patatrack#409) Migrate ClusterTPAssociationHeterogeneous using the depreacted HeterogeneousEDProducer to ClusterTPAssociationProducerCUDA, and implement a simple analyzer to consume its procuct. To test it, add a dummy analyzer to an MC workflow: process.load("SimTracker.TrackerHitAssociation.clusterTPCUDAdump_cfi") process.validation_step = cms.EndPath(process.globalValidationPixelTrackingOnly + process.clusterTPCUDAdump) process.tpClusterProducerCUDAPreSplitting.dumpCSV = True --- .../RecoTrack/python/TrackValidation_cff.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index ed460d0d8c3f7..c6bdad88b5e50 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -485,13 +485,11 @@ def _getMVASelectors(postfix): # Built tracks, in the standard sequence mainly for monitoring the track selection MVA tpClusterProducerPreSplitting = tpClusterProducer.clone(pixelClusterSrc = "siPixelClustersPreSplitting") quickTrackAssociatorByHitsPreSplitting = quickTrackAssociatorByHits.clone(cluster2TPSrc = "tpClusterProducerPreSplitting") -tpClusterProducerHeterogeneousPreSplitting = tpClusterProducerHeterogeneous.clone( + +tpClusterProducerCUDAPreSplitting = tpClusterProducerCUDA.clone( pixelClusterSrc = "siPixelClustersPreSplitting" ) -from Configuration.ProcessModifiers.gpu_cff import gpu -gpu.toReplaceWith(tpClusterProducerPreSplitting, tpClusterProducerConverter.clone( - src = "tpClusterProducerHeterogeneousPreSplitting" -)) + _trackValidatorSeedingBuilding = trackValidator.clone( # common for built tracks and seeds (in trackingOnly) associators = ["quickTrackAssociatorByHits"], UseAssociators = True, @@ -595,7 +593,6 @@ def _uniqueFirstLayers(layerList): ) tracksValidationTruth = cms.Task( tpClusterProducer, - tpClusterProducerHeterogeneousPreSplitting, tpClusterProducerPreSplitting, quickTrackAssociatorByHits, quickTrackAssociatorByHitsPreSplitting, @@ -603,6 +600,16 @@ def _uniqueFirstLayers(layerList): VertexAssociatorByPositionAndTracks, trackingParticleNumberOfLayersProducer ) + +#gpu tp ??? 
+from Configuration.ProcessModifiers.gpu_cff import gpu +tpClusterProducerPreSplittingCUDA = cms.Task( + tpClusterProducerCUDAPreSplitting +) +_tracksValidationTruth_gpu = tracksValidationTruth.copy() +_tracksValidationTruth_gpu.add(tpClusterProducerPreSplittingCUDA) +gpu.toReplaceWith(tracksValidationTruth,_tracksValidationTruth_gpu) + fastSim.toModify(tracksValidationTruth, lambda x: x.remove(tpClusterProducer)) tracksPreValidation = cms.Task( From ae764c529dc191e21f2d0896ec967fd0107f6b40 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 27 Nov 2019 15:17:05 +0100 Subject: [PATCH 066/102] Drop obsolete heterogenous framework (cms-patatrack#416) --- .../PixelTrackFitting/plugins/BuildFile.xml | 4 +-- .../plugins/PixelTrackSoAFromCUDA.cc | 8 +++--- .../PixelTriplets/plugins/BuildFile.xml | 4 +-- .../TkSeedGenerator/plugins/BuildFile.xml | 20 +++++++------- .../plugins/SeedProducerFromSoA.cc | 26 +++++++++---------- 5 files changed, 26 insertions(+), 36 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml index 8c0261ee0d999..ecfbd99b667fc 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml @@ -1,8 +1,6 @@ - - - + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 2d6da6a631151..3e73cfd7a4e96 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -2,22 +2,20 @@ #include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Framework/interface/stream/EDProducer.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" - -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { public: diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 1554e515ad437..cacfe4662b51f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,17 +1,15 @@ + - - - diff --git a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml index c10ee14dc3638..26ef004940306 100644 --- a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml +++ b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml @@ -1,14 +1,12 @@ + + - - - - - - - - - - - + + + + + + + diff --git 
a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index 29a67c255ee1b..5c5e5fecf41cf 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -1,37 +1,35 @@ +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" -#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" -#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" #include "DataFormats/GeometrySurface/interface/Plane.h" -#include "DataFormats/TrajectorySeed/interface/TrajectorySeedCollection.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" #include "DataFormats/TrackingRecHit/interface/InvalidTrackingRecHit.h" +#include "DataFormats/TrajectorySeed/interface/TrajectorySeedCollection.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" #include "FWCore/Framework/interface/global/EDProducer.h" -#include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/Utilities/interface/InputTag.h" #include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" -#include "Geometry/Records/interface/TrackerTopologyRcd.h" -#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/CommonDetUnit/interface/GeomDet.h" -#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" #include "TrackingTools/MaterialEffects/interface/PropagatorWithMaterial.h" #include "TrackingTools/Records/interface/TrackingComponentsRecord.h" -#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" -#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" #include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h" +#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" #include "TrackingTools/TrajectoryState/interface/TrajectoryStateTransform.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" - /* produces seeds directly from cuda produced tuples */ From 7df797f425ab38904d5560d1b8add45ffe409282 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 27 Nov 2019 15:50:28 +0100 Subject: [PATCH 067/102] Remove last references to CUDA API Wrappers (cms-patatrack#417) --- 
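For reference, the shape of the change applied across these wrapper-removal commits, as a minimal self-contained sketch (the kernel and function here are illustrative, not code from the patch): client interfaces now take a plain cudaStream_t and pass it directly as the launch stream, instead of taking a cuda::stream_t<>& and extracting stream.id() at every launch.

    #include <cuda_runtime.h>

    __global__ void scaleKernel(float* data, float factor, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        data[i] *= factor;
    }

    // Previously the signature would have been scaleAsync(..., cuda::stream_t<>& stream)
    // with a launch on stream.id(); now the native handle is used throughout.
    void scaleAsync(float* d_data, float factor, int n, cudaStream_t stream) {
      int threadsPerBlock = 128;
      int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
      scaleKernel<<<blocks, threadsPerBlock, 0, stream>>>(d_data, factor, n);
    }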
RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 3dc8beb65d9bb..05b399e870f58 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,11 +1,10 @@ #ifndef RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h #define RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h -#include - -#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" + #include "CAConstants.h" namespace Rfit { From b43a1edbe466a2a10daa4c5b1337e015fefd5ab9 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 29 Nov 2019 12:10:12 +0100 Subject: [PATCH 068/102] Apply code checks and code formatting --- .../PixelTriplets/plugins/CAConstants.h | 2 +- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 2 +- .../plugins/gpuPixelDoubletsAlgos.h | 16 ++++++++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 1170550032068..a9028edf98a6b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -34,7 +34,7 @@ namespace CAConstants { constexpr uint32_t maxCellsPerHit() { return 128 / 2; } #endif #else - constexpr uint32_t maxNumberOfDoublets() { return 2*1024 * 1024; } + constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } constexpr uint32_t maxCellsPerHit() { return 8 * 128; } #endif constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index c180ca25bfa61..6888e6725bc39 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -286,7 +286,7 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, GPUCACell::TmpTuple stack; stack.reset(); thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); - assert(stack.size() == 0); + assert(stack.empty()); // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index ae722471e3ab7..8af3bd821c714 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -203,16 +203,20 @@ namespace gpuPixelDoubletsAlgos { auto mo = hh.detectorIndex(oi); if (mo > 2000) continue; // invalid - - if (doZ0Cut && z0cutoff(oi)) continue; + + if (doZ0Cut && z0cutoff(oi)) + continue; auto mop = hh.iphi(oi); uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); - if (idphi > iphicut) continue; + if (idphi > iphicut) + continue; + + if (doClusterCut && zsizeCut(oi)) + continue; + if (doPtCut && ptcut(oi, 
idphi)) + continue; - if (doClusterCut && zsizeCut(oi)) continue; - if (doPtCut && ptcut(oi, idphi)) continue; - auto ind = atomicAdd(nCells, 1); if (ind >= maxNumOfDoublets) { atomicSub(nCells, 1); From f77f909c66c53423314bb29dc90b857f0bc3ad5e Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 30 Nov 2019 07:38:48 +0100 Subject: [PATCH 069/102] Synchronise with CMSSW_11_0_0_pre13 --- Validation/RecoTrack/python/TrackValidation_cff.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index c6bdad88b5e50..f4b24c32dc040 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -923,8 +923,8 @@ def _uniqueFirstLayers(layerList): ## customization for timing from Configuration.Eras.Modifier_phase2_timing_layer_cff import phase2_timing_layer phase2_timing_layer.toModify( generalTracksFromPV, - timesTag = cms.InputTag('trackTimeValueMapProducer:generalTracksConfigurableFlatResolutionModel'), - timeResosTag = cms.InputTag('trackTimeValueMapProducer:generalTracksConfigurableFlatResolutionModelResolution'), + timesTag = cms.InputTag('tofPID:t0'), + timeResosTag = cms.InputTag('tofPID:sigmat0'), nSigmaDtVertex = cms.double(3) ) phase2_timing_layer.toModify( trackValidatorStandalone, label_vertex = cms.untracked.InputTag('offlinePrimaryVertices4D') ) @@ -936,9 +936,3 @@ def _uniqueFirstLayers(layerList): label_vertex = cms.untracked.InputTag('offlinePrimaryVertices4D') ) phase2_timing_layer.toModify( trackValidatorGsfTracks, label_vertex = cms.untracked.InputTag('offlinePrimaryVertices4D') ) - -from Configuration.Eras.Modifier_phase2_timing_layer_tile_cff import phase2_timing_layer_tile -from Configuration.Eras.Modifier_phase2_timing_layer_bar_cff import phase2_timing_layer_bar -(phase2_timing_layer_tile | phase2_timing_layer_bar).toModify( generalTracksFromPV, - timesTag = cms.InputTag('tofPID:t0'), - timeResosTag = cms.InputTag('tofPID:sigmat0') ) From 9ecedf73b8881e8a145f1cad32b26f20a299c3f3 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 2 Dec 2019 14:43:35 +0100 Subject: [PATCH 070/102] Rename exitSansCUDADevices to requireCUDADevices (cms-patatrack#423) --- CUDADataFormats/Track/test/TrajectoryStateSOA_t.h | 4 ++-- RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu | 4 ++-- .../PixelTrackFitting/test/testEigenGPUNoFit.cu | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h index 03c51c39acdfb..1fbe6a73da910 100644 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -51,13 +51,13 @@ __global__ void testTSSoA(TS* pts, int n) { } #ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #endif int main() { #ifdef __CUDACC__ - exitSansCUDADevices(); + requireCUDADevices(); #endif TS ts; diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index 7b02a23c41dca..e1606ab54c9c6 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -4,7 +4,7 @@ #include #include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" #ifdef USE_BL #include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" @@ -329,7 +329,7 @@ void testFit() { } int main(int argc, char* argv[]) { - exitSansCUDADevices(); + requireCUDADevices(); testFit<4>(); testFit<3>(); diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index e16ac3dbbcbc3..7ef3f572603b0 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -4,7 +4,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" #include "test_common.h" using namespace Eigen; @@ -215,7 +215,7 @@ void testEigenvalues() { } int main(int argc, char *argv[]) { - exitSansCUDADevices(); + requireCUDADevices(); testEigenvalues(); testInverse3x3(); From 78dc66e17d779e10ef8bcd98fd8f166fec14e5c4 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 17 Jan 2020 09:10:53 -0600 Subject: [PATCH 071/102] Implement changes from the CUDA framework review (cms-patatrack#429) Rename the cudautils namespace to cms::cuda or cms::cudatest, and drop the CUDA prefix from the symbols defined there. Always record and query the CUDA event, to minimize need for error checking in CUDAScopedContextProduce destructor. Add comments to highlight the pieces in CachingDeviceAllocator that have been changed wrt. cub. Various other updates and clean up: - enable CUDA for compute capability 3.5. - clean up CUDAService, CUDA tests and plugins. - add CUDA existence protections to BuildFiles. - mark thread-safe static variables with CMS_THREAD_SAFE. 
--- CUDADataFormats/Track/src/classes.h | 6 +++--- CUDADataFormats/Track/src/classes_def.xml | 4 ++-- .../Track/test/TrajectoryStateSOA_t.h | 4 ++-- .../plugins/PixelTrackSoAFromCUDA.cc | 14 +++++++------- .../PixelTrackFitting/test/testEigenGPU.cu | 4 ++-- .../PixelTrackFitting/test/testEigenGPUNoFit.cu | 4 ++-- .../PixelTriplets/plugins/BrokenLineFitOnGPU.cu | 6 +++--- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 17 +++++++++-------- .../plugins/CAHitNtupletGeneratorKernels.cc | 8 ++++---- .../plugins/CAHitNtupletGeneratorKernels.cu | 12 ++++++------ .../plugins/CAHitNtupletGeneratorKernelsAlloc.h | 4 ++-- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 8 ++++---- 13 files changed, 47 insertions(+), 46 deletions(-) diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 699e45ede05d4..8a38f939bc68b 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -1,7 +1,7 @@ -#ifndef CUDADataFormats__src_classes_h -#define CUDADataFormats__src_classes_h +#ifndef CUDADataFormats_Track_src_classes_h +#define CUDADataFormats_Track__src_classes_h -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/Common/interface/ArrayShadow.h" diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index a4c2e766582dd..7c73c676ad13d 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,6 +1,6 @@ - - + + diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h index 1fbe6a73da910..c8e92aca2628f 100644 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -51,13 +51,13 @@ __global__ void testTSSoA(TS* pts, int n) { } #ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #endif int main() { #ifdef __CUDACC__ - requireCUDADevices(); + cms::cudatest::requireDevices(); #endif TS ts; diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 3e73cfd7a4e96..c8310bc645db3 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -1,6 +1,6 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" @@ -15,7 +15,7 @@ #include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { public: @@ -30,14 +30,14 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer 
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; + edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cudautils::host::unique_ptr m_soa; + cms::cuda::host::unique_ptr m_soa; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), tokenSOA_(produces()) {} void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -50,8 +50,8 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - CUDAProduct const& inputDataWrapped = iEvent.get(tokenCUDA_); - CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); m_soa = inputData.toHostAsync(ctx.stream()); diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index e1606ab54c9c6..f0b641361aee4 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -4,7 +4,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #ifdef USE_BL #include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" @@ -329,7 +329,7 @@ void testFit() { } int main(int argc, char* argv[]) { - requireCUDADevices(); + cms::cudatest::requireDevices(); testFit<4>(); testFit<3>(); diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index 7ef3f572603b0..6ac1088943305 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -4,7 +4,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "test_common.h" using namespace Eigen; @@ -215,7 +215,7 @@ void testEigenvalues() { } int main(int argc, char *argv[]) { - requireCUDADevices(); + cms::cudatest::requireDevices(); testEigenvalues(); testInverse3x3(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 660cf75e1f460..6fc537237286f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -11,11 +11,11 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( + auto hitsGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = 
cudautils::make_device_unique( + auto hits_geGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( + auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 11b644d466768..31e5070e55e05 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -1,6 +1,6 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" @@ -15,7 +15,7 @@ #include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/RunningAverage.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" @@ -34,8 +34,8 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { bool m_OnGPU; - edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDGetTokenT> tokenHitGPU_; + edm::EDPutTokenT> tokenTrackGPU_; edm::EDGetTokenT tokenHitCPU_; edm::EDPutTokenT tokenTrackCPU_; @@ -45,8 +45,9 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) : m_OnGPU(iConfig.getParameter("onGPU")), gpuAlgo_(iConfig, consumesCollector()) { if (m_OnGPU) { - tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenHitGPU_ = + consumes>(iConfig.getParameter("pixelRecHitSrc")); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); tokenTrackCPU_ = produces(); @@ -68,10 +69,10 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const auto bf = 1. 
/ PixelRecoUtilities::fieldInInvGev(es); if (m_OnGPU) { - edm::Handle> hHits; + edm::Handle> hHits; iEvent.getByToken(tokenHitGPU_, hHits); - CUDAScopedContextProduce ctx{*hHits}; + cms::cuda::ScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 75066458dc170..05106a1bfed41 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -67,7 +67,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * assert(tuples_d && quality_d); // zero tuples - cudautils::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(tuples_d, cudaStream); auto nhits = hh.nHits(); assert(nhits <= pixelGPUConstants::maxNumberOfHits); @@ -108,13 +108,13 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * if (m_params.doStats_) kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); - cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); if (nhits > 1 && m_params.lateFishbone_) { @@ -154,7 +154,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); // remove duplicates (tracks that share a hit) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index aaf882633f17d..7bfee1c8d557f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -21,7 +21,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto *quality_d = (Quality *)(&tracks_d->m_quality); // zero tuples - cudautils::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(tuples_d, cudaStream); auto nhits = hh.nHits(); assert(nhits <= pixelGPUConstants::maxNumberOfHits); @@ -96,7 +96,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; - cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; @@ -108,7 +108,7 @@ void 
CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); kernel_fillMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); @@ -160,7 +160,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cudautils::make_device_unique(std::max(1U, nhits), stream); + device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); assert(device_isOuterHitOfCell_.get()); { int threadsPerBlock = 128; @@ -175,7 +175,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr cudaCheck(cudaGetLastError()); } - device_theCells_ = cudautils::make_device_unique(m_params.maxNumberOfDoublets_, stream); + device_theCells_ = cms::cuda::make_device_unique(m_params.maxNumberOfDoublets_, stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -252,7 +252,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index b91911c66924e..592aee9770ae4 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -46,6 +46,6 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { } else { *device_nCells_ = 0; } - cudautils::launchZero(device_tupleMultiplicity_.get(), stream); - cudautils::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... + cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... 
} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 2e875caba7130..4a8240706efc2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -162,7 +162,7 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, float bfield, cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cudautils::make_device_unique(stream)); + PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); auto* soa = tracks.get(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index cb5d32b47aea3..1077bb7736667 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -11,14 +11,14 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( + auto hitsGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( + auto hits_geGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( + auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); auto circle_fit_resultsGPU_holder = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); + cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { From 0491687107eeb7f5d1bdf9c98dee2ba4b550d637 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 27 Jan 2020 12:17:14 +0100 Subject: [PATCH 072/102] Synchronise with CMSSW_11_1_0_pre2 Major changes: - restructure the RecoPixelVertexing/PixelVertexFinding package; - update the interface of PixelCPEFast. 
--- .../RecoTrack/python/TrackValidation_cff.py | 81 +++++++++++++++++-- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index f4b24c32dc040..0765b76fa6e45 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -71,7 +71,10 @@ def _addSelectorsByAlgo(algos, modDict): continue modName = _algoToSelector(algo) if modName not in modDict: - mod = cutsRecoTracks_cfi.cutsRecoTracks.clone(algorithm=[algo]) + mod = cutsRecoTracks_cfi.cutsRecoTracks.clone( +# src = [src], + algorithm=[algo] + ) modDict[modName] = mod else: mod = modDict[modName] @@ -404,10 +407,38 @@ def _getMVASelectors(postfix): doResolutionPlotsForLabels = ["disabled"], # resolutions are same as in trackValidator, no need to repeat here ) +## Select signal TrackingParticles, and do the corresponding associations +trackingParticlesEtaGreater2p7 = _trackingParticleRefSelector.clone( + signalOnly = cms.bool(False), + tip = 1e5, + lip = 1e5, + minRapidity = -2.7, + maxRapidity = 2.7, + invertRapidityCut = cms.bool(True), + ptMin = 0, +) + + +# select tracks with |eta| > 2.7 +generalTracksEtaGreater2p7 = cutsRecoTracks_cfi.cutsRecoTracks.clone( + minRapidity = cms.double(-2.7), + maxRapidity = cms.double( 2.7), + invertRapidityCut = cms.bool(True) +) + +_taskForEachEra(_addSelectorsBySrc, modDict=globals(), + args=[["_generalTracksHp"]], + plainArgs=["EtaGreater2p7", "generalTracksEtaGreater2p7"], + names="_selectorsEtaGreater2p7", task="_tracksValidationSelectorsEtaGreater2p7", + modifyTask=lambda task: task.add(generalTracksEtaGreater2p7)) + # for high-eta (phase2 : |eta| > 2.7) trackValidatorTPEtaGreater2p7 = trackValidator.clone( dirName = "Tracking/TrackTPEtaGreater2p7/", - label = [x for x in trackValidator.label.value() if ("Pt09" not in x) and ("BtvLike" not in x) and ("AK4PFJets" not in x)], + label_tp_effic = "trackingParticlesEtaGreater2p7", + label_tp_fake = "trackingParticlesEtaGreater2p7", + label_tp_effic_refvector = True, + label_tp_fake_refvector = True, dodEdxPlots = False, # doPVAssociationPlots = False, minRapidityTP = -2.7, @@ -426,11 +457,16 @@ def _getMVASelectors(postfix): # nintEta = 90, # minPt = 0.01, ), - doSimPlots = False, # same as in trackValidator, no need to repeat here - doRecoTrackPlots = False, # fake rates are same as in trackValidator, no need to repeat here + doSimPlots = True, # ####same as in trackValidator, no need to repeat here + doRecoTrackPlots = True, # ####fake rates are same as in trackValidator, no need to repeat here doResolutionPlotsForLabels = ["disabled"] # resolutions are same as in trackValidator, no need to repeat here ) - +for _eraName, _postfix, _era in _relevantEras: + _setForEra(trackValidatorTPEtaGreater2p7, _eraName, _era, + label = ["generalTracksEtaGreater2p7"] + locals()["_selectorsEtaGreater2p7"+_postfix] + + locals()["_selectorsByAlgo"+_postfix] + locals()["_selectorsByAlgoHp"+_postfix], + doResolutionPlotsForLabels = ["generalTracksEtaGreater2p7"] + locals()["_selectorsEtaGreater2p7"+_postfix] + ) # For efficiency of signal TPs vs. signal tracks, and fake rate of # signal tracks vs. 
signal TPs @@ -591,6 +627,29 @@ def _uniqueFirstLayers(layerList): ak4JetTracksAssociatorExplicitAll, cutsRecoTracksAK4PFJets ) +phase2_tracker.toModify(tracksValidationSelectors, lambda x: x.add(generalTracksEtaGreater2p7)) +phase2_tracker.toModify(tracksValidationSelectors, lambda x: x.add(cutsRecoTracksEtaGreater2p7Hp)) + +# Validation iterative steps +_taskForEachEra(_addSelectorsByAlgo, modDict=globals(), + args=["_algos"], + names="_selectorsByAlgo", task="_tracksEtaGreater2p7ValidationSelectorsByAlgo" + ) + +# high purity +_taskForEachEra(_addSelectorsByHp, modDict=globals(), + args=["_algos"], + names="_selectorsByAlgoHp", task="_tracksEtaGreater2p7ValidationSelectorsByAlgoHp" + ) + +for _eraName, _postfix, _era in _relevantEras: + selectors = locals()["_selectorsByAlgoHp"+_postfix] + locals()["_generalTracksHp"+_postfix] = selectors[0] + locals()["_selectorsByAlgoHp"+_postfix] = selectors[1:] + +phase2_tracker.toModify(tracksValidationSelectors, lambda x: x.add(tracksEtaGreater2p7ValidationSelectorsByAlgo)) +phase2_tracker.toModify(tracksValidationSelectors, lambda x: x.add(tracksEtaGreater2p7ValidationSelectorsByAlgoHp)) + tracksValidationTruth = cms.Task( tpClusterProducer, tpClusterProducerPreSplitting, @@ -628,6 +687,8 @@ def _uniqueFirstLayers(layerList): trackingParticlesConversion, ])) + + tracksValidation = cms.Sequence( trackValidator + trackValidatorTPPtLess09 + @@ -643,6 +704,10 @@ def _uniqueFirstLayers(layerList): from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker #tracksValidationPhase2 = cms.Sequence(tracksValidation+trackValidatorTPEtaGreater2p7) # it does not work +tracksPreValidationPhase2 = tracksPreValidation.copy() +tracksPreValidationPhase2.add(trackingParticlesEtaGreater2p7) +phase2_tracker.toReplaceWith(tracksPreValidation, tracksPreValidationPhase2) + tracksValidationPhase2 = tracksValidation.copy() tracksValidationPhase2+=trackValidatorTPEtaGreater2p7 phase2_tracker.toReplaceWith(tracksValidation, tracksValidationPhase2) @@ -746,6 +811,11 @@ def _uniqueFirstLayers(layerList): trackValidatorGsfTracksStandalone + trackValidatorBHadronStandalone ) + +_trackValidatorsBasePhase2 = _trackValidatorsBase.copy() +_trackValidatorsBasePhase2+=trackValidatorTPEtaGreater2p7 +phase2_tracker.toReplaceWith(_trackValidatorsBase, _trackValidatorsBasePhase2) + trackValidatorsStandalone = _trackValidatorsBase.copy() fastSim.toModify(trackValidatorsStandalone, lambda x: x.remove(trackValidatorConversionStandalone) ) @@ -820,7 +890,6 @@ def _uniqueFirstLayers(layerList): trackValidatorBHadronTrackingOnly ])) - tracksValidationTrackingOnly = cms.Sequence( trackValidatorsTrackingOnly, tracksPreValidationTrackingOnly, From 712dc8be2568a82860552a49a2fa6387acc77c37 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 11 Feb 2020 10:29:33 +0100 Subject: [PATCH 073/102] Apply feedback from upstream PR (cms-patatrack#441) Fix include guard in CUDADataFormats/Track/src/classes.h . Remove unused variables in DataFormats/Math/test/CholeskyInvert_t.cpp . 
--- CUDADataFormats/Track/src/classes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 8a38f939bc68b..49c71bf03b90a 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -1,5 +1,5 @@ #ifndef CUDADataFormats_Track_src_classes_h -#define CUDADataFormats_Track__src_classes_h +#define CUDADataFormats_Track_src_classes_h #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" @@ -7,4 +7,4 @@ #include "CUDADataFormats/Common/interface/ArrayShadow.h" #include "DataFormats/Common/interface/Wrapper.h" -#endif +#endif // CUDADataFormats_Track_src_classes_h From 3ecdd549666114a795eed93ac434ce762b13a623 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 5 Mar 2020 11:00:41 +0100 Subject: [PATCH 074/102] Synchronise with CMSSW_11_1_0_pre4 --- RecoPixelVertexing/PixelTrackFitting/BuildFile.xml | 3 ++- Validation/RecoTrack/python/PostProcessorTracker_cfi.py | 2 ++ Validation/RecoTrack/python/plotting/trackingPlots.py | 9 +++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml index 3300d67809f33..c77fc5cabade4 100644 --- a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml @@ -1,5 +1,6 @@ - + + diff --git a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py index 91adbee0f9bba..6ee5e4d41a8f0 100644 --- a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py +++ b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py @@ -174,9 +174,11 @@ def _addNoFlow(module): "cotThetares_vs_pt '#sigma(cot(#theta)) vs p_{T}' cotThetares_vs_pt", "h_dxypulleta 'd_{xy} Pull vs #eta' dxypull_vs_eta", "dxyres_vs_eta '#sigma(d_{xy}) vs #eta' dxyres_vs_eta", + "dxyres_vs_phi '#sigma(d_{xy}) vs #phi' dxyres_vs_phi", "dxyres_vs_pt '#sigma(d_{xy}) vs p_{T}' dxyres_vs_pt", "h_dzpulleta 'd_{z} Pull vs #eta' dzpull_vs_eta", "dzres_vs_eta '#sigma(d_{z}) vs #eta' dzres_vs_eta", + "dzres_vs_phi '#sigma(d_{z}) vs #phi' dzres_vs_phi", "dzres_vs_pt '#sigma(d_{z}) vs p_{T}' dzres_vs_pt", "etares_vs_eta '#sigma(#eta) vs #eta' etares_vs_eta", "h_phipulleta '#phi Pull vs #eta' phipull_vs_eta", diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index 0a2ea8a35ee1b..76fcf73623f03 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -402,6 +402,13 @@ def _makeMVAPlots(num, hp=False): Plot("dzres_vs_pt_Sigma", ytitle="#sigma(#delta d_{z}) (cm)", **_common), Plot("ptres_vs_pt_Sigma", ytitle="#sigma(#delta p_{T}/p_{T})", **_common), ]) +_common = {"title": "", "ylog": True, "xtitle": "TP #Phi (PCA to beamline)", "ymin": _minMaxResol, "ymax": _minMaxResol} +_resolutionsPhi = PlotGroup("resolutionsPhi", [ + Plot("dxyres_vs_phi_Sigma", ytitle="#sigma(#delta d_{xy}) (cm)", **_common), + Plot("dzres_vs_phi_Sigma", ytitle="#sigma(#delta d_{z}) (cm)", **_common), + Plot("phires_vs_phi_Sigma", ytitle="#sigma(#delta #phi) (rad)", **_common), + Plot("ptres_vs_phi_Sigma", ytitle="#sigma(#delta p_{T}/p_{T})", **_common), +]) ## Extended set of plots _extDistPtEtaPhi = PlotGroup("distPtEtaPhi", @@ -1225,6 +1232,7 @@ def _trackingFolders(lastDirName="Track"): _hitsAndPt, 
_pulls, _resolutionsEta, + _resolutionsPhi, _resolutionsPt, _tuning, ] @@ -1247,6 +1255,7 @@ def _trackingFolders(lastDirName="Track"): _buildingExtendedPlots = [ _pulls, _resolutionsEta, + _resolutionsPhi, _resolutionsPt, _tuning, ] From 23bc9090336604efe306ce3d06aec5e444d078f2 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 25 Mar 2020 00:28:04 +0100 Subject: [PATCH 075/102] Integrate the comments from the upstream PRs (cms-patatrack#442) Clean up the Patatrack code base following the comments received during the integration into the upstream release. Currently tracks the changes introduced due to - cms-sw#29109: Patatrack integration - trivial changes (1/N) - cms-sw#29110: Patatrack integration - common tools (2/N) List of changes: * Remove unused files * Fix compilation warnings * Fix AtomicPairCounter unit test * Rename the cudaCompat namespace to cms::cudacompat * Remove extra semicolon * Move SimpleVector and VecArray to the cms::cuda namespace * Add missing dependency * Move HistoContainer, AtomicPairCounter, prefixScan and radixSort to the cms::cuda namespace * Remove rule exception for HeterogeneousCore * Fix code rule violations: - replace using namespace cms::cuda in test/OneToManyAssoc_t.h . - add an exception for cudaCompat.h: cudaCompat relies on defining equivalent symbols to the CUDA intrinsics in the cms::cudacompat namespace, and pulling them in the global namespace when compiling device code without CUDA. * Protect the headers to compile only with a CUDA compiler --- .../Track/interface/PixelTrackHeterogeneous.h | 2 +- .../PixelTrackFitting/interface/BrokenLine.h | 4 ++-- .../PixelTrackFitting/interface/RiemannFit.h | 6 ++--- .../PixelTriplets/plugins/CAConstants.h | 24 +++++++++---------- .../plugins/CAHitNtupletGeneratorKernels.h | 10 ++++---- .../CAHitNtupletGeneratorKernelsAlloc.h | 12 +++++----- .../CAHitNtupletGeneratorKernelsImpl.h | 8 +++---- .../plugins/CAHitNtupletGeneratorOnGPU.h | 2 +- .../PixelTriplets/plugins/GPUCACell.h | 8 +++---- .../PixelTriplets/plugins/gpuFishbone.h | 2 +- .../plugins/gpuPixelDoubletsAlgos.h | 2 +- 11 files changed, 40 insertions(+), 40 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h index bd4ec059f6e9c..d462be2c5dd7b 100644 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -17,7 +17,7 @@ class TrackSoAT { using Quality = trackQuality::Quality; using hindex_type = uint16_t; - using HitContainer = OneToManyAssoc; + using HitContainer = cms::cuda::OneToManyAssoc; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... 
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h index 01d59e7af2100..5b55e5e804167 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h @@ -322,7 +322,7 @@ namespace BrokenLine { std::cout << "CU5\n" << C_U << std::endl; #endif MatrixNplusONEd I; - choleskyInversion::invert(C_U, I); + math::cholesky::invert(C_U, I); // MatrixNplusONEd I = C_U.inverse(); #ifdef CPP_DUMP std::cout << "I5\n" << I << std::endl; @@ -443,7 +443,7 @@ namespace BrokenLine { std::cout << "CU4\n" << MatrixC_u(w, S, VarBeta) << std::endl; #endif MatrixNd I; - choleskyInversion::invert(MatrixC_u(w, S, VarBeta), I); + math::cholesky::invert(MatrixC_u(w, S, VarBeta), I); // MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); #ifdef CPP_DUMP std::cout << "I4\n" << I << std::endl; diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index f69b425ef884a..4573205f9c11e 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -486,7 +486,7 @@ namespace Rfit { printIt(&V, "circle_fit - V:"); cov_rad += scatter_cov_rad; printIt(&cov_rad, "circle_fit - cov_rad:"); - choleskyInversion::invert(cov_rad, G); + math::cholesky::invert(cov_rad, G); // G = cov_rad.inverse(); renorm = G.sum(); G *= 1. / renorm; @@ -889,11 +889,11 @@ namespace Rfit { // Build A^T V-1 A, where V-1 is the covariance of only the Y components. MatrixNd Vy_inv; - choleskyInversion::invert(cov_with_ms, Vy_inv); + math::cholesky::invert(cov_with_ms, Vy_inv); // MatrixNd Vy_inv = cov_with_ms.inverse(); Eigen::Matrix Cov_params = A * Vy_inv * A.transpose(); // Compute the Covariance Matrix of the fit parameters - choleskyInversion::invert(Cov_params, Cov_params); + math::cholesky::invert(Cov_params, Cov_params); // Now Compute the Parameters in the form [2,1] // The first component is q. 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index a9028edf98a6b..fce0c23596137 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -6,8 +6,8 @@ #include #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" // #define ONLY_PHICUT @@ -48,21 +48,21 @@ namespace CAConstants { using tindex_type = uint16_t; // for tuples #ifndef ONLY_PHICUT - using CellNeighbors = GPU::VecArray; - using CellTracks = GPU::VecArray; + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #else - using CellNeighbors = GPU::VecArray; - using CellTracks = GPU::VecArray; + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #endif - using CellNeighborsVector = GPU::SimpleVector; - using CellTracksVector = GPU::SimpleVector; + using CellNeighborsVector = cms::cuda::SimpleVector; + using CellTracksVector = cms::cuda::SimpleVector; - using OuterHitOfCell = GPU::VecArray; - using TuplesContainer = OneToManyAssoc; + using OuterHitOfCell = cms::cuda::VecArray; + using TuplesContainer = cms::cuda::OneToManyAssoc; using HitToTuple = - OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = OneToManyAssoc; + cms::cuda::OneToManyAssoc; // 3.5 should be enough + using TupleMultiplicity = cms::cuda::OneToManyAssoc; } // namespace CAConstants diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 0140958aa9ef2..5382d0f6e88d6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -189,20 +189,20 @@ class CAHitNtupletGeneratorKernels { uint32_t* device_nCells_ = nullptr; unique_ptr device_hitToTuple_; - AtomicPairCounter* device_hitToTuple_apc_ = nullptr; + cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; - AtomicPairCounter* device_hitTuple_apc_ = nullptr; + cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr; unique_ptr device_tupleMultiplicity_; uint8_t* device_tmws_; - unique_ptr device_storage_; + unique_ptr device_storage_; // params Params const& m_params; }; -using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; -using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index 592aee9770ae4..881b30ba46752 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -24,13 +24,13 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { 
device_tupleMultiplicity_ = Traits::template make_unique(stream); auto storageSize = - 3 + (std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) + sizeof(AtomicPairCounter::c_type)) / - sizeof(AtomicPairCounter::c_type); + 3 + (std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) + sizeof(cms::cuda::AtomicPairCounter::c_type)) / + sizeof(cms::cuda::AtomicPairCounter::c_type); - device_storage_ = Traits::template make_unique(storageSize, stream); + device_storage_ = Traits::template make_unique(storageSize, stream); - device_hitTuple_apc_ = (AtomicPairCounter*)device_storage_.get(); - device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get() + 1; + device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); + device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); device_tmws_ = (uint8_t*)(device_storage_.get() + 3); @@ -41,7 +41,7 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { #ifndef __CUDACC__ constexpr #endif - (std::is_same::value) { + (std::is_same::value) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 6888e6725bc39..18a2138247946 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -33,7 +33,7 @@ using HitContainer = pixelTrack::HitContainer; __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, CAConstants::TupleMultiplicity *tupleMultiplicity, - AtomicPairCounter *apc, + cms::cuda::AtomicPairCounter *apc, GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, CellNeighborsVector const *cellNeighbors, @@ -190,8 +190,8 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, } } -__global__ void kernel_connect(AtomicPairCounter *apc1, - AtomicPairCounter *apc2, // just to zero them, +__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, + cms::cuda::AtomicPairCounter *apc2, // just to zero them, GPUCACell::Hits const *__restrict__ hhp, GPUCACell *cells, uint32_t const *__restrict__ nCells, @@ -268,7 +268,7 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, uint32_t const *nCells, CellTracksVector *cellTracks, HitContainer *foundNtuplets, - AtomicPairCounter *apc, + cms::cuda::AtomicPairCounter *apc, Quality *__restrict__ quality, unsigned int minHitsPerNtuplet) { // recursive: not obvious to widen diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index de2e1913dd18b..e920ebf7a803d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -8,7 +8,7 @@ #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Utilities/interface/EDGetToken.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" #include "CAHitNtupletGeneratorKernels.h" #include "HelixFitOnGPU.h" diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 6527c9f2bfbea..6e1c2a587e212 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -10,8 +10,8 @@ #include #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" @@ -31,7 +31,7 @@ class GPUCACell { using Hits = TrackingRecHit2DSOAView; using hindex_type = Hits::hindex_type; - using TmpTuple = GPU::VecArray; + using TmpTuple = cms::cuda::VecArray; using HitContainer = pixelTrack::HitContainer; using Quality = trackQuality::Quality; @@ -246,7 +246,7 @@ class GPUCACell { GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, - AtomicPairCounter& apc, + cms::cuda::AtomicPairCounter& apc, Quality* __restrict__ quality, TmpTuple& tmpNtuplet, const unsigned int minHitsPerNtuplet, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index f761bc3d811f1..336dbbc98521f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -9,7 +9,7 @@ #include "DataFormats/Math/interface/approx_atan2.h" #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "GPUCACell.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 8af3bd821c714..c750ad01fd487 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -9,7 +9,7 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "DataFormats/Math/interface/approx_atan2.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "CAConstants.h" From cd2faf13e2c2e304251fa697e364b2598d740f4e Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 26 Mar 2020 19:09:30 +0100 Subject: [PATCH 076/102] Synchronise with CMSSW_11_1_0_pre5 --- RecoPixelVertexing/PixelTrackFitting/BuildFile.xml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml index c77fc5cabade4..5b923e24f0b26 100644 --- a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml @@ -2,6 +2,7 @@ + @@ -11,24 +12,21 @@ - - + - + - + - - From 668deea24c910aabcbe5347e29038663cfaa11ff Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 6 Apr 2020 15:57:55 +0200 Subject: [PATCH 077/102] Backport 
remove unneeded dependencies in Reco subsystems (#29295) --- .../PixelTrackFitting/test/BuildFile.xml | 22 ++++++++----------- .../PixelTriplets/plugins/BuildFile.xml | 1 - 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index 68349ca4f45a4..9cd815df74538 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -1,17 +1,13 @@ + + - + - - - - - - - - + + @@ -76,10 +72,10 @@ - - - - + + + + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index cacfe4662b51f..d704894631eca 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -6,7 +6,6 @@ - From 71563e8239d4d5233dba7cb9c40bdea191bdbfd1 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 7 Apr 2020 11:24:15 +0200 Subject: [PATCH 078/102] Fix use of namespaces (cms-patatrack#446) Clean up instances of using namespace ... from header files, following the comments from the upstream integration. --- .../PixelTrackFitting/interface/BrokenLine.h | 220 +++++++++--------- .../plugins/BrokenLineFitOnGPU.h | 2 - .../plugins/CAHitNtupletGeneratorKernels.cc | 6 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 4 +- .../CAHitNtupletGeneratorKernelsImpl.h | 10 +- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 2 - .../PixelTriplets/plugins/gpuPixelDoublets.h | 2 - .../plugins/gpuPixelDoubletsAlgos.h | 4 +- 8 files changed, 124 insertions(+), 126 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h index 5b55e5e804167..be1b67be89c35 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h @@ -7,8 +7,6 @@ namespace BrokenLine { - using namespace Rfit; - //!< Karimäki's parameters: (phi, d, k=1/R) /*!< covariance matrix: \n |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n @@ -22,12 +20,13 @@ namespace BrokenLine { */ template struct PreparedBrokenLineData { - int q; //!< particle charge - Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin - VectorNd s; //!< total distance traveled in the transverse plane starting from the pre-fitted closest approach - VectorNd S; //!< total distance traveled (three-dimensional) - VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane - VectorNd VarBeta; //!< kink angles in the SZ plane + int q; //!< particle charge + Rfit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + Rfit::VectorNd s; //!< total distance traveled in the transverse plane + // starting from the pre-fitted closest approach + Rfit::VectorNd S; //!< total distance traveled (three-dimensional) + Rfit::VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane + Rfit::VectorNd VarBeta; //!< kink angles in the SZ plane }; /*! @@ -55,9 +54,9 @@ namespace BrokenLine { //XX_0*=1; constexpr double geometry_factor = 0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned - constexpr double fact = geometry_factor * sqr(13.6 / 1000.); - return fact / (pt2 * (1. + sqr(slope))) * (std::abs(length) * XXI_0) * - sqr(1. 
+ 0.038 * log(std::abs(length) * XXI_0)); + constexpr double fact = geometry_factor * Rfit::sqr(13.6 / 1000.); + return fact / (pt2 * (1. + Rfit::sqr(slope))) * (std::abs(length) * XXI_0) * + Rfit::sqr(1. + 0.038 * log(std::abs(length) * XXI_0)); } /*! @@ -67,9 +66,9 @@ namespace BrokenLine { \return 2D rotation matrix. */ - __host__ __device__ inline Matrix2d RotationMatrix(double slope) { - Matrix2d Rot; - Rot(0, 0) = 1. / sqrt(1. + sqr(slope)); + __host__ __device__ inline Rfit::Matrix2d RotationMatrix(double slope) { + Rfit::Matrix2d Rot; + Rot(0, 0) = 1. / sqrt(1. + Rfit::sqr(slope)); Rot(0, 1) = slope * Rot(0, 0); Rot(1, 0) = -Rot(0, 1); Rot(1, 1) = Rot(0, 0); @@ -82,31 +81,34 @@ namespace BrokenLine { \param circle circle fit in the old coordinate system. \param x0 x coordinate of the translation vector. \param y0 y coordinate of the translation vector. - \param Jacob passed by reference in order to save stack. + \param jacobian passed by reference in order to save stack. */ - __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, double x0, double y0, Matrix3d& Jacob) { + __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, + double x0, + double y0, + Rfit::Matrix3d& jacobian) { double A, U, BB, C, DO, DP, uu, xi, v, mu, lambda, zeta; DP = x0 * cos(circle.par(0)) + y0 * sin(circle.par(0)); DO = x0 * sin(circle.par(0)) - y0 * cos(circle.par(0)) + circle.par(1); uu = 1 + circle.par(2) * circle.par(1); C = -circle.par(2) * y0 + uu * cos(circle.par(0)); BB = circle.par(2) * x0 + uu * sin(circle.par(0)); - A = 2. * DO + circle.par(2) * (sqr(DO) + sqr(DP)); + A = 2. * DO + circle.par(2) * (Rfit::sqr(DO) + Rfit::sqr(DP)); U = sqrt(1. + circle.par(2) * A); - xi = 1. / (sqr(BB) + sqr(C)); + xi = 1. / (Rfit::sqr(BB) + Rfit::sqr(C)); v = 1. + circle.par(2) * DO; - lambda = (0.5 * A) / (U * sqr(1. + U)); + lambda = (0.5 * A) / (U * Rfit::sqr(1. + U)); mu = 1. / (U * (1. + U)) + circle.par(2) * lambda; - zeta = sqr(DO) + sqr(DP); + zeta = Rfit::sqr(DO) + Rfit::sqr(DP); - Jacob << xi * uu * v, -xi * sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, + jacobian << xi * uu * v, -xi * Rfit::sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, mu * zeta - lambda * A, 0, 0, 1.; circle.par(0) = atan2(BB, C); circle.par(1) = A / (1 + U); // circle.par(2)=circle.par(2); - circle.cov = Jacob * circle.cov * Jacob.transpose(); + circle.cov = jacobian * circle.cov * jacobian.transpose(); } /*! @@ -125,28 +127,28 @@ namespace BrokenLine { PreparedBrokenLineData& results) { constexpr auto n = N; u_int i; - Vector2d d; - Vector2d e; + Rfit::Vector2d d; + Rfit::Vector2d e; d = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); e = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); - results.q = cross2D(d, e) > 0 ? -1 : 1; + results.q = Rfit::cross2D(d, e) > 0 ? 
-1 : 1; const double slope = -results.q / fast_fit(3); - Matrix2d R = RotationMatrix(slope); + Rfit::Matrix2d R = RotationMatrix(slope); // calculate radii and s - results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * MatrixXd::Constant(1, n, 1); + results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * Rfit::MatrixXd::Constant(1, n, 1); e = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); for (i = 0; i < n; i++) { d = results.radii.block(0, i, 2, 1); - results.s(i) = results.q * fast_fit(2) * atan2(cross2D(d, e), d.dot(e)); // calculates the arc length + results.s(i) = results.q * fast_fit(2) * atan2(Rfit::cross2D(d, e), d.dot(e)); // calculates the arc length } - VectorNd z = hits.block(2, 0, 1, n).transpose(); + Rfit::VectorNd z = hits.block(2, 0, 1, n).transpose(); //calculate S and Z - Matrix2xNd pointsSZ = Matrix2xNd::Zero(); + Rfit::Matrix2xNd pointsSZ = Rfit::Matrix2xNd::Zero(); for (i = 0; i < n; i++) { pointsSZ(0, i) = results.s(i); pointsSZ(1, i) = z(i); @@ -173,21 +175,21 @@ namespace BrokenLine { \return the n-by-n matrix of the linear system */ template - __host__ __device__ inline MatrixNd MatrixC_u(const VectorNd& w, - const VectorNd& S, - const VectorNd& VarBeta) { + __host__ __device__ inline Rfit::MatrixNd MatrixC_u(const Rfit::VectorNd& w, + const Rfit::VectorNd& S, + const Rfit::VectorNd& VarBeta) { constexpr u_int n = N; u_int i; - MatrixNd C_U = MatrixNd::Zero(); + Rfit::MatrixNd C_U = Rfit::MatrixNd::Zero(); for (i = 0; i < n; i++) { C_U(i, i) = w(i); if (i > 1) - C_U(i, i) += 1. / (VarBeta(i - 1) * sqr(S(i) - S(i - 1))); + C_U(i, i) += 1. / (VarBeta(i - 1) * Rfit::sqr(S(i) - S(i - 1))); if (i > 0 && i < n - 1) - C_U(i, i) += (1. / VarBeta(i)) * sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + C_U(i, i) += (1. / VarBeta(i)) * Rfit::sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); if (i < n - 2) - C_U(i, i) += 1. / (VarBeta(i + 1) * sqr(S(i + 1) - S(i))); + C_U(i, i) += 1. / (VarBeta(i + 1) * Rfit::sqr(S(i + 1) - S(i))); if (i > 0 && i < n - 1) C_U(i, i + 1) = @@ -219,22 +221,22 @@ namespace BrokenLine { constexpr uint32_t N = M3xN::ColsAtCompileTime; constexpr auto n = N; // get the number of hits - const Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); - const Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); - const Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); + const Rfit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Rfit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); + const Rfit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); - auto tmp = 0.5 / cross2D(c, a); + auto tmp = 0.5 / Rfit::cross2D(c, a); result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; // check Wikipedia for these formulas - result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(cross2D(b, a))); + result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. 
* std::abs(Rfit::cross2D(b, a))); // Using Math Olympiad's formula R=abc/(4A) - const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); - const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + const Rfit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const Rfit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - result(3) = result(2) * atan2(cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); + result(3) = result(2) * atan2(Rfit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); // ds/dz slope between last and first point } @@ -272,15 +274,15 @@ namespace BrokenLine { auto& Z = data.Z; auto& VarBeta = data.VarBeta; const double slope = -circle_results.q / fast_fit(3); - VarBeta *= 1. + sqr(slope); // the kink angles are projected! + VarBeta *= 1. + Rfit::sqr(slope); // the kink angles are projected! for (i = 0; i < n; i++) { Z(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); } - Matrix2d V; // covariance matrix - VectorNd w; // weights - Matrix2d RR; // rotation matrix point by point + Rfit::Matrix2d V; // covariance matrix + Rfit::VectorNd w; // weights + Rfit::Matrix2d RR; // rotation matrix point by point //double Slope; // slope of the circle point by point for (i = 0; i < n; i++) { V(0, 0) = hits_ge.col(i)[0]; // x errors @@ -291,13 +293,13 @@ namespace BrokenLine { w(i) = 1. / ((RR * V * RR.transpose())(1, 1)); // compute the orthogonal weight point by point } - VectorNplusONEd r_u; + Rfit::VectorNplusONEd r_u; r_u(n) = 0; for (i = 0; i < n; i++) { r_u(i) = w(i) * Z(i); } - MatrixNplusONEd C_U; + Rfit::MatrixNplusONEd C_U; C_U.block(0, 0, n, n) = MatrixC_u(w, s, VarBeta); C_U(n, n) = 0; //add the border to the C_u matrix @@ -315,69 +317,69 @@ namespace BrokenLine { } C_U(n, i) = C_U(i, n); if (i > 0 && i < n - 1) - C_U(n, n) += sqr(s(i + 1) - s(i - 1)) / (4. * VarBeta(i)); + C_U(n, n) += Rfit::sqr(s(i + 1) - s(i - 1)) / (4. * VarBeta(i)); } #ifdef CPP_DUMP std::cout << "CU5\n" << C_U << std::endl; #endif - MatrixNplusONEd I; + Rfit::MatrixNplusONEd I; math::cholesky::invert(C_U, I); - // MatrixNplusONEd I = C_U.inverse(); + // Rfit::MatrixNplusONEd I = C_U.inverse(); #ifdef CPP_DUMP std::cout << "I5\n" << I << std::endl; #endif - VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system + Rfit::VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); - Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); - Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); + Rfit::Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); + Rfit::Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); circle_results.par << atan2((e - d)(1), (e - d)(0)), - -circle_results.q * (fast_fit(2) - sqrt(sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), + -circle_results.q * (fast_fit(2) - sqrt(Rfit::sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), circle_results.q * (1. 
/ fast_fit(2) + u(n)); assert(circle_results.q * circle_results.par(1) <= 0); - Vector2d eMinusd = e - d; + Rfit::Vector2d eMinusd = e - d; double tmp1 = eMinusd.squaredNorm(); - Matrix3d Jacob; - Jacob << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, + Rfit::Matrix3d jacobian; + jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) / tmp1, 0, (circle_results.q / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / - sqrt(sqr(2 * fast_fit(2)) - tmp1), + sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), (circle_results.q / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / - sqrt(sqr(2 * fast_fit(2)) - tmp1), + sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), 0, 0, 0, circle_results.q; circle_results.cov << I(0, 0), I(0, 1), I(0, n), I(1, 0), I(1, 1), I(1, n), I(n, 0), I(n, 1), I(n, n); - circle_results.cov = Jacob * circle_results.cov * Jacob.transpose(); + circle_results.cov = jacobian * circle_results.cov * jacobian.transpose(); //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... - TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), Jacob); - circle_results.cov(0, 0) += (1 + sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); + TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), jacobian); + circle_results.cov(0, 0) += (1 + Rfit::sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); //...And translate back to the original system - TranslateKarimaki(circle_results, d(0), d(1), Jacob); + TranslateKarimaki(circle_results, d(0), d(1), jacobian); // compute chi2 circle_results.chi2 = 0; for (i = 0; i < n; i++) { - circle_results.chi2 += w(i) * sqr(Z(i) - u(i)); + circle_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); if (i > 0 && i < n - 1) - circle_results.chi2 += - sqr(u(i - 1) / (s(i) - s(i - 1)) - u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + - u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / - VarBeta(i); + circle_results.chi2 += Rfit::sqr(u(i - 1) / (s(i) - s(i - 1)) - + u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + + u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / + VarBeta(i); } // assert(circle_results.chi2>=0); @@ -405,7 +407,7 @@ namespace BrokenLine { const V4& fast_fit, const double B, const PreparedBrokenLineData& data, - line_fit& line_results) { + Rfit::line_fit& line_results) { constexpr u_int n = N; u_int i; @@ -415,11 +417,11 @@ namespace BrokenLine { const auto& VarBeta = data.VarBeta; const double slope = -data.q / fast_fit(3); - Matrix2d R = RotationMatrix(slope); + Rfit::Matrix2d R = RotationMatrix(slope); - Matrix3d V = Matrix3d::Zero(); // covariance matrix XYZ - Matrix2x3d JacobXYZtosZ = Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) - VectorNd w = VectorNd::Zero(); + Rfit::Matrix3d V = Rfit::Matrix3d::Zero(); // covariance matrix XYZ + Rfit::Matrix2x3d JacobXYZtosZ = Rfit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + Rfit::VectorNd w = Rfit::VectorNd::Zero(); for (i = 0; i < n; i++) { V(0, 0) = hits_ge.col(i)[0]; // x errors V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy @@ -435,57 +437,57 @@ namespace BrokenLine { 1, 1)); // compute the orthogonal weight point by point } - VectorNd r_u; + Rfit::VectorNd r_u; for (i = 0; i < n; i++) { r_u(i) = w(i) * Z(i); } #ifdef CPP_DUMP std::cout << "CU4\n" << MatrixC_u(w, 
S, VarBeta) << std::endl; #endif - MatrixNd I; + Rfit::MatrixNd I; math::cholesky::invert(MatrixC_u(w, S, VarBeta), I); - // MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); + // Rfit::MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); #ifdef CPP_DUMP std::cout << "I4\n" << I << std::endl; #endif - VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system + Rfit::VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system // line parameters in the system in which the first hit is the origin and with axis along SZ line_results.par << (u(1) - u(0)) / (S(1) - S(0)), u(0); auto idiff = 1. / (S(1) - S(0)); - line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * sqr(idiff) + + line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * Rfit::sqr(idiff) + MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope), (I(0, 1) - I(0, 0)) * idiff, (I(0, 1) - I(0, 0)) * idiff, I(0, 0); // translate to the original SZ system - Matrix2d Jacob; - Jacob(0, 0) = 1.; - Jacob(0, 1) = 0; - Jacob(1, 0) = -S(0); - Jacob(1, 1) = 1.; + Rfit::Matrix2d jacobian; + jacobian(0, 0) = 1.; + jacobian(0, 1) = 0; + jacobian(1, 0) = -S(0); + jacobian(1, 1) = 1.; line_results.par(1) += -line_results.par(0) * S(0); - line_results.cov = Jacob * line_results.cov * Jacob.transpose(); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // rotate to the original sz system auto tmp = R(0, 0) - line_results.par(0) * R(0, 1); - Jacob(1, 1) = 1. / tmp; - Jacob(0, 0) = Jacob(1, 1) * Jacob(1, 1); - Jacob(0, 1) = 0; - Jacob(1, 0) = line_results.par(1) * R(0, 1) * Jacob(0, 0); - line_results.par(1) = line_results.par(1) * Jacob(1, 1); - line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * Jacob(1, 1); - line_results.cov = Jacob * line_results.cov * Jacob.transpose(); + jacobian(1, 1) = 1. / tmp; + jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); + jacobian(0, 1) = 0; + jacobian(1, 0) = line_results.par(1) * R(0, 1) * jacobian(0, 0); + line_results.par(1) = line_results.par(1) * jacobian(1, 1); + line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * jacobian(1, 1); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // compute chi2 line_results.chi2 = 0; for (i = 0; i < n; i++) { - line_results.chi2 += w(i) * sqr(Z(i) - u(i)); + line_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); if (i > 0 && i < n - 1) - line_results.chi2 += - sqr(u(i - 1) / (S(i) - S(i - 1)) - u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + - u(i + 1) / (S(i + 1) - S(i))) / - VarBeta(i); + line_results.chi2 += Rfit::sqr(u(i - 1) / (S(i) - S(i - 1)) - + u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + + u(i + 1) / (S(i + 1) - S(i))) / + VarBeta(i); } // assert(line_results.chi2>=0); @@ -526,27 +528,29 @@ namespace BrokenLine { \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. 
*/ template - inline helix_fit BL_Helix_fit(const Matrix3xNd& hits, const Eigen::Matrix& hits_ge, const double B) { - helix_fit helix; - Vector4d fast_fit; + inline Rfit::helix_fit BL_Helix_fit(const Rfit::Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double B) { + Rfit::helix_fit helix; + Rfit::Vector4d fast_fit; BL_Fast_fit(hits, fast_fit); PreparedBrokenLineData data; karimaki_circle_fit circle; - line_fit line; - Matrix3d Jacob; + Rfit::line_fit line; + Rfit::Matrix3d jacobian; prepareBrokenLineData(hits, fast_fit, B, data); BL_Line_fit(hits_ge, fast_fit, B, data, line); BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix - Jacob << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (sqr(circle.par(2)) * circle.par(2)); + jacobian << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (Rfit::sqr(circle.par(2)) * circle.par(2)); circle.par(2) = B / std::abs(circle.par(2)); - circle.cov = Jacob * circle.cov * Jacob.transpose(); + circle.cov = jacobian * circle.cov * jacobian.transpose(); helix.par << circle.par, line.par; - helix.cov = MatrixXd::Zero(5, 5); + helix.cov = Rfit::MatrixXd::Zero(5, 5); helix.cov.block(0, 0, 3, 3) = circle.cov; helix.cov.block(3, 3, 2, 2) = line.cov; helix.q = circle.q; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 82a5bee443f88..7b5c9ea0ce0a4 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -20,8 +20,6 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; -using namespace Eigen; - // #define BL_DUMP_HITS template diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 05106a1bfed41..21b26b2d4b9f5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -94,7 +94,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * m_params.dcaCutOuterTriplet_); if (nhits > 1 && m_params.earlyFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); } kernel_find_ntuplets(hh.view(), @@ -118,7 +119,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); if (nhits > 1 && m_params.lateFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); } if (m_params.doStats_) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 7bfee1c8d557f..5197afc7f44ea 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -68,7 +68,7 @@ void 
CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + gpuPixelDoublets::fishbone<<>>( hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); cudaCheck(cudaGetLastError()); } @@ -120,7 +120,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + gpuPixelDoublets::fishbone<<>>( hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 18a2138247946..dbc513468fb87 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -19,8 +19,6 @@ #include "gpuFishbone.h" #include "gpuPixelDoublets.h" -using namespace gpuPixelDoublets; - using HitsOnGPU = TrackingRecHit2DSOAView; using HitsOnCPU = TrackingRecHit2DCUDA; @@ -36,8 +34,8 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, cms::cuda::AtomicPairCounter *apc, GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, - CellNeighborsVector const *cellNeighbors, - CellTracksVector const *cellTracks, + gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, + gpuPixelDoublets::CellTracksVector const *cellTracks, GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, uint32_t nHits, uint32_t maxNumberOfDoublets, @@ -195,7 +193,7 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, GPUCACell::Hits const *__restrict__ hhp, GPUCACell *cells, uint32_t const *__restrict__ nCells, - CellNeighborsVector *cellNeighbors, + gpuPixelDoublets::CellNeighborsVector *cellNeighbors, GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, float hardCurvCut, float ptmin, @@ -266,7 +264,7 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, GPUCACell *__restrict__ cells, uint32_t const *nCells, - CellTracksVector *cellTracks, + gpuPixelDoublets::CellTracksVector *cellTracks, HitContainer *foundNtuplets, cms::cuda::AtomicPairCounter *apc, Quality *__restrict__ quality, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index b46de519034d5..75e9d570a129e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -18,8 +18,6 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; -using namespace Eigen; - template __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 5f9cb4f79aa63..8e0b05dcb6c8a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -7,8 +7,6 @@ namespace gpuPixelDoublets { - using namespace gpuPixelDoubletsAlgos; - constexpr int nPairs = 13 + 2 + 4; static_assert(nPairs <= CAConstants::maxNumberOfLayerPairs()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index c750ad01fd487..4eb6823907bcc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -15,7 +15,7 @@ #include "CAConstants.h" #include "GPUCACell.h" -namespace gpuPixelDoubletsAlgos { +namespace gpuPixelDoublets { using CellNeighbors = CAConstants::CellNeighbors; using CellTracks = CAConstants::CellTracks; @@ -239,6 +239,6 @@ namespace gpuPixelDoubletsAlgos { } // loop in block... } -} // namespace gpuPixelDoubletsAlgos +} // namespace gpuPixelDoublets #endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h From 2bfeeb9d992756b47c3dc4a470e7c089916a6a5d Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 11 May 2020 14:49:56 +0200 Subject: [PATCH 079/102] Synchronise with CMSSW_11_1_0_pre7 --- .../PixelTrackFitting/BuildFile.xml | 60 +++++++++---------- .../PixelTrackFitting/test/BuildFile.xml | 34 +++++------ .../PixelTriplets/plugins/BuildFile.xml | 4 +- .../PixelTriplets/test/BuildFile.xml | 42 +++++++------ .../TkSeedGenerator/plugins/BuildFile.xml | 6 +- 5 files changed, 74 insertions(+), 72 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml index 5b923e24f0b26..74761cc5ee240 100644 --- a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml @@ -1,32 +1,32 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index 9cd815df74538..8d02db9a0e638 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -6,77 +6,75 @@ - - - + - + - + - - + + - - + + - + - + - - + + - - + + + + + - - - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index d704894631eca..3a54cd1134bc2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -6,9 +6,9 @@ + - - + diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 767d140a5d5ed..9b8b315e93937 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -1,25 +1,29 @@ - - - - - - - - - - - - - - + + + + + + + + + + + + + + + - + + - - + - - + + + + + diff --git a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml index 26ef004940306..fd2c6b2a67d4e 100644 --- a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml +++ b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml @@ -2,11 +2,11 @@ - - - + + + From 14de161e6a8845999b1af56850bcd26ed08415b0 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 13 May 2020 10:42:17 +0200 Subject: [PATCH 080/102] Use std::isnan (cms-patatrack#456) --- 
.../PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index dbc513468fb87..654b37c076f99 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -361,7 +361,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, // if the fit has any invalid parameters, mark it as bad bool isNaN = false; for (int i = 0; i < 5; ++i) { - isNaN |= isnan(tracks->stateAtBS.state(it)(i)); + isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); } if (isNaN) { #ifdef NTUPLE_DEBUG From b20bd1bfce90c271cd3280604d460ab00df28d8d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 15 May 2020 17:33:59 +0200 Subject: [PATCH 081/102] Replace cub prefix scan with home-brewed one (cms-patatrack#447) Replace the use of the prefix scan from CUB with a home-brewed implementation, using dynamic instead of static shared memory. No changes to physics or timing performance. --- .../plugins/CAHitNtupletGeneratorKernels.cc | 4 ++-- .../plugins/CAHitNtupletGeneratorKernels.cu | 4 ++-- .../plugins/CAHitNtupletGeneratorKernels.h | 2 -- .../plugins/CAHitNtupletGeneratorKernelsAlloc.h | 10 +--------- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 21b26b2d4b9f5..4eafb6dccd31c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -115,7 +115,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); if (nhits > 1 && m_params.lateFishbone_) { @@ -156,7 +156,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); // remove duplicates (tracks that share a hit) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 5197afc7f44ea..541ab5ed905f5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -108,7 +108,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); - 
cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); @@ -252,7 +252,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 5382d0f6e88d6..e112e9d17adeb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -195,8 +195,6 @@ class CAHitNtupletGeneratorKernels { unique_ptr device_tupleMultiplicity_; - uint8_t* device_tmws_; - unique_ptr device_storage_; // params Params const& m_params; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index 881b30ba46752..05bf4f09f7f93 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -23,19 +23,11 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { device_tupleMultiplicity_ = Traits::template make_unique(stream); - auto storageSize = - 3 + (std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) + sizeof(cms::cuda::AtomicPairCounter::c_type)) / - sizeof(cms::cuda::AtomicPairCounter::c_type); - - device_storage_ = Traits::template make_unique(storageSize, stream); + device_storage_ = Traits::template make_unique(3, stream); device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); - device_tmws_ = (uint8_t*)(device_storage_.get() + 3); - - assert(device_tmws_ + std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= - (uint8_t*)(device_storage_.get() + storageSize)); if #ifndef __CUDACC__ From ca80b700c0f928b4d287c254623f702ab42298c2 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 23 May 2020 11:59:55 +0200 Subject: [PATCH 082/102] Synchronise with CMSSW_11_1_0_pre8 --- Validation/RecoTrack/python/plotting/html.py | 5 +++++ Validation/RecoTrack/python/plotting/trackingPlots.py | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Validation/RecoTrack/python/plotting/html.py b/Validation/RecoTrack/python/plotting/html.py index d4f83995b8bcb..4e3bc59fed8fc 100644 --- a/Validation/RecoTrack/python/plotting/html.py +++ b/Validation/RecoTrack/python/plotting/html.py @@ -48,6 +48,7 @@ def _lowerFirst(s): _fromPVName = "Tracks from PV" _fromPVAllTPName = "Tracks from PV (all TPs)" _tpPtLess09Name = "All tracks (TP pT < 0.9 GeV)" +_tpEtaGreater2p7Name = "All tracks (TP |eta| > 2.7)" _conversionName = "Tracks for conversions" _gsfName = "Electron GSF tracks" _bhadronName = "All tracks (B-hadron TPs)" @@ 
-91,6 +92,8 @@ def _toPixel(s): ("tpPtLess09_highPurityByOriginalAlgo", _toOriAlgo(_allToHP(_tpPtLess09Name))), ("tpPtLess09_ByAlgoMask", _toAlgoMask(_tpPtLess09Name)), ("tpPtLess09_highPurityByAlgoMask", _toAlgoMask(_allToHP(_tpPtLess09Name))), + ("tpEtaGreater2p7_", _tpEtaGreater2p7Name), + ("tpEtaGreater2p7_highPurity", _allToHP(_tpEtaGreater2p7Name)), ("btvLike", _allToBTV(_allName)), ("ak4PFJets", "AK4 PF jets"), ("allTPEffic_", _allTPEfficName), @@ -198,6 +201,8 @@ def _toPixel(s): ("highPurityPt09", _ptCut(_allToHP(_allName))), ("tpPtLess09", _tpPtLess09Name), ("tpPtLess09_highPurity", _allToHP(_tpPtLess09Name)), + ("tpEtaGreater2p7", _tpEtaGreater2p7Name), + ("tpEtaGreater2p7_highPurity", _allToHP(_tpEtaGreater2p7Name)), ("btvLike", "BTV-like"), ("ak4PFJets", "AK4 PF jets"), ("allTPEffic", _allTPEfficName), diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index 76fcf73623f03..deba27b0ed3c0 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -632,8 +632,8 @@ def _mapCollectionToAlgoQuality(collName): collNameLow = collNameLow[:i_seeds] algo = None - prefixes = ["cutsreco", "cutsrecofrompv", "cutsrecofrompv2", "cutsrecofrompvalltp"] - if collNameLow in ["general", "generalfrompv"]+prefixes: + prefixes = ["cutsreco", "cutsrecofrompv", "cutsrecofrompv2", "cutsrecofrompvalltp", "cutsrecoetagreater2p7"] + if collNameLow in ["general", "generalfrompv", "generaletagreater2p7"]+prefixes: algo = "ootb" elif collNameLow in ["pixel", "pixelfrompv", "pixelfrompvalltp"]: algo = "pixel" @@ -1368,6 +1368,7 @@ def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, only plotter.appendTable(summaryName, folders, TrackingSummaryTable(section="ak4PFJets", collection=TrackingSummaryTable.AK4PFJets)) _appendTrackingPlots("Track", "", _simBasedPlots+_recoBasedPlots) _appendTrackingPlots("TrackTPPtLess09", "tpPtLess09", _simBasedPlots) +_appendTrackingPlots("TrackTPEtaGreater2p7", "tpEtaGreater2p7", _simBasedPlots+_recoBasedPlots) _appendTrackingPlots("TrackAllTPEffic", "allTPEffic", _simBasedPlots, onlyForPileup=True) _appendTrackingPlots("TrackFromPV", "fromPV", _simBasedPlots+_recoBasedPlots, onlyForPileup=True) _appendTrackingPlots("TrackFromPVAllTP", "fromPVAllTP", _simBasedPlots+_recoBasedPlots, onlyForPileup=True) From ddeaccbb33ba042e708668d4a82614e6ee2a88b2 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sun, 12 Jul 2020 23:24:43 +0200 Subject: [PATCH 083/102] Synchronise with CMSSW_11_2_0_pre2 --- RecoPixelVertexing/PixelTrackFitting/BuildFile.xml | 6 ++++++ .../PixelTriplets/interface/CAHitQuadrupletGenerator.h | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml index 74761cc5ee240..a589aad036996 100644 --- a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml @@ -27,6 +27,12 @@ + + + + + + diff --git a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h index 065dcd0f3ecb1..deb2beb6099ee 100644 --- a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h +++ b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h @@ -16,6 +16,7 @@ #include "RecoTracker/TkHitPairs/interface/IntermediateHitDoublets.h" #include 
"RecoPixelVertexing/PixelTriplets/interface/OrderedHitSeeds.h" +#include "RecoPixelVertexing/PixelTriplets/src/CACut.h" class TrackingRegion; class SeedingLayerSetsHits; @@ -127,8 +128,8 @@ class CAHitQuadrupletGenerator { const bool fitFastCircleChi2Cut; const bool useBendingCorrection; - const float caThetaCut = 0.00125f; - const float caPhiCut = 0.1f; + CACut caThetaCut; + CACut caPhiCut; const float caHardPtCut = 0.f; }; #endif From 17accf28a5f043576c25cafdade061799c072a07 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 15 Jul 2020 12:17:18 +0200 Subject: [PATCH 084/102] Reduce GPU memory usage (cms-patatrack#509) Adjust the growth factor in the caching allocators to use more granular bins, reducing the memory wasted by the allocations. Use a dynamic buffer for CA cells components. Fix a possible data race in the prefix scan. --- .../PixelTriplets/plugins/CAConstants.h | 6 +-- .../plugins/CAHitNtupletGeneratorKernels.cc | 28 +++++++---- .../plugins/CAHitNtupletGeneratorKernels.cu | 33 +++++++++---- .../plugins/CAHitNtupletGeneratorKernels.h | 9 ++-- .../CAHitNtupletGeneratorKernelsAlloc.h | 8 +--- .../CAHitNtupletGeneratorKernelsImpl.h | 4 ++ .../plugins/CAHitNtupletGeneratorOnGPU.cc | 5 +- .../PixelTriplets/plugins/GPUCACell.h | 48 +++++++++++++++---- .../PixelTriplets/plugins/gpuPixelDoublets.h | 11 +++++ .../PixelTriplets/test/BuildFile.xml | 5 ++ .../PixelTriplets/test/CAsizes_t.cpp | 25 ++++++++++ 11 files changed, 139 insertions(+), 43 deletions(-) create mode 100644 RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index fce0c23596137..0ebbdf3ed3705 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -27,7 +27,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } #ifndef ONLY_PHICUT #ifndef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfDoublets() { return 448 * 1024; } + constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; } constexpr uint32_t maxCellsPerHit() { return 128; } #else constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } @@ -37,7 +37,7 @@ namespace CAConstants { constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } constexpr uint32_t maxCellsPerHit() { return 8 * 128; } #endif - constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 4; } + constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; } constexpr uint32_t maxNumberOfLayerPairs() { return 20; } constexpr uint32_t maxNumberOfLayers() { return 10; } @@ -49,7 +49,7 @@ namespace CAConstants { #ifndef ONLY_PHICUT using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #else using CellNeighbors = cms::cuda::VecArray; using CellTracks = cms::cuda::VecArray; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 4eafb6dccd31c..4d4791b87ad3b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -24,12 +24,20 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_isOuterHitOfCell_.reset( (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) 
* sizeof(GPUCACell::OuterHitOfCell))); assert(device_isOuterHitOfCell_.get()); + + cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); @@ -47,8 +55,8 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr assert(nActualPairs <= gpuPixelDoublets::nPairs); gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, @@ -84,7 +92,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -101,7 +109,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_find_ntuplets(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -129,8 +137,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 541ab5ed905f5..8a213eee2f579 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -51,7 +51,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * hh.view(), device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, + device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), m_params.hardCurvCut_, m_params.ptmin_, @@ -78,7 +78,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_find_ntuplets<<>>(hh.view(), device_theCells_.get(), device_nCells_, - device_theCellTracks_, + device_theCellTracks_.get(), tuples_d, device_hitTuple_apc_, quality_d, @@ -132,8 +132,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + 
device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, @@ -144,6 +144,9 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif + + // free space asap + // device_isOuterHitOfCell_.reset(); } template <> @@ -162,16 +165,26 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr // in principle we can use "nhits" to heuristically dimension the workspace... device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); assert(device_isOuterHitOfCell_.get()); + + cellStorage_ = cms::cuda::make_device_unique( + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks), + stream); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = + (GPUCACell::CellTracks *)(cellStorage_.get() + + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + { int threadsPerBlock = 128; // at least one block! int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); cudaCheck(cudaGetLastError()); } @@ -201,8 +214,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr dim3 thrs(stride, threadsPerBlock, 1); gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index e112e9d17adeb..7ab3ed010927e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -179,10 +179,11 @@ class CAHitNtupletGeneratorKernels { private: // workspace - CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr; - unique_ptr device_theCellNeighborsContainer_; - CAConstants::CellTracksVector* device_theCellTracks_ = nullptr; - unique_ptr device_theCellTracksContainer_; + unique_ptr cellStorage_; + unique_ptr device_theCellNeighbors_; + CAConstants::CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + CAConstants::CellTracks* device_theCellTracksContainer_; unique_ptr device_theCells_; unique_ptr device_isOuterHitOfCell_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index 05bf4f09f7f93..fb750267f5c37 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -12,12 +12,8 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) 
////////////////////////////////////////////////////////// - /* not used at the moment - cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); - cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); - cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); - */ + device_theCellNeighbors_ = Traits::template make_unique(stream); + device_theCellTracks_ = Traits::template make_unique(stream); device_hitToTuple_ = Traits::template make_unique(stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 654b37c076f99..691395887dddb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -79,6 +79,10 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, printf("Tuples overflow\n"); if (*nCells >= maxNumberOfDoublets) printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow\n"); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); } for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 4a8240706efc2..3e16728a002dd 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -168,14 +168,15 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); kernels.allocateOnGPU(stream); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } else { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 6e1c2a587e212..e913b77fe0953 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -56,24 +56,56 @@ class GPUCACell { theInnerZ = hh.zGlobal(innerHitId); theInnerR = hh.rGlobal(innerHitId); - outerNeighbors().reset(); - tracks().reset(); + // link to default empty + theOuterNeighbors = &cellNeighbors[0]; + theTracks = &cellTracks[0]; assert(outerNeighbors().empty()); assert(tracks().empty()); } __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + // use smart cache + if (outerNeighbors().empty()) { + auto i = cellNeighbors.extend(); // maybe waisted.... 
+ if (i > 0) { + cellNeighbors[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellNeighbors[0]); + atomicCAS((ptrAsInt*)(&theOuterNeighbors), + zero, + (ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... +#else + theOuterNeighbors = &cellNeighbors[i]; +#endif + } else + return -1; + } + __threadfence(); return outerNeighbors().push_back(t); } __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + if (tracks().empty()) { + auto i = cellTracks.extend(); // maybe waisted.... + if (i > 0) { + cellTracks[i].reset(); +#ifdef __CUDACC__ + auto zero = (ptrAsInt)(&cellTracks[0]); + atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... +#else + theTracks = &cellTracks[i]; +#endif + } else + return -1; + } + __threadfence(); return tracks().push_back(t); } - __device__ __forceinline__ CellTracks& tracks() { return theTracks; } - __device__ __forceinline__ CellTracks const& tracks() const { return theTracks; } - __device__ __forceinline__ CellNeighbors& outerNeighbors() { return theOuterNeighbors; } - __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return theOuterNeighbors; } + __device__ __forceinline__ CellTracks& tracks() { return *theTracks; } + __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } + __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } + __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } __device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } @@ -297,8 +329,8 @@ class GPUCACell { } private: - CellNeighbors theOuterNeighbors; - CellTracks theTracks; + CellNeighbors* theOuterNeighbors; + CellTracks* theTracks; public: int32_t theDoubletId; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 8e0b05dcb6c8a..5b0d3e8833a52 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -73,6 +73,17 @@ namespace gpuPixelDoublets { int first = blockIdx.x * blockDim.x + threadIdx.x; for (int i = first; i < nHits; i += gridDim.x * blockDim.x) isOuterHitOfCell[i].reset(); + + if (0 == first) { + cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer); + cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer); + auto i = cellNeighbors->extend(); + assert(0 == i); + (*cellNeighbors)[0].reset(); + i = cellTracks->extend(); + assert(0 == i); + (*cellTracks)[0].reset(); + } } constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 9b8b315e93937..92fa4370faa70 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -27,3 +27,8 @@ + + + + + diff --git a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp new file mode 100644 index 0000000000000..5c57eb7005691 --- /dev/null +++ 
b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp @@ -0,0 +1,25 @@ +#include "RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h" + +#include +#include + +template +void print() { + std::cout << "size of " << typeid(T).name() << ' ' << sizeof(T) << std::endl; +} + +int main() { + using namespace CAConstants; + + print(); + print(); + print(); + print(); + print(); + print(); + print(); + + print(); + + return 0; +} From ceb4e96cd127592682efac63d1ab6a21c71eb834 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 8 Aug 2020 17:38:10 +0200 Subject: [PATCH 085/102] Synchronise with CMSSW_11_2_0_pre3 --- RecoPixelVertexing/PixelTriplets/test/BuildFile.xml | 5 ----- RecoTracker/TkSeedGenerator/plugins/BuildFile.xml | 7 ++++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 92fa4370faa70..d480d7408b9e2 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -2,13 +2,8 @@ - - - - - diff --git a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml index fd2c6b2a67d4e..b18d26ee39e11 100644 --- a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml +++ b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml @@ -1,12 +1,13 @@ + + + + - - - From 2004cbd964810c9eb02389f97b050027e89d3497 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 27 Aug 2020 14:37:58 +0200 Subject: [PATCH 086/102] Protect CAHitNtupletCUDA ctor/dtor when CUDA is not available (cms-patatrack#544) --- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 3e16728a002dd..464744594e9a6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -12,8 +12,10 @@ #include "FWCore/Framework/interface/Event.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/EDMException.h" #include "FWCore/Utilities/interface/isFinite.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" #include "CAHitNtupletGeneratorOnGPU.h" @@ -92,8 +94,12 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& #endif if (m_params.onGPU_) { - cudaCheck(cudaMalloc(&m_counters, sizeof(Counters))); - cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters))); + // allocate pinned host memory only if CUDA is available + edm::Service cs; + if (cs and cs->enabled()) { + cudaCheck(cudaMalloc(&m_counters, sizeof(Counters))); + cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters))); + } } else { m_counters = new Counters(); memset(m_counters, 0, sizeof(Counters)); @@ -101,17 +107,20 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& } CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU() { - if (m_params.doStats_) { - // crash on multi-gpu processes - if (m_params.onGPU_) { - CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); - } else { - CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); - 
} - } if (m_params.onGPU_) { - cudaFree(m_counters); + // print the gpu statistics and free pinned host memory only if CUDA is available + edm::Service cs; + if (cs and cs->enabled()) { + if (m_params.doStats_) { + // crash on multi-gpu processes + CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); + } + cudaFree(m_counters); + } } else { + if (m_params.doStats_) { + CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); + } delete m_counters; } } From e6414606e0de66634e561efb656c0ce0feb5e3c5 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 2 Oct 2020 08:27:02 +0200 Subject: [PATCH 087/102] Add customisations for profiling the Pixel-only workflow (cms-patatrack#553) customizePixelOnlyForProfilingGPUOnly: Customise the Pixel-only reconstruction to run on GPU Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU. customizePixelOnlyForProfilingGPUWithHostCopy: Customise the Pixel-only reconstruction to run on GPU, and copy the data to the host Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU, and copy all the products to the host in SoA format. The same customisation can be also used on the SoA CPU workflow, running up to the tracks and vertices on the CPU in SoA format, without conversion to legacy format. customizePixelOnlyForProfiling: Customise the Pixel-only reconstruction to run on GPU, copy the data to the host, and convert to legacy format Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU; copy all the products to the host in SoA format; and convert them to legacy format. The same customisation can be also used on the CPU workflow, running up to the tracks and vertices on the CPU. --- .../python/customizePixelOnlyForProfiling.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py diff --git a/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py new file mode 100644 index 0000000000000..d46764dbd7edd --- /dev/null +++ b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py @@ -0,0 +1,59 @@ +import FWCore.ParameterSet.Config as cms + +# Customise the Pixel-only reconstruction to run on GPU +# +# Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU. +def customizePixelOnlyForProfilingGPUOnly(process): + + process.consumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring('caHitNtupletCUDA', 'pixelVertexCUDA') + ) + + process.consume_step = cms.EndPath(process.consumer) + + process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) + + return process + + +# Customise the Pixel-only reconstruction to run on GPU, and copy the data to the host +# +# Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU, +# and copy all the products to the host in SoA format. +# +# The same customisation can be also used on the SoA CPU workflow, running up to the +# tracks and vertices on the CPU in SoA format, without conversion to legacy format. +def customizePixelOnlyForProfilingGPUWithHostCopy(process): + + #? 
process.siPixelRecHitHostSoA.convertToLegacy = False + + process.consumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring('pixelTrackSoA', 'pixelVertexSoA') + ) + + process.consume_step = cms.EndPath(process.consumer) + + process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) + + return process + + +# Customise the Pixel-only reconstruction to run on GPU, copy the data to the host, +# and convert to legacy format +# +# Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU; +# copy all the products to the host in SoA format; and convert them to legacy format. +# +# The same customisation can be also used on the CPU workflow, running up to the +# tracks and vertices on the CPU. +def customizePixelOnlyForProfiling(process): + + process.consumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring('pixelTracks', 'pixelVertices') + ) + + process.consume_step = cms.EndPath(process.consumer) + + process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) + + return process From 28d292ac6d26b32d91165ce9bdbef787272b9bee Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 2 Oct 2020 14:38:40 +0200 Subject: [PATCH 088/102] Synchronise with CMSSW_11_2_0_pre7 --- .../PixelTrackFitting/python/PixelTracks_cff.py | 1 - .../RecoTrack/python/PostProcessorTracker_cfi.py | 5 +++++ .../RecoTrack/python/TrackValidation_cff.py | 15 ++++++++++++++- .../RecoTrack/python/plotting/trackingPlots.py | 10 ++++++---- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index ab7738826b1c2..5ff404cb603d4 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -1,6 +1,5 @@ import FWCore.ParameterSet.Config as cms -from RecoLocalTracker.SiPixelRecHits.PixelCPEParmError_cfi import * from RecoLocalTracker.SiStripRecHitConverter.StripCPEfromTrackAngle_cfi import * from RecoLocalTracker.SiStripRecHitConverter.SiStripRecHitMatcher_cfi import * from RecoTracker.TransientTrackingRecHit.TransientTrackingRecHitBuilder_cfi import * diff --git a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py index 6ee5e4d41a8f0..b4745167dd0b1 100644 --- a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py +++ b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py @@ -173,25 +173,30 @@ def _addNoFlow(module): "cotThetares_vs_eta '#sigma(cot(#theta)) vs #eta' cotThetares_vs_eta", "cotThetares_vs_pt '#sigma(cot(#theta)) vs p_{T}' cotThetares_vs_pt", "h_dxypulleta 'd_{xy} Pull vs #eta' dxypull_vs_eta", + "h_dxypullpt 'd_{xy} Pull vs p_{T}' dxypull_vs_pt", "dxyres_vs_eta '#sigma(d_{xy}) vs #eta' dxyres_vs_eta", "dxyres_vs_phi '#sigma(d_{xy}) vs #phi' dxyres_vs_phi", "dxyres_vs_pt '#sigma(d_{xy}) vs p_{T}' dxyres_vs_pt", "h_dzpulleta 'd_{z} Pull vs #eta' dzpull_vs_eta", + "h_dzpullpt 'd_{z} Pull vs p_{T}' dzpull_vs_pt", "dzres_vs_eta '#sigma(d_{z}) vs #eta' dzres_vs_eta", "dzres_vs_phi '#sigma(d_{z}) vs #phi' dzres_vs_phi", "dzres_vs_pt '#sigma(d_{z}) vs p_{T}' dzres_vs_pt", "etares_vs_eta '#sigma(#eta) vs #eta' etares_vs_eta", "h_phipulleta '#phi Pull vs #eta' phipull_vs_eta", + "h_phipullpt '#phi Pull vs p_{T}' phipull_vs_pt", "h_phipullphi '#phi Pull vs #phi' phipull_vs_phi", 
"phires_vs_eta '#sigma(#phi) vs #eta' phires_vs_eta", "phires_vs_phi '#sigma(#phi) vs #phi' phires_vs_phi", "phires_vs_pt '#sigma(#phi) vs p_{T}' phires_vs_pt", "h_ptpulleta 'p_{T} Pull vs #eta' ptpull_vs_eta", + "h_ptpullpt 'p_{T} Pull vs p_{T}' ptpull_vs_pt", "h_ptpullphi 'p_{T} Pull vs #phi' ptpull_vs_phi", "ptres_vs_eta '#sigma(p_{T}) vs #eta' ptres_vs_eta", "ptres_vs_phi '#sigma(p_{T}) vs #phi' ptres_vs_phi", "ptres_vs_pt '#sigma(p_{T}) vs p_{T}' ptres_vs_pt", "h_thetapulleta '#theta Pull vs #eta' thetapull_vs_eta", + "h_thetapullpt '#theta Pull vs p_{T}' thetapull_vs_pt", "h_thetapullphi '#theta Pull vs #phi' thetapull_vs_phi" ), cumulativeDists = cms.untracked.vstring( diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index 0765b76fa6e45..fc45589bd1569 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import FWCore.ParameterSet.Config as cms -import SimTracker.TrackAssociatorProducers.trackAssociatorByChi2_cfi +from SimTracker.TrackAssociatorProducers.trackAssociatorByChi2_cfi import * from SimTracker.TrackAssociatorProducers.quickTrackAssociatorByHits_cfi import * from SimTracker.TrackAssociation.trackingParticleRecoTrackAsssociation_cfi import * import Validation.RecoTrack.MultiTrackValidator_cfi @@ -353,6 +353,11 @@ def _getMVASelectors(postfix): ptMin = 0, ) +#ByChi2 association (for jetCore usage, not used by default) +MTVTrackAssociationByChi2 = trackingParticleRecoTrackAsssociation.clone( + associator = cms.InputTag('trackAssociatorByChi2') +) + # Select jets for JetCore tracking highPtJets = cms.EDFilter("CandPtrSelector", src = cms.InputTag("ak4CaloJets"), cut = cms.string("pt()>1000")) highPtJetsForTrk = highPtJetsForTrk = highPtJets.clone(src = "ak4CaloJetsForTrk") @@ -364,6 +369,7 @@ def _getMVASelectors(postfix): trackValidator = Validation.RecoTrack.MultiTrackValidator_cfi.multiTrackValidator.clone( useLogPt = cms.untracked.bool(True), dodEdxPlots = True, + # associators=cms.untracked.VInputTag('MTVTrackAssociationByChi2'), #uncomment for byChi2 assoc. for jetcore studies (1/5) doPVAssociationPlots = True #,minpT = cms.double(-1) #,maxpT = cms.double(3) @@ -387,6 +393,7 @@ def _getMVASelectors(postfix): locals()["_generalTracksHp"+_postfix], "generalTracksPt09", "cutsRecoTracksBtvLike", + "cutsRecoTracksJetCoreRegionalStepByOriginalAlgo", ] ) _setForEra(trackValidator.histoProducerAlgoBlock, _eraName, _era, seedingLayerSets=locals()["_seedingLayerSets"+_postfix]) @@ -537,6 +544,9 @@ def _getMVASelectors(postfix): trackValidatorBuilding = _trackValidatorSeedingBuilding.clone( dirName = "Tracking/TrackBuilding/", doMVAPlots = True, + doResolutionPlotsForLabels = ['jetCoreRegionalStepTracks'], + # associators = ["trackAssociatorByChi2"], #uncomment for byChi2 assoc. for jetcore studies (2/5) + # UseAssociators = True, #uncomment for byChi2 assoc. for jetcore studies (3/5) ) trackValidatorBuildingPreSplitting = trackValidatorBuilding.clone( associators = ["quickTrackAssociatorByHitsPreSplitting"], @@ -653,6 +663,8 @@ def _uniqueFirstLayers(layerList): tracksValidationTruth = cms.Task( tpClusterProducer, tpClusterProducerPreSplitting, + # trackAssociatorByChi2, #uncomment for byChi2 assoc. for jetcore studies (4/5) + # MTVTrackAssociationByChi2, #uncomment for byChi2 assoc. 
for jetcore studies (5/5) quickTrackAssociatorByHits, quickTrackAssociatorByHitsPreSplitting, trackingParticleRecoTrackAsssociation, @@ -844,6 +856,7 @@ def _uniqueFirstLayers(layerList): dirName = "Tracking/TrackSeeding/", label = _seedSelectors, doSeedPlots = True, + doResolutionPlotsForLabels = [ "seedTracksjetCoreRegionalStepSeeds",] ) trackValidatorSeedingPreSplittingTrackingOnly = trackValidatorSeedingTrackingOnly.clone( associators = ["quickTrackAssociatorByHitsPreSplitting"], diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index deba27b0ed3c0..9d8f81c64b35f 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -41,6 +41,7 @@ _maxPU = [20, 50, 65, 80, 100, 150, 200, 250] _minMaxTracks = [0, 200, 500, 1000, 1500, 2000] _minMaxMVA = [-1.025, -0.5, 0, 0.5, 1.025] +_maxDRJ = 0.1 def _minMaxResidual(ma): return ([-x for x in ma], ma) @@ -216,7 +217,7 @@ def _makeMVAPlots(num, hp=False): ) _effandfakeDeltaRPU = PlotGroup("effandfakeDeltaRPU", _makeEffFakeDupPlots("dr" , "#DeltaR", effopts=dict(xtitle="TP min #DeltaR"), fakeopts=dict(xtitle="track min #DeltaR"), common=dict(xlog=True)) + - _makeEffFakeDupPlots("drj" , "#DeltaR(track, jet)", effopts=dict(xtitle="#DeltaR(TP, jet)", ytitle="efficiency vs #DeltaR(TP, jet"), fakeopts=dict(xtitle="#DeltaR(track, jet)"), common=dict(xlog=True))+ + _makeEffFakeDupPlots("drj" , "#DeltaR(track, jet)", effopts=dict(xtitle="#DeltaR(TP, jet)", ytitle="efficiency vs #DeltaR(TP, jet"), fakeopts=dict(xtitle="#DeltaR(track, jet)"), common=dict(xlog=True, xmax=_maxDRJ))+ _makeEffFakeDupPlots("pu" , "PU" , common=dict(xtitle="Pileup", xmin=_minPU, xmax=_maxPU)), legendDy=_legendDy_4rows ) @@ -262,7 +263,7 @@ def _makeMVAPlots(num, hp=False): ) _dupandfakeDeltaRPU = PlotGroup("dupandfakeDeltaRPU", _makeFakeDupPileupPlots("dr" , "#DeltaR", xquantity="min #DeltaR", common=dict(xlog=True)) + - _makeFakeDupPileupPlots("drj" , "#DeltaR(track, jet)", xtitle="#DeltaR(track, jet)", common=dict(xlog=True)) + + _makeFakeDupPileupPlots("drj" , "#DeltaR(track, jet)", xtitle="#DeltaR(track, jet)", common=dict(xlog=True, xmax=_maxDRJ)) + _makeFakeDupPileupPlots("pu" , "PU" , xtitle="Pileup", common=dict(xmin=_minPU, xmax=_maxPU)), ncols=3 ) @@ -373,6 +374,7 @@ def _makeMVAPlots(num, hp=False): fallback={"name": "chi2_vs_eta", "profileX": True}), Plot("ptres_vs_eta_Mean", scale=100, title="", xtitle="TP #eta (PCA to beamline)", ytitle="< #delta p_{T} / p_{T} > (%)", ymin=_minResidualPt, ymax=_maxResidualPt), Plot("chi2mean_vs_pt", title="", xtitle="p_{T}", ytitle="< #chi^{2} / ndf >", ymin=[0, 0.5], ymax=[2, 2.5, 3, 5], xlog=True, fallback={"name": "chi2_vs_pt", "profileX": True}), + Plot("chi2mean_vs_drj", title="", xtitle="#DeltaR(track, jet)", ytitle="< #chi^{2} / ndf >", ymin=[0, 0.5], ymax=[2, 2.5, 3, 5], xlog=True, xmax=_maxDRJ, fallback={"name": "chi2_vs_drj", "profileX": True}), Plot("ptres_vs_pt_Mean", title="", xtitle="p_{T}", ytitle="< #delta p_{T}/p_{T} > (%)", scale=100, ymin=_minResidualPt, ymax=_maxResidualPt,xlog=True) ]) _common = {"stat": True, "fit": True, "normalizeToUnitArea": True, "drawStyle": "hist", "drawCommand": "", "xmin": -10, "xmax": 10, "ylog": True, "ymin": 5e-5, "ymax": [0.01, 0.05, 0.1, 0.2, 0.5, 0.8, 1.025], "ratioUncertainty": False} @@ -441,7 +443,7 @@ def _makeMVAPlots(num, hp=False): ) _extDistDeltaR = PlotGroup("distDeltaR", _makeDistPlots("dr" , "min #DeltaR", common=dict(xlog=True)) + 
- _makeDistPlots("drj" , "#DeltaR(track, jet)", common=dict(xlog=True)), + _makeDistPlots("drj" , "#DeltaR(track, jet)", common=dict(xlog=True, xmax=_maxDRJ)), ncols=2, legendDy=_legendDy_2rows, ) _extDistSeedingPlots = _makeDistPlots("seedingLayerSet", "seeding layers", common=dict(xtitle="", **_seedingLayerSet_common)) @@ -512,7 +514,7 @@ def _makeMVAPlots(num, hp=False): ) _extDistSimDeltaR = PlotGroup("distsimDeltaR", _makeDistSimPlots("dr" , "min #DeltaR", common=dict(xlog=True)) + - _makeDistSimPlots("drj" , "#DeltaR(TP, jet)", common=dict(xlog=True)), + _makeDistSimPlots("drj" , "#DeltaR(TP, jet)", common=dict(xlog=True, xmax=_maxDRJ)), ncols=2, legendDy=_legendDy_2rows, ) From a817147d67f12039a6908a182f212778711890c4 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 5 Oct 2020 17:35:45 +0200 Subject: [PATCH 089/102] Further clean up after merging CMSSW_11_2_0_pre7 (cms-patatrack#556) Minor bug fixes: - fix a typo in EventFilter/EcalRawToDigi/plugins/BuildFile.xml . Clean up: - remove obsolete ArrayShadow class; - remove obsolete profiling functions. --- CUDADataFormats/Track/src/classes.h | 1 - CUDADataFormats/Track/src/classes_def.xml | 2 - .../customizePixelTracksForProfiling.py | 54 ------------------- 3 files changed, 57 deletions(-) delete mode 100644 RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 49c71bf03b90a..4843818978cca 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -4,7 +4,6 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Common/interface/ArrayShadow.h" #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 7c73c676ad13d..9c80ae91baf29 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -3,6 +3,4 @@ - - diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py deleted file mode 100644 index ce97de6650244..0000000000000 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ /dev/null @@ -1,54 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -def customizePixelTracksForProfilingGPUOnly(process): - process.MessageLogger.cerr.FwkReport.reportEvery = 100 - - process.Raw2Hit = cms.Path(process.offlineBeamSpot+process.offlineBeamSpotCUDA+process.siPixelClustersCUDAPreSplitting+process.siPixelRecHitsCUDAPreSplitting) - - process.load('RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi') - process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi') - process.TVreco = cms.Path(process.caHitNtupletCUDA+process.pixelVertexCUDA) - - process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco) - return process - -def customizePixelTracksForProfilingSoAonCPU(process): - process = customizePixelTracksForProfilingGPUOnly(process) - - process.pixelVertexSoA = process.pixelVertexCUDA.clone() - process.pixelVertexSoA.onGPU = False - process.pixelVertexSoA.pixelTrackSrc = 'pixelTrackSoA' - process.TVSoAreco = cms.Path(process.caHitNtupletCUDA+process.pixelTrackSoA+process.pixelVertexSoA) - - 
process.schedule = cms.Schedule(process.Raw2Hit, process.TVSoAreco) - - return process - -def customizePixelTracksForProfilingEnableTransfer(process): - process = customizePixelTracksForProfilingGPUOnly(process) - - process.load('RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi') - process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi') - process.toSoA = cms.Path(process.pixelTrackSoA+process.pixelVertexSoA) - - process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco, process.toSoA) - return process - -def customizePixelTracksForProfilingEnableConversion(process): - # use old trick of output path - process.MessageLogger.cerr.FwkReport.reportEvery = 100 - - process.out = cms.OutputModule("AsciiOutputModule", - outputCommands = cms.untracked.vstring( - "keep *_pixelTracks_*_*", - "keep *_pixelVertices_*_*", - ), - verbosity = cms.untracked.uint32(0), - ) - - process.outPath = cms.EndPath(process.out) - - process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.outPath) - - return process - From c31b20f280740646f49e83af663f409a5fc20358 Mon Sep 17 00:00:00 2001 From: AdrianoDee Date: Mon, 12 Oct 2020 16:49:41 +0200 Subject: [PATCH 090/102] Update the RelVal workflows and the CPU customisation (cms-patatrack#549) Update the RelVal workflows and the CPU customisation: - change the .501 workflow to run the full Patatrack pixel track reconstruction on CPU - add a customisation to run the Patatrack reconstruction with triplets, on CPU and GPU - add the .505 and .506 workflows to reconstruct triplets, on CPU and GPU Co-authored-by: Andrea Bocci --- .../python/customizePixelTracksSoAonCPU.py | 58 +++++++++++++------ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py index 9afe654de6c32..1083687a9c354 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py @@ -1,30 +1,52 @@ import FWCore.ParameterSet.Config as cms -def customizePixelTracksSoAonCPU(process) : +def customizePixelTracksSoAonCPU(process): + + process.CUDAService = cms.Service("CUDAService", + enabled = cms.untracked.bool(False) + ) + + from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHostSoA_cfi import siPixelRecHitHostSoA + process.siPixelRecHitsPreSplitting = siPixelRecHitHostSoA.clone( + convertToLegacy = True + ) + + from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA + process.pixelTrackSoA = caHitNtupletCUDA.clone( + onGPU = False, + pixelRecHitSrc = 'siPixelRecHitsPreSplitting' + ) + + from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA + process.pixelVertexSoA = pixelVertexCUDA.clone( + onGPU = False, + pixelTrackSrc = 'pixelTrackSoA' + ) - process.load('RecoLocalTracker/SiPixelRecHits/siPixelRecHitHostSoA_cfi') - process.load('RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi') - process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi') + from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA + process.pixelTracks = pixelTrackProducerFromSoA.clone( + pixelRecHitLegacySrc = 'siPixelRecHitsPreSplitting' + ) - process.pixelTrackSoA = process.caHitNtupletCUDA.clone() - process.pixelTrackSoA.onGPU = False - process.pixelTrackSoA.pixelRecHitSrc = 'siPixelRecHitHostSoA' - 
process.pixelVertexSoA = process.pixelVertexCUDA.clone() - process.pixelVertexSoA.onGPU = False - process.pixelVertexSoA.pixelTrackSrc = 'pixelTrackSoA' + from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA + process.pixelVertices = pixelVertexFromSoA.clone() + + process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA + process.pixelVertexSoA + + return process - process.load('RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi') - process.pixelTracks = process.pixelTrackProducerFromSoA.clone() - process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi') - process.pixelVertices = process.pixelVertexFromSoA.clone() - process.pixelTracks.pixelRecHitLegacySrc = 'siPixelRecHitHostSoA' - process.siPixelRecHitHostSoA.convertToLegacy = True - process.reconstruction_step += process.siPixelRecHitHostSoA+process.pixelTrackSoA+process.pixelVertexSoA +def customizePixelTracksForTriplets(process): + from HLTrigger.Configuration.common import producers_by_type + for producer in producers_by_type(process, 'CAHitNtupletCUDA'): + producer.includeJumpingForwardDoublets = True + producer.minHitsPerNtuplet = 3 + return process + -def customizePixelTracksSoAonCPUForProfiling(process) : +def customizePixelTracksSoAonCPUForProfiling(process): process.MessageLogger.cerr.FwkReport.reportEvery = 100 From 2636448339adfcc0b76850f20ad11afda5c46bae Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 22 Oct 2020 16:05:48 +0200 Subject: [PATCH 091/102] Move the siPixelClusters.payloadType to the SoA-on-CPU customisation (cms-patatrack#560) --- .../Configuration/python/customizePixelTracksSoAonCPU.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py index 1083687a9c354..bcd9dbdc51ea4 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py @@ -2,10 +2,13 @@ def customizePixelTracksSoAonCPU(process): - process.CUDAService = cms.Service("CUDAService", + process.CUDAService = cms.Service('CUDAService', enabled = cms.untracked.bool(False) ) + # ensure the same results when running on GPU (which supports only the 'HLT' payload) and CPU + process.siPixelClustersPreSplitting.cpu.payloadType = cms.string('HLT') + from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHostSoA_cfi import siPixelRecHitHostSoA process.siPixelRecHitsPreSplitting = siPixelRecHitHostSoA.clone( convertToLegacy = True From c6ac80655aa65064b11c939307b12ee8ca270b71 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 23 Oct 2020 13:43:58 +0200 Subject: [PATCH 092/102] Synchronise with CMSSW_11_2_0_pre8 --- .../python/PostProcessorTracker_cfi.py | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py index b4745167dd0b1..88a1e21e4bfb5 100644 --- a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py +++ b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py @@ -1,5 +1,6 @@ import FWCore.ParameterSet.Config as cms from DQMServices.Core.DQMEDHarvester import DQMEDHarvester +from Configuration.Eras.Modifier_fastSim_cff import fastSim def _addNoFlow(module): _noflowSeen = set() @@ -15,8 +16,11 @@ def _addNoFlow(module): if not tmp[ind-1] in 
_noflowSeen: module.noFlowDists.append(tmp[ind-1]) +_defaultSubdirs = ["Tracking/Track/*", "Tracking/TrackTPPtLess09/*", "Tracking/TrackFromPV/*", "Tracking/TrackFromPVAllTP/*", "Tracking/TrackAllTPEffic/*", "Tracking/TrackBuilding/*","Tracking/TrackConversion/*", "Tracking/TrackGsf/*"] +_defaultSubdirsSummary = [e.replace("/*","") for e in _defaultSubdirs] + postProcessorTrack = DQMEDHarvester("DQMGenericClient", - subDirs = cms.untracked.vstring("Tracking/Track/*", "Tracking/TrackTPPtLess09/*", "Tracking/TrackFromPV/*", "Tracking/TrackFromPVAllTP/*", "Tracking/TrackAllTPEffic/*", "Tracking/TrackBuilding/*", "Tracking/TrackConversion/*", "Tracking/TrackGsf/*", "Tracking/TrackBHadron/*"), + subDirs = cms.untracked.vstring(_defaultSubdirs), efficiency = cms.vstring( "effic 'Efficiency vs #eta' num_assoc(simToReco)_eta num_simul_eta", "efficPt 'Efficiency vs p_{T}' num_assoc(simToReco)_pT num_simul_pT", @@ -252,7 +256,7 @@ def _addNoFlow(module): postProcessorTrack2D = DQMEDHarvester("DQMGenericClient", makeGlobalEffienciesPlot = cms.untracked.bool(False), - subDirs = cms.untracked.vstring("Tracking/Track/*", "Tracking/TrackTPPtLess09/*", "Tracking/TrackFromPV/*", "Tracking/TrackFromPVAllTP/*", "Tracking/TrackAllTPEffic/*", "Tracking/TrackBuilding/*", "Tracking/TrackConversion/*", "Tracking/TrackGsf/*", "Tracking/TrackBHadron/*"), + subDirs = cms.untracked.vstring(_defaultSubdirs), efficiency = cms.vstring( "efficPtvseta 'Efficiency in p_{T}-#eta plane' num_assoc(simToReco)_pTvseta num_simul_pTvseta", "duplicatesRate_Ptvseta 'Duplicates Rate in (p_{T}-#eta) plane' num_duplicate_pTvseta num_reco_pTvseta", @@ -294,7 +298,7 @@ def _addNoFlow(module): postProcessorTrackSummary = DQMEDHarvester("DQMGenericClient", - subDirs = cms.untracked.vstring("Tracking/Track", "Tracking/TrackTPPtLess09", "Tracking/TrackFromPV", "Tracking/TrackFromPVAllTP", "Tracking/TrackAllTPEffic", "Tracking/TrackBuilding", "Tracking/TrackConversion", "Tracking/TrackGsf", "Tracking/TrackBHadron"), + subDirs = cms.untracked.vstring(_defaultSubdirsSummary), efficiency = cms.vstring( "effic_vs_coll 'Efficiency vs track collection' num_assoc(simToReco)_coll num_simul_coll", "effic_vs_coll_allPt 'Efficiency vs track collection' num_assoc(simToReco)_coll_allPt num_simul_coll_allPt", @@ -313,6 +317,27 @@ def _addNoFlow(module): postProcessorTrackSummary ) +fastSim.toModify(postProcessorTrack, subDirs = [e for e in _defaultSubdirs if e not in ["Tracking/TrackGsf/*","Tracking/TrackConversion/*"]]) +fastSim.toModify(postProcessorTrackSummary, subDirs = [e for e in _defaultSubdirsSummary if e not in ["Tracking/TrackGsf","Tracking/TrackConversion"]]) + +####### +# Define a standalone seuquence to support the Standalone harvesting mode +# see https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideMultiTrackValidator#cmsDriver_MTV_alone_i_e_standalone for more information +######## + +postProcessorTrackStandalone = postProcessorTrack.clone( + subDirs = _defaultSubdirs+["Tracking/TrackBHadron/*"] +) +postProcessorTrackSummaryStandalone = postProcessorTrackSummary.clone( + subDirs = _defaultSubdirs+["Tracking/TrackBHadron"] +) + +postProcessorTrackSequenceStandalone = cms.Sequence( + postProcessorTrackStandalone+ + postProcessorTrackNrecVsNsim+ + postProcessorTrackSummaryStandalone +) + postProcessorTrackPhase2 = postProcessorTrack.clone() postProcessorTrackPhase2.subDirs.extend(["Tracking/TrackTPEtaGreater2p7/*"]) postProcessorTrackSummaryPhase2 = postProcessorTrackSummary.clone() @@ -323,12 +348,15 @@ def _addNoFlow(module): 
phase2_tracker.toReplaceWith(postProcessorTrackSummary,postProcessorTrackSummaryPhase2) postProcessorTrackTrackingOnly = postProcessorTrack.clone() -postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackSeeding/*", "Tracking/PixelTrack/*", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*", "Tracking/PixelTrackBHadron/*"]) +postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackBHadron/*", "Tracking/TrackSeeding/*", "Tracking/PixelTrack/*", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*", "Tracking/PixelTrackBHadron/*"]) postProcessorTrackSummaryTrackingOnly = postProcessorTrackSummary.clone() -postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackSeeding", "Tracking/PixelTrack", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*", "Tracking/PixelTrackBHadron/*"]) +postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackBHadron", "Tracking/TrackSeeding", "Tracking/PixelTrack", "Tracking/PixelTrackFromPV", "Tracking/PixelTrackFromPVAllTP", "Tracking/PixelTrackBHadron"]) postProcessorTrackSequenceTrackingOnly = cms.Sequence( postProcessorTrackTrackingOnly+ postProcessorTrackNrecVsNsim+ postProcessorTrackSummaryTrackingOnly ) + +fastSim.toModify(postProcessorTrackTrackingOnly,subDirs = [e for e in _defaultSubdirs if e not in ["Tracking/TrackGsf/*","Tracking/TrackConversion/*","Tracking/TrackBHadron/*"]]) +fastSim.toModify(postProcessorTrackSummaryTrackingOnly,subDirs = [e for e in _defaultSubdirsSummary if e not in ["Tracking/TrackGsf","Tracking/TrackConversion","Tracking/TrackBHadron"]]) From bb61e1c0edcbcb17d5fcd5c149b9de937cbaae87 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 16 Nov 2020 11:56:30 +0100 Subject: [PATCH 093/102] Synchronise with CMSSW_11_2_0_pre9 --- .../TkSeedGenerator/plugins/BuildFile.xml | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml index b18d26ee39e11..a743aeca5631f 100644 --- a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml +++ b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml @@ -1,13 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 7473687e4eb9e03a47dc57cba404505d15cf1f63 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 28 Nov 2020 00:18:26 +0100 Subject: [PATCH 094/102] Move hit indexes to 32 bits (cms-patatrack#583) Add a counter for forlorn doublets. --- .../Track/interface/PixelTrackHeterogeneous.h | 2 +- .../PixelTriplets/plugins/CAConstants.h | 2 +- .../plugins/CAHitNtupletGeneratorKernels.cc | 1 + .../plugins/CAHitNtupletGeneratorKernels.cu | 33 ++++++++++--------- .../CAHitNtupletGeneratorKernelsImpl.h | 5 +-- 5 files changed, 24 insertions(+), 19 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h index d462be2c5dd7b..e79a32c21daa0 100644 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -16,7 +16,7 @@ class TrackSoAT { static constexpr int32_t stride() { return S; } using Quality = trackQuality::Quality; - using hindex_type = uint16_t; + using hindex_type = uint32_t; using HitContainer = cms::cuda::OneToManyAssoc; // Always check quality is at least loose! 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 0ebbdf3ed3705..9eea4f528fcdb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -44,7 +44,7 @@ namespace CAConstants { constexpr uint32_t maxTuples() { return maxNumberOfTuples(); } // types - using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct + using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct using tindex_type = uint16_t; // for tuples #ifndef ONLY_PHICUT diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 4d4791b87ad3b..1646cb503ff81 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -134,6 +134,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * if (m_params.doStats_) { kernel_checkOverflows(tuples_d, device_tupleMultiplicity_.get(), + device_hitToTuple_.get(), device_hitTuple_apc_, device_theCells_.get(), device_nCells_, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 8a213eee2f579..a8dac7992f4fa 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -125,21 +125,6 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaCheck(cudaGetLastError()); } - if (m_params.doStats_) { - numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), - device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - cudaCheck(cudaGetLastError()); - } #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -278,6 +263,24 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA cudaCheck(cudaGetLastError()); } + if (m_params.doStats_) { + auto nhits = hh.nHits(); + numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + kernel_checkOverflows<<>>(tuples_d, + device_tupleMultiplicity_.get(), + device_hitToTuple_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + device_isOuterHitOfCell_.get(), + nhits, + m_params.maxNumberOfDoublets_, + counters_); + cudaCheck(cudaGetLastError()); + } + if (m_params.doStats_) { // counters (add flag???) 
numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 691395887dddb..3a935efbe2b4b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -30,7 +30,8 @@ using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, - CAConstants::TupleMultiplicity *tupleMultiplicity, + CAConstants::TupleMultiplicity const *tupleMultiplicity, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, @@ -95,7 +96,7 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, atomicAdd(&c.nKilledCells, 1); if (0 == thisCell.theUsed) atomicAdd(&c.nEmptyCells, 1); - if (thisCell.tracks().empty()) + if (0 == hitToTuple->size(thisCell.get_inner_hit_id()) && 0 == hitToTuple->size(thisCell.get_outer_hit_id())) atomicAdd(&c.nZeroTrackCells, 1); } From 50f697e0910f598f0f8ce729dc00d2b27a63bc6b Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 18 Dec 2020 18:26:58 +0100 Subject: [PATCH 095/102] Clean up the pixel local reconstruction code (cms-patatrack#593) Address the pixel local reconstruction review comments. General clean up of the pixel local reconstruction code: - remove commented out and obsolete code and data members - use named constants more consistently - update variable names to follow the coding rules and for better consistency - use member initializer lists in the constructors - allow `if constexpr` in CUDA code - use `std::size` instead of hardcoding the array size - convert iterator-based loops to range-based loops - replace `cout` and `printf` with `LogDebug` or `LogWarning` - use put tokens - reorganise the auto-generated cfi files and use them more consistently - adjust code after rearranging an `#ifdef GPU_DEBUG` block - apply code formatting - other minor changes Improve comments: - improve comments and remove obsolete ones - clarify comments and types regarding `HostProduct` - update comments about `GPU_SMALL_EVENTS` being kept for testing purposes - add notes about the original cpu code Reuse some more common code: - move common pixel cluster code to `PixelClusterizerBase` - extend the `SiPixelCluster` constructor Rename classes and modules for better consistency: - remove the `TrackingRecHit2DCUDA.h` and `gpuClusteringConstants.h` forwarding headers - rename `PixelRecHits` to `PixelRecHitGPUKernel` - rename SiPixelRecHitFromSOA to SiPixelRecHitFromCUDA - rename `siPixelClustersCUDAPreSplitting` to `siPixelClustersPreSplittingCUDA` - rename `siPixelRecHitsCUDAPreSplitting` to `siPixelRecHitsPreSplittingCUDA` - rename `siPixelRecHitsLegacyPreSplitting` to `siPixelRecHitsPreSplittingLegacy` - rename `siPixelRecHitHostSoA` to `siPixelRecHitSoAFromLegacy` Re-apply changes from #29805 that were lost in the Patatrack branch. 
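As an illustration of the "allow `if constexpr` in CUDA code" item above: code shared between host and device no longer needs to hide `constexpr` behind an `#ifdef __CUDACC__` guard. A minimal sketch of the pattern, with placeholder names (`Traits::runsOnDevice`, `zeroCounters` and `counters` are illustrative only, not identifiers from this patch):

    template <typename Traits>
    void zeroCounters(uint32_t* counters, cudaStream_t stream) {
      // the branch is resolved at compile time, so no #ifdef __CUDACC__ guard is needed
      if constexpr (Traits::runsOnDevice) {
        cudaCheck(cudaMemsetAsync(counters, 0, sizeof(uint32_t), stream));  // device memory, asynchronous
      } else {
        *counters = 0;  // plain host memory
      }
    }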
--- .../Track/interface/PixelTrackHeterogeneous.h | 2 + .../python/customizePixelTracksSoAonCPU.py | 8 ++-- .../plugins/PixelTrackProducerFromSoA.cc | 6 +-- .../plugins/BrokenLineFitOnGPU.h | 2 +- .../PixelTriplets/plugins/CAConstants.h | 38 +++++++++---------- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 4 +- .../CAHitNtupletGeneratorKernelsAlloc.h | 10 ++--- .../plugins/CAHitNtupletGeneratorOnGPU.h | 2 +- .../PixelTriplets/plugins/GPUCACell.h | 5 +-- .../PixelTriplets/plugins/HelixFitOnGPU.h | 2 +- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 2 +- .../plugins/gpuPixelDoubletsAlgos.h | 6 +-- .../python/customizePixelOnlyForProfiling.py | 2 +- 13 files changed, 41 insertions(+), 48 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h index e79a32c21daa0..41936b5fc7077 100644 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -57,8 +57,10 @@ class TrackSoAT { namespace pixelTrack { #ifdef GPU_SMALL_EVENTS + // kept for testing and debugging constexpr uint32_t maxNumber() { return 2 * 1024; } #else + // tested on MC events with 55-75 pileup events constexpr uint32_t maxNumber() { return 32 * 1024; } #endif diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py index bcd9dbdc51ea4..909959f2d81be 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py @@ -9,8 +9,8 @@ def customizePixelTracksSoAonCPU(process): # ensure the same results when running on GPU (which supports only the 'HLT' payload) and CPU process.siPixelClustersPreSplitting.cpu.payloadType = cms.string('HLT') - from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHostSoA_cfi import siPixelRecHitHostSoA - process.siPixelRecHitsPreSplitting = siPixelRecHitHostSoA.clone( + from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy + process.siPixelRecHitsPreSplitting = siPixelRecHitSoAFromLegacy.clone( convertToLegacy = True ) @@ -54,8 +54,8 @@ def customizePixelTracksSoAonCPUForProfiling(process): process.MessageLogger.cerr.FwkReport.reportEvery = 100 process = customizePixelTracksSoAonCPU(process) - process.siPixelRecHitHostSoA.convertToLegacy = False + process.siPixelRecHitSoAFromLegacy.convertToLegacy = False - process.TkSoA = cms.Path(process.offlineBeamSpot+process.siPixelDigis+process.siPixelClustersPreSplitting+process.siPixelRecHitHostSoA+process.pixelTrackSoA+process.pixelVertexSoA) + process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA + process.pixelVertexSoA) process.schedule = cms.Schedule(process.TkSoA) return process diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 522678ce352f5..cdea22c3a8a24 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -47,8 +47,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); - // using HitModuleStart = std::array; - using 
HMSstorage = HostProduct; + // using HitModuleStart = std::array; + using HMSstorage = HostProduct; private: void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; @@ -77,7 +77,7 @@ void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); desc.add("trackSrc", edm::InputTag("pixelTrackSoA")); - desc.add("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsLegacyPreSplitting")); + desc.add("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsPreSplittingLegacy")); desc.add("minNumberOfHits", 0); descriptions.addWithDefaultLabel(desc); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 7b5c9ea0ce0a4..96a641829d797 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -8,7 +8,7 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 9eea4f528fcdb..d9c3ff70e35ed 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -10,34 +10,30 @@ #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -// #define ONLY_PHICUT +//#define ONLY_PHICUT namespace CAConstants { // constants -#ifndef ONLY_PHICUT +#ifdef ONLY_PHICUT + constexpr uint32_t maxNumberOfTuples() { return 48 * 1024; } + constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } + constexpr uint32_t maxCellsPerHit() { return 8 * 128; } +#else #ifdef GPU_SMALL_EVENTS + // kept for testing and debugging constexpr uint32_t maxNumberOfTuples() { return 3 * 1024; } + constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } + constexpr uint32_t maxCellsPerHit() { return 128 / 2; } #else + // tested on MC events with 55-75 pileup events constexpr uint32_t maxNumberOfTuples() { return 24 * 1024; } -#endif -#else - constexpr uint32_t maxNumberOfTuples() { return 48 * 1024; } -#endif - constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } -#ifndef ONLY_PHICUT -#ifndef GPU_SMALL_EVENTS constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; } constexpr uint32_t maxCellsPerHit() { return 128; } -#else - constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } - constexpr uint32_t maxCellsPerHit() { return 128 / 2; } -#endif -#else - constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } - constexpr uint32_t maxCellsPerHit() { return 8 * 128; } #endif +#endif // ONLY_PHICUT constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; } + constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } constexpr uint32_t maxNumberOfLayerPairs() { return 20; } constexpr uint32_t maxNumberOfLayers() { return 10; } @@ -45,14 +41,14 @@ namespace CAConstants { // types using hindex_type = uint32_t; // FIXME from 
siPixelRecHitsHeterogeneousProduct - using tindex_type = uint16_t; // for tuples + using tindex_type = uint16_t; // for tuples -#ifndef ONLY_PHICUT - using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; -#else +#ifdef ONLY_PHICUT using CellNeighbors = cms::cuda::VecArray; using CellTracks = cms::cuda::VecArray; +#else + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; #endif using CellNeighborsVector = cms::cuda::SimpleVector; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 31e5070e55e05..3b1ea6fe158b2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -20,7 +20,7 @@ #include "CAHitNtupletGeneratorOnGPU.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" class CAHitNtupletCUDA : public edm::global::EDProducer<> { public: @@ -58,7 +58,7 @@ void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descript edm::ParameterSetDescription desc; desc.add("onGPU", true); - desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); CAHitNtupletGeneratorOnGPU::fillDescriptions(desc); auto label = "caHitNtupletCUDA"; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index fb750267f5c37..1c34275d6bbe2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -1,7 +1,7 @@ -#include "CAHitNtupletGeneratorKernels.h" - #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CAHitNtupletGeneratorKernels.h" + template <> #ifdef __CUDACC__ void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) { @@ -25,11 +25,7 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); - if -#ifndef __CUDACC__ - constexpr -#endif - (std::is_same::value) { + if constexpr (std::is_same::value) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index e920ebf7a803d..afb591744bf59 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -2,7 +2,7 @@ #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 
e913b77fe0953..2a74d6a064e73 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -9,7 +9,7 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" @@ -293,8 +293,7 @@ class GPUCACell { assert(tmpNtuplet.size() <= 4); bool last = true; - for (int j = 0; j < outerNeighbors().size(); ++j) { - auto otherCell = outerNeighbors()[j]; + for (unsigned int otherCell : outerNeighbors()) { if (cells[otherCell].theDoubletId < 0) continue; // killed by earlyFishbone last = false; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 05b399e870f58..42f8f0e720b43 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -2,7 +2,7 @@ #define RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "CAConstants.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 75e9d570a129e..a16374278233a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -6,7 +6,7 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 4eb6823907bcc..d055c8b7cb867 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -7,7 +7,7 @@ #include #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" @@ -105,7 +105,7 @@ namespace gpuPixelDoublets { // found hit corresponding to our cuda thread, now do the job auto mi = hh.detectorIndex(i); - if (mi > 2000) + if (mi > gpuClustering::maxNumModules) continue; // invalid /* maybe clever, not effective when zoCut is on @@ -201,7 +201,7 @@ namespace gpuPixelDoublets { assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); auto mo = hh.detectorIndex(oi); - if (mo > 2000) + if (mo > gpuClustering::maxNumModules) continue; // invalid if (doZ0Cut && z0cutoff(oi)) diff --git 
a/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py index d46764dbd7edd..24774bbda649c 100644 --- a/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py +++ b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py @@ -25,7 +25,7 @@ def customizePixelOnlyForProfilingGPUOnly(process): # tracks and vertices on the CPU in SoA format, without conversion to legacy format. def customizePixelOnlyForProfilingGPUWithHostCopy(process): - #? process.siPixelRecHitHostSoA.convertToLegacy = False + #? process.siPixelRecHitSoAFromLegacy.convertToLegacy = False process.consumer = cms.EDAnalyzer("GenericConsumer", eventProducts = cms.untracked.vstring('pixelTrackSoA', 'pixelVertexSoA') From e2c52fb35f8f9af81ab19ae45ac34a5861982264 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 24 Dec 2020 15:43:59 +0100 Subject: [PATCH 096/102] Synchronise with CMSSW_11_3_0_pre1 --- Validation/RecoTrack/python/TrackValidation_cff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index fc45589bd1569..05b546c077c55 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -281,7 +281,7 @@ def _getMVASelectors(postfix): jets = "ak4PFJets" ) from JetMETCorrections.Configuration.JetCorrectors_cff import * -import CommonTools.RecoAlgos.jetTracksAssociationToTrackRefs_cfi as jetTracksAssociationToTrackRefs_cfi +import JetMETCorrections.JetCorrector.jetTracksAssociationToTrackRefs_cfi as jetTracksAssociationToTrackRefs_cfi cutsRecoTracksAK4PFJets = jetTracksAssociationToTrackRefs_cfi.jetTracksAssociationToTrackRefs.clone( association = "ak4JetTracksAssociatorExplicitAll", jets = "ak4PFJets", From 24554638007bee6c5cab5d954dab260d04ac23e9 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 31 Dec 2020 00:20:31 +0100 Subject: [PATCH 097/102] Clean up the pixel local reconstruction code (cms-patatrack#602) Address the pixel local reconstruction review comments: - remove obsolete comments; - consistently use named constants; - rename data members and methods to be more descriptive; - rename local variables according to the coding rules and for consistency with cms-sw#32591; - update transient dictionaries to match data types. --- .../plugins/gpuPixelDoubletsAlgos.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index d055c8b7cb867..5c0d5a252b684 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -50,9 +50,9 @@ namespace gpuPixelDoublets { bool isOuterLadder = ideal_cond; - using Hist = TrackingRecHit2DSOAView::Hist; + using PhiBinner = TrackingRecHit2DSOAView::PhiBinner; - auto const& __restrict__ hist = hh.phiBinner(); + auto const& __restrict__ phiBinner = hh.phiBinner(); uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); assert(offsets); @@ -93,7 +93,7 @@ namespace gpuPixelDoublets { uint8_t outer = layerPairs[2 * pairLayerId + 1]; assert(outer > inner); - auto hoff = Hist::histOff(outer); + auto hoff = PhiBinner::histOff(outer); auto i = (0 == pairLayerId) ? 
j : j - innerLayerCumulativeSize[pairLayerId - 1]; i += offsets[inner]; @@ -175,10 +175,10 @@ namespace gpuPixelDoublets { auto iphicut = phicuts[pairLayerId]; - auto kl = Hist::bin(int16_t(mep - iphicut)); - auto kh = Hist::bin(int16_t(mep + iphicut)); - auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; - // bool piWrap = std::abs(kh-kl) > Hist::nbins()/2; + auto kl = PhiBinner::bin(int16_t(mep - iphicut)); + auto kh = PhiBinner::bin(int16_t(mep + iphicut)); + auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); }; + // bool piWrap = std::abs(kh-kl) > PhiBinner::nbins()/2; #ifdef GPU_DEBUG int tot = 0; @@ -191,10 +191,10 @@ namespace gpuPixelDoublets { for (auto kk = kl; kk != khh; incr(kk)) { #ifdef GPU_DEBUG if (kk != kl && kk != kh) - nmin += hist.size(kk + hoff); + nmin += phiBinner.size(kk + hoff); #endif - auto const* __restrict__ p = hist.begin(kk + hoff); - auto const* __restrict__ e = hist.end(kk + hoff); + auto const* __restrict__ p = phiBinner.begin(kk + hoff); + auto const* __restrict__ e = phiBinner.end(kk + hoff); p += first; for (; p < e; p += stride) { auto oi = __ldg(p); From 2f0c8d4c230068fddce0dbaf1d66ca28797d3beb Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 26 Dec 2020 01:02:39 +0100 Subject: [PATCH 098/102] Exclude the changes related to TP association on GPU --- .../RecoTrack/python/TrackValidation_cff.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index 05b546c077c55..b5dcb35da03cd 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -528,11 +528,6 @@ def _getMVASelectors(postfix): # Built tracks, in the standard sequence mainly for monitoring the track selection MVA tpClusterProducerPreSplitting = tpClusterProducer.clone(pixelClusterSrc = "siPixelClustersPreSplitting") quickTrackAssociatorByHitsPreSplitting = quickTrackAssociatorByHits.clone(cluster2TPSrc = "tpClusterProducerPreSplitting") - -tpClusterProducerCUDAPreSplitting = tpClusterProducerCUDA.clone( - pixelClusterSrc = "siPixelClustersPreSplitting" -) - _trackValidatorSeedingBuilding = trackValidator.clone( # common for built tracks and seeds (in trackingOnly) associators = ["quickTrackAssociatorByHits"], UseAssociators = True, @@ -671,16 +666,6 @@ def _uniqueFirstLayers(layerList): VertexAssociatorByPositionAndTracks, trackingParticleNumberOfLayersProducer ) - -#gpu tp ??? 
-from Configuration.ProcessModifiers.gpu_cff import gpu -tpClusterProducerPreSplittingCUDA = cms.Task( - tpClusterProducerCUDAPreSplitting -) -_tracksValidationTruth_gpu = tracksValidationTruth.copy() -_tracksValidationTruth_gpu.add(tpClusterProducerPreSplittingCUDA) -gpu.toReplaceWith(tracksValidationTruth,_tracksValidationTruth_gpu) - fastSim.toModify(tracksValidationTruth, lambda x: x.remove(tpClusterProducer)) tracksPreValidation = cms.Task( @@ -969,7 +954,6 @@ def _uniqueFirstLayers(layerList): dodEdxPlots = False, ) - tracksValidationTruthPixelTrackingOnly = tracksValidationTruth.copy() tracksValidationTruthPixelTrackingOnly.replace(trackingParticleRecoTrackAsssociation, trackingParticlePixelTrackAsssociation) tracksValidationTruthPixelTrackingOnly.replace(VertexAssociatorByPositionAndTracks, PixelVertexAssociatorByPositionAndTracks) @@ -979,7 +963,7 @@ def _uniqueFirstLayers(layerList): tracksValidationTruthPixelTrackingOnly, trackingParticlesSignal, pixelTracksPt09, - pixelTracksFromPV, + pixelTracksFromPV, pixelTracksFromPVPt09, ) tracksValidationPixelTrackingOnly = cms.Sequence( @@ -991,7 +975,6 @@ def _uniqueFirstLayers(layerList): ) - ### Lite mode (only generalTracks and HP) trackValidatorLite = trackValidator.clone( label = ["generalTracks", "cutsRecoTracksHp"] From 35c68176f6af3486c689cf0ca85c5097a6201583 Mon Sep 17 00:00:00 2001 From: Eric Cano <37585813+ericcano@users.noreply.github.com> Date: Tue, 23 Mar 2021 22:17:52 +0100 Subject: [PATCH 099/102] Clean up the pixel track reconstruction code (cms-patatrack#606) Update EDM access: - switch to consumes() scheme for event setup; - simplify some event data access. Style fixes: - make class member private & fix problematic cast; - format of comments for clang-tidy; - change to enum class to avoid creating a namespace (usage becomes: pixelTrack::Quality::loose); - add article reference in comment (it was already further down in the file); - fix member functions and classes capitalization; - fix one letter or upper case variable names in formulas (trying to keep the naming from the reference article). Avoid some code repetitions.
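To make the `enum class` item above concrete, call sites change roughly as follows (a sketch only; `tracks` and the index `i` stand for any track SoA instance and track index, they are not names from this patch):

    // before: plain enum in a helper namespace, enumerators injected into that namespace
    // trackQuality::Quality q = trackQuality::loose;
    // after: scoped enum inside the pixelTrack namespace, enumerators must be qualified
    pixelTrack::Quality q = pixelTrack::Quality::loose;
    bool keep = (tracks.quality(i) >= pixelTrack::Quality::loose);  // always check quality is at least loose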
--- .../Track/interface/PixelTrackHeterogeneous.h | 75 +-- .../Track/interface/TrackSoAHeterogeneousT.h | 73 ++ ...ectoryStateSoA.h => TrajectoryStateSoAT.h} | 8 +- CUDADataFormats/Track/src/classes.h | 2 +- .../Track/test/TrajectoryStateSOA_t.h | 4 +- .../PixelTrackFitting/interface/BrokenLine.h | 627 ++++++++++-------- .../PixelTrackFitting/interface/FitResult.h | 16 +- .../PixelTrackFitting/interface/FitUtils.h | 140 ++-- .../PixelTrackFitting/interface/RiemannFit.h | 599 ++++++++--------- .../plugins/PixelNtupletsFitterProducer.cc | 8 +- .../plugins/PixelTrackProducerFromSoA.cc | 56 +- .../plugins/PixelTrackSoAFromCUDA.cc | 22 +- .../src/PixelNtupletsFitter.cc | 10 +- .../PixelTrackFitting/test/BuildFile.xml | 8 +- ...elTrackRiemannFit.cc => PixelTrackFits.cc} | 52 +- .../PixelTrackFitting/test/testEigenGPU.cu | 158 ++--- .../test/testEigenJacobian.cpp | 6 +- .../test/{testRiemannFit.cpp => testFits.cpp} | 45 +- .../PixelTriplets/interface/CircleEq.h | 10 - .../plugins/BrokenLineFitOnGPU.cc | 90 +-- .../plugins/BrokenLineFitOnGPU.cu | 88 +-- .../plugins/BrokenLineFitOnGPU.h | 67 +- .../PixelTriplets/plugins/CAConstants.h | 78 ++- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 14 +- .../plugins/CAHitNtupletGeneratorKernels.cc | 61 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 85 ++- .../plugins/CAHitNtupletGeneratorKernels.h | 44 +- .../CAHitNtupletGeneratorKernelsAlloc.h | 4 +- .../CAHitNtupletGeneratorKernelsImpl.h | 111 ++-- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 14 +- .../plugins/CAHitNtupletGeneratorOnGPU.h | 1 - .../PixelTriplets/plugins/GPUCACell.h | 183 ++--- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 12 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 34 +- .../PixelTriplets/plugins/RecHitsMap.h | 84 --- .../PixelTriplets/plugins/RiemannFitOnGPU.cc | 169 ++--- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 170 ++--- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 92 +-- .../PixelTriplets/plugins/gpuFishbone.h | 34 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 20 +- .../plugins/gpuPixelDoubletsAlgos.h | 21 +- .../PixelTriplets/test/CAsizes_t.cpp | 2 +- .../plugins/SeedProducerFromSoA.cc | 34 +- 43 files changed, 1705 insertions(+), 1726 deletions(-) create mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h rename CUDADataFormats/Track/interface/{TrajectoryStateSoA.h => TrajectoryStateSoAT.h} (90%) rename RecoPixelVertexing/PixelTrackFitting/test/{PixelTrackRiemannFit.cc => PixelTrackFits.cc} (90%) rename RecoPixelVertexing/PixelTrackFitting/test/{testRiemannFit.cpp => testFits.cpp} (78%) delete mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h index 41936b5fc7077..3ee5af80353dd 100644 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -1,76 +1,9 @@ -#ifndef CUDADataFormatsTrackTrackHeterogeneous_H -#define CUDADataFormatsTrackTrackHeterogeneous_H - -#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h +#define CUDADataFormats_Track_PixelTrackHeterogeneous_h #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" - -namespace trackQuality { - enum Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; -} - -template -class TrackSoAT { -public: - static constexpr int32_t 
stride() { return S; } - - using Quality = trackQuality::Quality; - using hindex_type = uint32_t; - using HitContainer = cms::cuda::OneToManyAssoc; - - // Always check quality is at least loose! - // CUDA does not support enums in __lgc ... - eigenSoA::ScalarSoA m_quality; - constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i)); } - constexpr Quality &quality(int32_t i) { return (Quality &)(m_quality(i)); } - constexpr Quality const *qualityData() const { return (Quality const *)(m_quality.data()); } - constexpr Quality *qualityData() { return (Quality *)(m_quality.data()); } - - // this is chi2/ndof as not necessarely all hits are used in the fit - eigenSoA::ScalarSoA chi2; - - constexpr int nHits(int i) const { return detIndices.size(i); } - - // State at the Beam spot - // phi,tip,1/pt,cotan(theta),zip - TrajectoryStateSoA stateAtBS; - eigenSoA::ScalarSoA eta; - eigenSoA::ScalarSoA pt; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... - // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - - HitContainer hitIndices; - HitContainer detIndices; - - // total number of tracks (including those not fitted) - uint32_t m_nTracks; -}; - -namespace pixelTrack { - -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumber() { return 2 * 1024; } -#else - // tested on MC events with 55-75 pileup events - constexpr uint32_t maxNumber() { return 32 * 1024; } -#endif - - using TrackSoA = TrackSoAT; - using TrajectoryState = TrajectoryStateSoA; - using HitContainer = TrackSoA::HitContainer; - using Quality = trackQuality::Quality; - -} // namespace pixelTrack +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" using PixelTrackHeterogeneous = HeterogeneousSoA; -#endif // CUDADataFormatsTrackTrackSoA_H +#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h \ No newline at end of file diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h new file mode 100644 index 0000000000000..bd39f3c4d3bfe --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h @@ -0,0 +1,73 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H +#define CUDADataFormats_Track_TrackHeterogeneousT_H + +#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" + +namespace pixelTrack { + enum class Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; +} + +template +class TrackSoAHeterogeneousT { +public: + static constexpr int32_t stride() { return S; } + + using Quality = pixelTrack::Quality; + using hindex_type = uint32_t; + using HitContainer = cms::cuda::OneToManyAssoc; + + // Always check quality is at least loose! + // CUDA does not support enums in __lgc ... 
+private: + eigenSoA::ScalarSoA quality_; + +public: + constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } + constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } + constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } + constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } + + // this is chi2/ndof as not necessarely all hits are used in the fit + eigenSoA::ScalarSoA chi2; + + constexpr int nHits(int i) const { return detIndices.size(i); } + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + TrajectoryStateSoAT stateAtBS; + eigenSoA::ScalarSoA eta; + eigenSoA::ScalarSoA pt; + constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } + constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } + constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } + constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } + + // state at the detector of the outermost hit + // representation to be decided... + // not yet filled on GPU + // TrajectoryStateSoA stateAtOuterDet; + + HitContainer hitIndices; + HitContainer detIndices; +}; + +namespace pixelTrack { + +#ifdef GPU_SMALL_EVENTS + // kept for testing and debugging + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumber() { return 32 * 1024; } +#endif + + using TrackSoA = TrackSoAHeterogeneousT; + using TrajectoryState = TrajectoryStateSoAT; + using HitContainer = TrackSoA::HitContainer; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h similarity index 90% rename from CUDADataFormats/Track/interface/TrajectoryStateSoA.h rename to CUDADataFormats/Track/interface/TrajectoryStateSoAT.h index 7cd2e93fb914e..64fcd573a6991 100644 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h @@ -1,11 +1,11 @@ -#ifndef CUDADataFormatsTrackTrajectoryStateSOA_H -#define CUDADataFormatsTrackTrajectoryStateSOA_H +#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H +#define CUDADataFormats_Track_TrajectoryStateSOAT_H #include #include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" template -struct TrajectoryStateSoA { +struct TrajectoryStateSoAT { using Vector5f = Eigen::Matrix; using Vector15f = Eigen::Matrix; @@ -56,4 +56,4 @@ struct TrajectoryStateSoA { } }; -#endif // CUDADataFormatsTrackTrajectoryStateSOA_H +#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 4843818978cca..97c116f6c88d3 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -3,7 +3,7 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h index c8e92aca2628f..97b88873c2613 100644 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ 
b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h" +#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" using Vector5d = Eigen::Matrix; using Matrix5d = Eigen::Matrix; @@ -17,7 +17,7 @@ __host__ __device__ Matrix5d loadCov(Vector5d const& e) { return cov; } -using TS = TrajectoryStateSoA<128>; +using TS = TrajectoryStateSoAT<128>; __global__ void testTSSoA(TS* pts, int n) { assert(n <= 128); diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h index be1b67be89c35..86fe6a278777c 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h @@ -5,58 +5,66 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" -namespace BrokenLine { +namespace brokenline { //!< Karimäki's parameters: (phi, d, k=1/R) /*!< covariance matrix: \n |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n |cov(phi, d )|cov( d , d )|cov( k , d )| \n - |cov(phi, k )|cov( d , k )|cov( k , k )| + |cov(phi, k )|cov( d , k )|cov( k , k )| \n + as defined in Karimäki V., 1990, Effective circle fitting for particle trajectories, + Nucl. Instr. and Meth. A305 (1991) 187. */ - using karimaki_circle_fit = Rfit::circle_fit; + using karimaki_circle_fit = riemannFit::CircleFit; /*! \brief data needed for the Broken Line fit procedure. */ - template + template struct PreparedBrokenLineData { - int q; //!< particle charge - Rfit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin - Rfit::VectorNd s; //!< total distance traveled in the transverse plane - // starting from the pre-fitted closest approach - Rfit::VectorNd S; //!< total distance traveled (three-dimensional) - Rfit::VectorNd Z; //!< orthogonal coordinate to the pre-fitted line in the sz plane - Rfit::VectorNd VarBeta; //!< kink angles in the SZ plane + int qCharge; //!< particle charge + riemannFit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + riemannFit::VectorNd sTransverse; //!< total distance traveled in the transverse plane + // starting from the pre-fitted closest approach + riemannFit::VectorNd sTotal; //!< total distance traveled (three-dimensional) + riemannFit::VectorNd zInSZplane; //!< orthogonal coordinate to the pre-fitted line in the sz plane + riemannFit::VectorNd varBeta; //!< kink angles in the SZ plane }; /*! \brief Computes the Coulomb multiple scattering variance of the planar angle. \param length length of the track in the material. - \param B magnetic field in Gev/cm/c. - \param R radius of curvature (needed to evaluate p). - \param Layer denotes which of the four layers of the detector is the endpoint of the multiple scattered track. For example, if Layer=3, then the particle has just gone through the material between the second and the third layer. + \param bField magnetic field in Gev/cm/c. + \param radius radius of curvature (needed to evaluate p). + \param layer denotes which of the four layers of the detector is the endpoint of the + * multiple scattered track. For example, if Layer=3, then the particle has + * just gone through the material between the second and the third layer. - \todo add another Layer variable to identify also the start point of the track, so if there are missing hits or multiple hits, the part of the detector that the particle has traversed can be exactly identified. 
+ \todo add another Layer variable to identify also the start point of the track, + * so if there are missing hits or multiple hits, the part of the detector that + * the particle has traversed can be exactly identified. - \warning the formula used here assumes beta=1, and so neglects the dependence of theta_0 on the mass of the particle at fixed momentum. + \warning the formula used here assumes beta=1, and so neglects the dependence + * of theta_0 on the mass of the particle at fixed momentum. \return the variance of the planar angle ((theta_0)^2 /3). */ - __host__ __device__ inline double MultScatt( - const double& length, const double B, const double R, int Layer, double slope) { + __host__ __device__ inline double multScatt( + const double& length, const double bField, const double radius, int layer, double slope) { // limit R to 20GeV... - auto pt2 = std::min(20., B * R); + auto pt2 = std::min(20., bField * radius); pt2 *= pt2; - constexpr double XXI_0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm + constexpr double inv_X0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm //if(Layer==1) XXI_0=0.06/16.; // else XXI_0=0.06/16.; //XX_0*=1; - constexpr double geometry_factor = - 0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned - constexpr double fact = geometry_factor * Rfit::sqr(13.6 / 1000.); - return fact / (pt2 * (1. + Rfit::sqr(slope))) * (std::abs(length) * XXI_0) * - Rfit::sqr(1. + 0.038 * log(std::abs(length) * XXI_0)); + + //! number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned + constexpr double geometry_factor = 0.7; + constexpr double fact = geometry_factor * riemannFit::sqr(13.6 / 1000.); + return fact / (pt2 * (1. + riemannFit::sqr(slope))) * (std::abs(length) * inv_X0) * + riemannFit::sqr(1. + 0.038 * log(std::abs(length) * inv_X0)); } /*! @@ -66,48 +74,67 @@ namespace BrokenLine { \return 2D rotation matrix. */ - __host__ __device__ inline Rfit::Matrix2d RotationMatrix(double slope) { - Rfit::Matrix2d Rot; - Rot(0, 0) = 1. / sqrt(1. + Rfit::sqr(slope)); - Rot(0, 1) = slope * Rot(0, 0); - Rot(1, 0) = -Rot(0, 1); - Rot(1, 1) = Rot(0, 0); - return Rot; + __host__ __device__ inline riemannFit::Matrix2d rotationMatrix(double slope) { + riemannFit::Matrix2d rot; + rot(0, 0) = 1. / sqrt(1. + riemannFit::sqr(slope)); + rot(0, 1) = slope * rot(0, 0); + rot(1, 0) = -rot(0, 1); + rot(1, 1) = rot(0, 0); + return rot; } /*! - \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a translation of the coordinate system, such that the old origin has coordinates (x0,y0) in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. + \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a + * translation of the coordinate system, such that the old origin has coordinates (x0,y0) + * in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective + * circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. - \param circle circle fit in the old coordinate system. + \param circle circle fit in the old coordinate system. circle.par(0) is phi, circle.par(1) is d and circle.par(2) is rho. \param x0 x coordinate of the translation vector. \param y0 y coordinate of the translation vector. \param jacobian passed by reference in order to save stack. 
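For reference, the variance returned by multScatt above is a Highland-style parameterisation; restated from the code (the 0.7 geometry factor and the inverse radiation length 0.06/16 cm^-1 are the hand-tuned constants noted in the comments):

    \sigma_\beta^2 \;=\; 0.7 \left(\frac{13.6\ \mathrm{MeV}}{p}\right)^{2}
                    \frac{|\ell|}{X_0} \left(1 + 0.038 \ln\frac{|\ell|}{X_0}\right)^{2},
    \qquad p^2 = p_T^2\,(1 + \mathrm{slope}^2), \quad p_T = \min(20\ \mathrm{GeV},\; B\,R).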
*/ - __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, + __host__ __device__ inline void translateKarimaki(karimaki_circle_fit& circle, double x0, double y0, - Rfit::Matrix3d& jacobian) { - double A, U, BB, C, DO, DP, uu, xi, v, mu, lambda, zeta; - DP = x0 * cos(circle.par(0)) + y0 * sin(circle.par(0)); - DO = x0 * sin(circle.par(0)) - y0 * cos(circle.par(0)) + circle.par(1); - uu = 1 + circle.par(2) * circle.par(1); - C = -circle.par(2) * y0 + uu * cos(circle.par(0)); - BB = circle.par(2) * x0 + uu * sin(circle.par(0)); - A = 2. * DO + circle.par(2) * (Rfit::sqr(DO) + Rfit::sqr(DP)); - U = sqrt(1. + circle.par(2) * A); - xi = 1. / (Rfit::sqr(BB) + Rfit::sqr(C)); - v = 1. + circle.par(2) * DO; - lambda = (0.5 * A) / (U * Rfit::sqr(1. + U)); - mu = 1. / (U * (1. + U)) + circle.par(2) * lambda; - zeta = Rfit::sqr(DO) + Rfit::sqr(DP); - - jacobian << xi * uu * v, -xi * Rfit::sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, - mu * zeta - lambda * A, 0, 0, 1.; - - circle.par(0) = atan2(BB, C); - circle.par(1) = A / (1 + U); - // circle.par(2)=circle.par(2); - + riemannFit::Matrix3d& jacobian) { + // Avoid multiple access to the circle.par vector. + using scalar = std::remove_reference::type; + scalar phi = circle.par(0); + scalar dee = circle.par(1); + scalar rho = circle.par(2); + + // Avoid repeated trig. computations + scalar sinPhi = sin(phi); + scalar cosPhi = cos(phi); + + // Intermediate computations for the circle parameters + scalar deltaPara = x0 * cosPhi + y0 * sinPhi; + scalar deltaOrth = x0 * sinPhi - y0 * cosPhi + dee; + scalar tempSmallU = 1 + rho * dee; + scalar tempC = -rho * y0 + tempSmallU * cosPhi; + scalar tempB = rho * x0 + tempSmallU * sinPhi; + scalar tempA = 2. * deltaOrth + rho * (riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara)); + scalar tempU = sqrt(1. + rho * tempA); + + // Intermediate computations for the error matrix transform + scalar xi = 1. / (riemannFit::sqr(tempB) + riemannFit::sqr(tempC)); + scalar tempV = 1. + rho * deltaOrth; + scalar lambda = (0.5 * tempA) / (riemannFit::sqr(1. + tempU) * tempU); + scalar mu = 1. / (tempU * (1. + tempU)) + rho * lambda; + scalar zeta = riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara); + jacobian << xi * tempSmallU * tempV, -xi * riemannFit::sqr(rho) * deltaOrth, xi * deltaPara, + 2. * mu * tempSmallU * deltaPara, 2. * mu * tempV, mu * zeta - lambda * tempA, 0, 0, 1.; + + // translated circle parameters + // phi + circle.par(0) = atan2(tempB, tempC); + // d + circle.par(1) = tempA / (1 + tempU); + // rho after translation. It is invariant, so noop + // circle.par(2)= rho; + + // translated error matrix circle.cov = jacobian * circle.cov * jacobian.transpose(); } @@ -115,95 +142,97 @@ namespace BrokenLine { \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. \param hits hits coordinates. - \param hits_cov hits covariance matrix. \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). - \param B magnetic field in Gev/cm/c. + \param bField magnetic field in Gev/cm/c. \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). 
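Written out, the translation implemented in translateKarimaki above amounts to the following, a plain restatement of the intermediate quantities computed in the code (D_par = deltaPara, D_orth = deltaOrth, u = tempSmallU, and A, B, C, U their temp* counterparts):

    D_\parallel = x_0\cos\phi + y_0\sin\phi, \qquad
    D_\perp = x_0\sin\phi - y_0\cos\phi + d, \qquad
    u = 1 + \rho d,
    C = -\rho y_0 + u\cos\phi, \qquad
    B = \rho x_0 + u\sin\phi, \qquad
    A = 2 D_\perp + \rho\,(D_\perp^2 + D_\parallel^2), \qquad
    U = \sqrt{1 + \rho A},
    \phi' = \operatorname{atan2}(B, C), \qquad
    d' = \frac{A}{1 + U}, \qquad
    \rho' = \rho,

with the covariance propagated as cov' = J cov J^T through the 3x3 Jacobian built from xi, lambda, mu and zeta as in the code.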
*/ - template + template __host__ __device__ inline void prepareBrokenLineData(const M3xN& hits, const V4& fast_fit, - const double B, - PreparedBrokenLineData& results) { - constexpr auto n = N; - u_int i; - Rfit::Vector2d d; - Rfit::Vector2d e; + const double bField, + PreparedBrokenLineData& results) { + riemannFit::Vector2d dVec; + riemannFit::Vector2d eVec; - d = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); - e = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); - results.q = Rfit::cross2D(d, e) > 0 ? -1 : 1; + dVec = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); + eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); + results.qCharge = riemannFit::cross2D(dVec, eVec) > 0 ? -1 : 1; - const double slope = -results.q / fast_fit(3); + const double slope = -results.qCharge / fast_fit(3); - Rfit::Matrix2d R = RotationMatrix(slope); + riemannFit::Matrix2d rotMat = rotationMatrix(slope); // calculate radii and s - results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * Rfit::MatrixXd::Constant(1, n, 1); - e = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); - for (i = 0; i < n; i++) { - d = results.radii.block(0, i, 2, 1); - results.s(i) = results.q * fast_fit(2) * atan2(Rfit::cross2D(d, e), d.dot(e)); // calculates the arc length + results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * riemannFit::MatrixXd::Constant(1, n, 1); + eVec = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); + for (u_int i = 0; i < n; i++) { + dVec = results.radii.block(0, i, 2, 1); + results.sTransverse(i) = results.qCharge * fast_fit(2) * + atan2(riemannFit::cross2D(dVec, eVec), dVec.dot(eVec)); // calculates the arc length } - Rfit::VectorNd z = hits.block(2, 0, 1, n).transpose(); - - //calculate S and Z - Rfit::Matrix2xNd pointsSZ = Rfit::Matrix2xNd::Zero(); - for (i = 0; i < n; i++) { - pointsSZ(0, i) = results.s(i); - pointsSZ(1, i) = z(i); - pointsSZ.block(0, i, 2, 1) = R * pointsSZ.block(0, i, 2, 1); + riemannFit::VectorNd zVec = hits.block(2, 0, 1, n).transpose(); + + //calculate sTotal and zVec + riemannFit::Matrix2xNd pointsSZ = riemannFit::Matrix2xNd::Zero(); + for (u_int i = 0; i < n; i++) { + pointsSZ(0, i) = results.sTransverse(i); + pointsSZ(1, i) = zVec(i); + pointsSZ.block(0, i, 2, 1) = rotMat * pointsSZ.block(0, i, 2, 1); } - results.S = pointsSZ.block(0, 0, 1, n).transpose(); - results.Z = pointsSZ.block(1, 0, 1, n).transpose(); - - //calculate VarBeta - results.VarBeta(0) = results.VarBeta(n - 1) = 0; - for (i = 1; i < n - 1; i++) { - results.VarBeta(i) = MultScatt(results.S(i + 1) - results.S(i), B, fast_fit(2), i + 2, slope) + - MultScatt(results.S(i) - results.S(i - 1), B, fast_fit(2), i + 1, slope); + results.sTotal = pointsSZ.block(0, 0, 1, n).transpose(); + results.zInSZplane = pointsSZ.block(1, 0, 1, n).transpose(); + + //calculate varBeta + results.varBeta(0) = results.varBeta(n - 1) = 0; + for (u_int i = 1; i < n - 1; i++) { + results.varBeta(i) = multScatt(results.sTotal(i + 1) - results.sTotal(i), bField, fast_fit(2), i + 2, slope) + + multScatt(results.sTotal(i) - results.sTotal(i - 1), bField, fast_fit(2), i + 1, slope); } } /*! - \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. This is the whole matrix in the case of the line fit and the main n-by-n block in the case of the circle fit. + \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. 
+ * This is the whole matrix in the case of the line fit and the main n-by-n block in the case + * of the circle fit. - \param w weights of the first part of the cost function, the one with the measurements and not the angles (\sum_{i=1}^n w*(y_i-u_i)^2). - \param S total distance traveled by the particle from the pre-fitted closest approach. - \param VarBeta kink angles' variance. + \param weights weights of the first part of the cost function, the one with the measurements + * and not the angles (\sum_{i=1}^n w*(y_i-u_i)^2). + \param sTotal total distance traveled by the particle from the pre-fitted closest approach. + \param varBeta kink angles' variance. \return the n-by-n matrix of the linear system */ - template - __host__ __device__ inline Rfit::MatrixNd MatrixC_u(const Rfit::VectorNd& w, - const Rfit::VectorNd& S, - const Rfit::VectorNd& VarBeta) { - constexpr u_int n = N; - u_int i; - - Rfit::MatrixNd C_U = Rfit::MatrixNd::Zero(); - for (i = 0; i < n; i++) { - C_U(i, i) = w(i); + template + __host__ __device__ inline riemannFit::MatrixNd matrixC_u(const riemannFit::VectorNd& weights, + const riemannFit::VectorNd& sTotal, + const riemannFit::VectorNd& varBeta) { + riemannFit::MatrixNd c_uMat = riemannFit::MatrixNd::Zero(); + for (u_int i = 0; i < n; i++) { + c_uMat(i, i) = weights(i); if (i > 1) - C_U(i, i) += 1. / (VarBeta(i - 1) * Rfit::sqr(S(i) - S(i - 1))); + c_uMat(i, i) += 1. / (varBeta(i - 1) * riemannFit::sqr(sTotal(i) - sTotal(i - 1))); if (i > 0 && i < n - 1) - C_U(i, i) += (1. / VarBeta(i)) * Rfit::sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + c_uMat(i, i) += + (1. / varBeta(i)) * riemannFit::sqr((sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); if (i < n - 2) - C_U(i, i) += 1. / (VarBeta(i + 1) * Rfit::sqr(S(i + 1) - S(i))); + c_uMat(i, i) += 1. / (varBeta(i + 1) * riemannFit::sqr(sTotal(i + 1) - sTotal(i))); if (i > 0 && i < n - 1) - C_U(i, i + 1) = - 1. / (VarBeta(i) * (S(i + 1) - S(i))) * (-(S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + c_uMat(i, i + 1) = + 1. / (varBeta(i) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 1) - sTotal(i - 1)) / ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); if (i < n - 2) - C_U(i, i + 1) += 1. / (VarBeta(i + 1) * (S(i + 1) - S(i))) * - (-(S(i + 2) - S(i)) / ((S(i + 2) - S(i + 1)) * (S(i + 1) - S(i)))); + c_uMat(i, i + 1) += + 1. / (varBeta(i + 1) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 2) - sTotal(i)) / ((sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i)))); if (i < n - 2) - C_U(i, i + 2) = 1. / (VarBeta(i + 1) * (S(i + 2) - S(i + 1)) * (S(i + 1) - S(i))); + c_uMat(i, i + 2) = 1. / (varBeta(i + 1) * (sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i))); - C_U(i, i) *= 0.5; + c_uMat(i, i) *= 0.5; } - return C_U + C_U.transpose(); + return c_uMat + c_uMat.transpose(); } /*! 
@@ -217,169 +246,179 @@ namespace BrokenLine { */ template - __host__ __device__ inline void BL_Fast_fit(const M3xN& hits, V4& result) { - constexpr uint32_t N = M3xN::ColsAtCompileTime; - constexpr auto n = N; // get the number of hits + __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) { + constexpr uint32_t n = M3xN::ColsAtCompileTime; - const Rfit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); - const Rfit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); - const Rfit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); + const riemannFit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); + const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); - auto tmp = 0.5 / Rfit::cross2D(c, a); + auto tmp = 0.5 / riemannFit::cross2D(c, a); result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; // check Wikipedia for these formulas - result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(Rfit::cross2D(b, a))); + result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(riemannFit::cross2D(b, a))); // Using Math Olympiad's formula R=abc/(4A) - const Rfit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); - const Rfit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + const riemannFit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const riemannFit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - result(3) = result(2) * atan2(Rfit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); + result(3) = result(2) * atan2(riemannFit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); // ds/dz slope between last and first point } /*! - \brief Performs the Broken Line fit in the curved track case (that is, the fit parameters are the interceptions u and the curvature correction \Delta\kappa). + \brief Performs the Broken Line fit in the curved track case (that is, the fit + * parameters are the interceptions u and the curvature correction \Delta\kappa). \param hits hits coordinates. \param hits_cov hits covariance matrix. \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). - \param B magnetic field in Gev/cm/c. + \param bField magnetic field in Gev/cm/c. \param data PreparedBrokenLineData. \param circle_results struct to be filled with the results in this form: -par parameter of the line in this form: (phi, d, k); \n -cov covariance matrix of the fitted parameter; \n -chi2 value of the cost function in the minimum. - \details The function implements the steps 2 and 3 of the Broken Line fit with the curvature correction.\n - The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and \Delta\kappa and their covariance matrix. - The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. 
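In formula form, the fast pre-fit (fastFit, earlier in this hunk) is the circumcircle through the first, middle and last hit plus a longitudinal slope; restated from the code, with a, b, c the three chord vectors and d, e the first and last hit taken relative to the fitted centre:

    R = \frac{|a|\,|b|\,|c|}{4 A_\triangle}, \qquad A_\triangle = \tfrac12\,|a \times b|,
    \qquad \tan\theta = \frac{R\,\operatorname{atan2}(d \times e,\; d \cdot e)}{z_{n-1} - z_0}.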
+ \details The function implements the steps 2 and 3 of the Broken Line fit + * with the curvature correction.\n + * The step 2 is the least square fit, done by imposing the minimum constraint on + * the cost function and solving the consequent linear system. It determines the + * fitted parameters u and \Delta\kappa and their covariance matrix. + * The step 3 is the correction of the fast pre-fitted parameters for the innermost + * part of the track. It is first done in a comfortable coordinate system (the one + * in which the first hit is the origin) and then the parameters and their + * covariance matrix are transformed to the original coordinate system. */ - template - __host__ __device__ inline void BL_Circle_fit(const M3xN& hits, - const M6xN& hits_ge, - const V4& fast_fit, - const double B, - PreparedBrokenLineData& data, - karimaki_circle_fit& circle_results) { - constexpr u_int n = N; - u_int i; - - circle_results.q = data.q; + template + __host__ __device__ inline void circleFit(const M3xN& hits, + const M6xN& hits_ge, + const V4& fast_fit, + const double bField, + PreparedBrokenLineData& data, + karimaki_circle_fit& circle_results) { + circle_results.qCharge = data.qCharge; auto& radii = data.radii; - const auto& s = data.s; - const auto& S = data.S; - auto& Z = data.Z; - auto& VarBeta = data.VarBeta; - const double slope = -circle_results.q / fast_fit(3); - VarBeta *= 1. + Rfit::sqr(slope); // the kink angles are projected! - - for (i = 0; i < n; i++) { - Z(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); + const auto& sTransverse = data.sTransverse; + const auto& sTotal = data.sTotal; + auto& zInSZplane = data.zInSZplane; + auto& varBeta = data.varBeta; + const double slope = -circle_results.qCharge / fast_fit(3); + varBeta *= 1. + riemannFit::sqr(slope); // the kink angles are projected! + + for (u_int i = 0; i < n; i++) { + zInSZplane(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); } - Rfit::Matrix2d V; // covariance matrix - Rfit::VectorNd w; // weights - Rfit::Matrix2d RR; // rotation matrix point by point - //double Slope; // slope of the circle point by point - for (i = 0; i < n; i++) { - V(0, 0) = hits_ge.col(i)[0]; // x errors - V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy - V(1, 1) = hits_ge.col(i)[2]; // y errors - //Slope=-radii(0,i)/radii(1,i); - RR = RotationMatrix(-radii(0, i) / radii(1, i)); - w(i) = 1. / ((RR * V * RR.transpose())(1, 1)); // compute the orthogonal weight point by point + riemannFit::Matrix2d vMat; // covariance matrix + riemannFit::VectorNd weightsVec; // weights + riemannFit::Matrix2d rotMat; // rotation matrix point by point + for (u_int i = 0; i < n; i++) { + vMat(0, 0) = hits_ge.col(i)[0]; // x errors + vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy + vMat(1, 1) = hits_ge.col(i)[2]; // y errors + rotMat = rotationMatrix(-radii(0, i) / radii(1, i)); + weightsVec(i) = + 1. 
/ ((rotMat * vMat * rotMat.transpose())(1, 1)); // compute the orthogonal weight point by point } - Rfit::VectorNplusONEd r_u; - r_u(n) = 0; - for (i = 0; i < n; i++) { - r_u(i) = w(i) * Z(i); + riemannFit::VectorNplusONEd r_uVec; + r_uVec(n) = 0; + for (u_int i = 0; i < n; i++) { + r_uVec(i) = weightsVec(i) * zInSZplane(i); } - Rfit::MatrixNplusONEd C_U; - C_U.block(0, 0, n, n) = MatrixC_u(w, s, VarBeta); - C_U(n, n) = 0; - //add the border to the C_u matrix - for (i = 0; i < n; i++) { - C_U(i, n) = 0; + riemannFit::MatrixNplusONEd c_uMat; + c_uMat.block(0, 0, n, n) = matrixC_u(weightsVec, sTransverse, varBeta); + c_uMat(n, n) = 0; + //add the border to the c_uMat matrix + for (u_int i = 0; i < n; i++) { + c_uMat(i, n) = 0; if (i > 0 && i < n - 1) { - C_U(i, n) += - -(s(i + 1) - s(i - 1)) * (s(i + 1) - s(i - 1)) / (2. * VarBeta(i) * (s(i + 1) - s(i)) * (s(i) - s(i - 1))); + c_uMat(i, n) += + -(sTransverse(i + 1) - sTransverse(i - 1)) * (sTransverse(i + 1) - sTransverse(i - 1)) / + (2. * varBeta(i) * (sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))); } if (i > 1) { - C_U(i, n) += (s(i) - s(i - 2)) / (2. * VarBeta(i - 1) * (s(i) - s(i - 1))); + c_uMat(i, n) += + (sTransverse(i) - sTransverse(i - 2)) / (2. * varBeta(i - 1) * (sTransverse(i) - sTransverse(i - 1))); } if (i < n - 2) { - C_U(i, n) += (s(i + 2) - s(i)) / (2. * VarBeta(i + 1) * (s(i + 1) - s(i))); + c_uMat(i, n) += + (sTransverse(i + 2) - sTransverse(i)) / (2. * varBeta(i + 1) * (sTransverse(i + 1) - sTransverse(i))); } - C_U(n, i) = C_U(i, n); + c_uMat(n, i) = c_uMat(i, n); if (i > 0 && i < n - 1) - C_U(n, n) += Rfit::sqr(s(i + 1) - s(i - 1)) / (4. * VarBeta(i)); + c_uMat(n, n) += riemannFit::sqr(sTransverse(i + 1) - sTransverse(i - 1)) / (4. * varBeta(i)); } #ifdef CPP_DUMP - std::cout << "CU5\n" << C_U << std::endl; + std::cout << "CU5\n" << c_uMat << std::endl; #endif - Rfit::MatrixNplusONEd I; - math::cholesky::invert(C_U, I); - // Rfit::MatrixNplusONEd I = C_U.inverse(); + riemannFit::MatrixNplusONEd iMat; + math::cholesky::invert(c_uMat, iMat); #ifdef CPP_DUMP - std::cout << "I5\n" << I << std::endl; + std::cout << "I5\n" << iMat << std::endl; #endif - Rfit::VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system + riemannFit::VectorNplusONEd uVec = iMat * r_uVec; // obtain the fitted parameters by solving the linear system // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); - Rfit::Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); - Rfit::Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); + riemannFit::Vector2d dVec = hits.block(0, 0, 2, 1) + (-zInSZplane(0) + uVec(0)) * radii.block(0, 0, 2, 1); + riemannFit::Vector2d eVec = hits.block(0, 1, 2, 1) + (-zInSZplane(1) + uVec(1)) * radii.block(0, 1, 2, 1); - circle_results.par << atan2((e - d)(1), (e - d)(0)), - -circle_results.q * (fast_fit(2) - sqrt(Rfit::sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), - circle_results.q * (1. / fast_fit(2) + u(n)); + circle_results.par << atan2((eVec - dVec)(1), (eVec - dVec)(0)), + -circle_results.qCharge * + (fast_fit(2) - sqrt(riemannFit::sqr(fast_fit(2)) - 0.25 * (eVec - dVec).squaredNorm())), + circle_results.qCharge * (1. 
/ fast_fit(2) + uVec(n)); - assert(circle_results.q * circle_results.par(1) <= 0); + assert(circle_results.qCharge * circle_results.par(1) <= 0); - Rfit::Vector2d eMinusd = e - d; + riemannFit::Vector2d eMinusd = eVec - dVec; double tmp1 = eMinusd.squaredNorm(); + double tmp2 = sqrt(riemannFit::sqr(2 * fast_fit(2)) - tmp1); - Rfit::Matrix3d jacobian; + riemannFit::Matrix3d jacobian; jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) / tmp1, 0, - (circle_results.q / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / - sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), - (circle_results.q / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / - sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), - 0, 0, 0, circle_results.q; + (circle_results.qCharge / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / tmp2, + (circle_results.qCharge / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / tmp2, 0, 0, 0, + circle_results.qCharge; - circle_results.cov << I(0, 0), I(0, 1), I(0, n), I(1, 0), I(1, 1), I(1, n), I(n, 0), I(n, 1), I(n, n); + circle_results.cov << iMat(0, 0), iMat(0, 1), iMat(0, n), iMat(1, 0), iMat(1, 1), iMat(1, n), iMat(n, 0), + iMat(n, 1), iMat(n, n); circle_results.cov = jacobian * circle_results.cov * jacobian.transpose(); //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... - TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), jacobian); - circle_results.cov(0, 0) += (1 + Rfit::sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); + auto eMinusDVec = eVec - dVec; + translateKarimaki(circle_results, 0.5 * eMinusDVec(0), 0.5 * eMinusDVec(1), jacobian); + circle_results.cov(0, 0) += + (1 + riemannFit::sqr(slope)) * multScatt(sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope); //...And translate back to the original system - TranslateKarimaki(circle_results, d(0), d(1), jacobian); + translateKarimaki(circle_results, dVec(0), dVec(1), jacobian); // compute chi2 circle_results.chi2 = 0; - for (i = 0; i < n; i++) { - circle_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); + for (u_int i = 0; i < n; i++) { + circle_results.chi2 += weightsVec(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); if (i > 0 && i < n - 1) - circle_results.chi2 += Rfit::sqr(u(i - 1) / (s(i) - s(i - 1)) - - u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + - u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / - VarBeta(i); + circle_results.chi2 += + riemannFit::sqr(uVec(i - 1) / (sTransverse(i) - sTransverse(i - 1)) - + uVec(i) * (sTransverse(i + 1) - sTransverse(i - 1)) / + ((sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))) + + uVec(i + 1) / (sTransverse(i + 1) - sTransverse(i)) + + (sTransverse(i + 1) - sTransverse(i - 1)) * uVec(n) / 2) / + varBeta(i); } // assert(circle_results.chi2>=0); @@ -389,108 +428,109 @@ namespace BrokenLine { \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u). \param hits hits coordinates. - \param hits_cov hits covariance matrix. \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). - \param B magnetic field in Gev/cm/c. + \param bField magnetic field in Gev/cm/c. \param data PreparedBrokenLineData. 
\param line_results struct to be filled with the results in this form: -par parameter of the line in this form: (cot(theta), Zip); \n -cov covariance matrix of the fitted parameter; \n -chi2 value of the cost function in the minimum. - \details The function implements the steps 2 and 3 of the Broken Line fit without the curvature correction.\n - The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and their covariance matrix. - The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. - */ - template - __host__ __device__ inline void BL_Line_fit(const M6xN& hits_ge, - const V4& fast_fit, - const double B, - const PreparedBrokenLineData& data, - Rfit::line_fit& line_results) { - constexpr u_int n = N; - u_int i; - + \details The function implements the steps 2 and 3 of the Broken Line fit without + * the curvature correction.\n + * The step 2 is the least square fit, done by imposing the minimum constraint + * on the cost function and solving the consequent linear system. It determines + * the fitted parameters u and their covariance matrix. + * The step 3 is the correction of the fast pre-fitted parameters for the innermost + * part of the track. It is first done in a comfortable coordinate system (the one + * in which the first hit is the origin) and then the parameters and their covariance + * matrix are transformed to the original coordinate system. + */ + template + __host__ __device__ inline void lineFit(const M6xN& hits_ge, + const V4& fast_fit, + const double bField, + const PreparedBrokenLineData& data, + riemannFit::LineFit& line_results) { const auto& radii = data.radii; - const auto& S = data.S; - const auto& Z = data.Z; - const auto& VarBeta = data.VarBeta; - - const double slope = -data.q / fast_fit(3); - Rfit::Matrix2d R = RotationMatrix(slope); - - Rfit::Matrix3d V = Rfit::Matrix3d::Zero(); // covariance matrix XYZ - Rfit::Matrix2x3d JacobXYZtosZ = Rfit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) - Rfit::VectorNd w = Rfit::VectorNd::Zero(); - for (i = 0; i < n; i++) { - V(0, 0) = hits_ge.col(i)[0]; // x errors - V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy - V(0, 2) = V(2, 0) = hits_ge.col(i)[3]; // cov_xz - V(1, 1) = hits_ge.col(i)[2]; // y errors - V(2, 1) = V(1, 2) = hits_ge.col(i)[4]; // cov_yz - V(2, 2) = hits_ge.col(i)[5]; // z errors + const auto& sTotal = data.sTotal; + const auto& zInSZplane = data.zInSZplane; + const auto& varBeta = data.varBeta; + + const double slope = -data.qCharge / fast_fit(3); + riemannFit::Matrix2d rotMat = rotationMatrix(slope); + + riemannFit::Matrix3d vMat = riemannFit::Matrix3d::Zero(); // covariance matrix XYZ + riemannFit::Matrix2x3d jacobXYZtosZ = + riemannFit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + riemannFit::VectorNd weights = riemannFit::VectorNd::Zero(); + for (u_int i = 0; i < n; i++) { + vMat(0, 0) = hits_ge.col(i)[0]; // x errors + vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy + vMat(0, 2) = vMat(2, 0) = hits_ge.col(i)[3]; // cov_xz + vMat(1, 1) = hits_ge.col(i)[2]; // y errors + vMat(2, 1) = vMat(1, 2) = hits_ge.col(i)[4]; // cov_yz + vMat(2, 2) = hits_ge.col(i)[5]; // z errors 
auto tmp = 1. / radii.block(0, i, 2, 1).norm(); - JacobXYZtosZ(0, 0) = radii(1, i) * tmp; - JacobXYZtosZ(0, 1) = -radii(0, i) * tmp; - JacobXYZtosZ(1, 2) = 1.; - w(i) = 1. / ((R * JacobXYZtosZ * V * JacobXYZtosZ.transpose() * R.transpose())( - 1, 1)); // compute the orthogonal weight point by point + jacobXYZtosZ(0, 0) = radii(1, i) * tmp; + jacobXYZtosZ(0, 1) = -radii(0, i) * tmp; + jacobXYZtosZ(1, 2) = 1.; + weights(i) = 1. / ((rotMat * jacobXYZtosZ * vMat * jacobXYZtosZ.transpose() * rotMat.transpose())( + 1, 1)); // compute the orthogonal weight point by point } - Rfit::VectorNd r_u; - for (i = 0; i < n; i++) { - r_u(i) = w(i) * Z(i); + riemannFit::VectorNd r_u; + for (u_int i = 0; i < n; i++) { + r_u(i) = weights(i) * zInSZplane(i); } #ifdef CPP_DUMP - std::cout << "CU4\n" << MatrixC_u(w, S, VarBeta) << std::endl; + std::cout << "CU4\n" << matrixC_u(w, sTotal, varBeta) << std::endl; #endif - Rfit::MatrixNd I; - math::cholesky::invert(MatrixC_u(w, S, VarBeta), I); - // Rfit::MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); + riemannFit::MatrixNd iMat; + math::cholesky::invert(matrixC_u(weights, sTotal, varBeta), iMat); #ifdef CPP_DUMP - std::cout << "I4\n" << I << std::endl; + std::cout << "I4\n" << iMat << std::endl; #endif - Rfit::VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system + riemannFit::VectorNd uVec = iMat * r_u; // obtain the fitted parameters by solving the linear system // line parameters in the system in which the first hit is the origin and with axis along SZ - line_results.par << (u(1) - u(0)) / (S(1) - S(0)), u(0); - auto idiff = 1. / (S(1) - S(0)); - line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * Rfit::sqr(idiff) + - MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope), - (I(0, 1) - I(0, 0)) * idiff, (I(0, 1) - I(0, 0)) * idiff, I(0, 0); + line_results.par << (uVec(1) - uVec(0)) / (sTotal(1) - sTotal(0)), uVec(0); + auto idiff = 1. / (sTotal(1) - sTotal(0)); + line_results.cov << (iMat(0, 0) - 2 * iMat(0, 1) + iMat(1, 1)) * riemannFit::sqr(idiff) + + multScatt(sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope), + (iMat(0, 1) - iMat(0, 0)) * idiff, (iMat(0, 1) - iMat(0, 0)) * idiff, iMat(0, 0); // translate to the original SZ system - Rfit::Matrix2d jacobian; + riemannFit::Matrix2d jacobian; jacobian(0, 0) = 1.; jacobian(0, 1) = 0; - jacobian(1, 0) = -S(0); + jacobian(1, 0) = -sTotal(0); jacobian(1, 1) = 1.; - line_results.par(1) += -line_results.par(0) * S(0); + line_results.par(1) += -line_results.par(0) * sTotal(0); line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // rotate to the original sz system - auto tmp = R(0, 0) - line_results.par(0) * R(0, 1); + auto tmp = rotMat(0, 0) - line_results.par(0) * rotMat(0, 1); jacobian(1, 1) = 1. 
/ tmp; jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); jacobian(0, 1) = 0; - jacobian(1, 0) = line_results.par(1) * R(0, 1) * jacobian(0, 0); + jacobian(1, 0) = line_results.par(1) * rotMat(0, 1) * jacobian(0, 0); line_results.par(1) = line_results.par(1) * jacobian(1, 1); - line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * jacobian(1, 1); + line_results.par(0) = (rotMat(0, 1) + line_results.par(0) * rotMat(0, 0)) * jacobian(1, 1); line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // compute chi2 line_results.chi2 = 0; - for (i = 0; i < n; i++) { - line_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); + for (u_int i = 0; i < n; i++) { + line_results.chi2 += weights(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); if (i > 0 && i < n - 1) - line_results.chi2 += Rfit::sqr(u(i - 1) / (S(i) - S(i - 1)) - - u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + - u(i + 1) / (S(i + 1) - S(i))) / - VarBeta(i); + line_results.chi2 += riemannFit::sqr(uVec(i - 1) / (sTotal(i) - sTotal(i - 1)) - + uVec(i) * (sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))) + + uVec(i + 1) / (sTotal(i + 1) - sTotal(i))) / + varBeta(i); } - - // assert(line_results.chi2>=0); } /*! @@ -519,7 +559,7 @@ namespace BrokenLine { |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)| - \param B magnetic field in the center of the detector in Gev/cm/c, in order to perform the p_t calculation. + \param bField magnetic field in the center of the detector in Gev/cm/c, in order to perform the p_t calculation. \warning see BL_Circle_fit(), BL_Line_fit() and Fast_fit() warnings. @@ -527,39 +567,40 @@ namespace BrokenLine { \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. 
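A minimal host-side sketch of how the helixFit entry point defined just below might be called, assuming four hits, the double-precision 3xn position matrix from FitResult.h and a float 6xn per-hit covariance packed as (xx, xy, yy, xz, yz, zz); positions, errors and the field constant are purely illustrative:

    #include <Eigen/Core>
    #include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h"

    int main() {
      constexpr int n = 4;
      riemannFit::Matrix3xNd<n> hits;  // rows: x, y, z [cm]; one column per hit
      hits << 2.9, 6.8, 10.9, 16.0,
              0.1, 0.3,  0.5,  0.8,
              1.0, 2.3,  3.7,  5.4;    // illustrative positions only

      // per-hit covariance, packed as (xx, xy, yy, xz, yz, zz) in each column
      Eigen::Matrix<float, 6, n> hits_ge = Eigen::Matrix<float, 6, n>::Zero();
      hits_ge.row(0).setConstant(1e-4f);  // sigma^2_xx
      hits_ge.row(2).setConstant(1e-4f);  // sigma^2_yy
      hits_ge.row(5).setConstant(1e-4f);  // sigma^2_zz

      const double bField = 0.0114;  // field in GeV/cm/c, illustrative value only
      riemannFit::HelixFit helix = brokenline::helixFit<n>(hits, hits_ge, bField);

      // helix.par = (phi, Tip, p_t, cot(theta), Zip), helix.cov its 5x5 covariance;
      // helix.chi2_circle / helix.chi2_line are the two partial chi2, helix.qCharge the charge
      return (helix.chi2_circle >= 0.f && helix.chi2_line >= 0.f) ? 0 : 1;
    }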
*/ - template - inline Rfit::helix_fit BL_Helix_fit(const Rfit::Matrix3xNd& hits, - const Eigen::Matrix& hits_ge, - const double B) { - Rfit::helix_fit helix; - Rfit::Vector4d fast_fit; - BL_Fast_fit(hits, fast_fit); - - PreparedBrokenLineData data; + template + inline riemannFit::HelixFit helixFit(const riemannFit::Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double bField) { + riemannFit::HelixFit helix; + riemannFit::Vector4d fast_fit; + fastFit(hits, fast_fit); + + PreparedBrokenLineData data; karimaki_circle_fit circle; - Rfit::line_fit line; - Rfit::Matrix3d jacobian; + riemannFit::LineFit line; + riemannFit::Matrix3d jacobian; - prepareBrokenLineData(hits, fast_fit, B, data); - BL_Line_fit(hits_ge, fast_fit, B, data, line); - BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + prepareBrokenLineData(hits, fast_fit, bField, data); + lineFit(hits_ge, fast_fit, bField, data, line); + circleFit(hits, hits_ge, fast_fit, bField, data, circle); // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix - jacobian << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (Rfit::sqr(circle.par(2)) * circle.par(2)); - circle.par(2) = B / std::abs(circle.par(2)); + jacobian << 1., 0, 0, 0, 1., 0, 0, 0, + -std::abs(circle.par(2)) * bField / (riemannFit::sqr(circle.par(2)) * circle.par(2)); + circle.par(2) = bField / std::abs(circle.par(2)); circle.cov = jacobian * circle.cov * jacobian.transpose(); helix.par << circle.par, line.par; - helix.cov = Rfit::MatrixXd::Zero(5, 5); + helix.cov = riemannFit::MatrixXd::Zero(5, 5); helix.cov.block(0, 0, 3, 3) = circle.cov; helix.cov.block(3, 3, 2, 2) = line.cov; - helix.q = circle.q; + helix.qCharge = circle.qCharge; helix.chi2_circle = circle.chi2; helix.chi2_line = line.chi2; return helix; } -} // namespace BrokenLine +} // namespace brokenline #endif // RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h index b97dda4e65919..01497719d2998 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h @@ -8,7 +8,7 @@ #include #include -namespace Rfit { +namespace riemannFit { using Vector2d = Eigen::Vector2d; using Vector3d = Eigen::Vector3d; @@ -23,7 +23,7 @@ namespace Rfit { template using Matrix3xNd = Eigen::Matrix; // used for inputs hits - struct circle_fit { + struct CircleFit { Vector3d par; //!< parameter: (X0,Y0,R) Matrix3d cov; /*!< covariance matrix: \n @@ -31,11 +31,11 @@ namespace Rfit { |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n |cov(X0, R)|cov(Y0, R)|cov( R, R)| */ - int32_t q; //!< particle charge + int32_t qCharge; //!< particle charge float chi2; }; - struct line_fit { + struct LineFit { Vector2d par; //!<(cotan(theta),Zip) Matrix2d cov; /*!< @@ -45,7 +45,7 @@ namespace Rfit { double chi2; }; - struct helix_fit { + struct HelixFit { Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) Matrix5d cov; /*!< ()->cov() \n @@ -58,8 +58,8 @@ namespace Rfit { float chi2_circle; float chi2_line; // Vector4d fast_fit; - int32_t q; //!< particle charge - }; // __attribute__((aligned(16))); + int32_t qCharge; //!< particle charge + }; // __attribute__((aligned(16))); -} // namespace Rfit +} // namespace riemannFit #endif diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h index 
8710bdcf6c444..2fe74f53a7bd2 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h @@ -5,9 +5,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" -namespace Rfit { +namespace riemannFit { - constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) + constexpr double epsilon = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) using VectorXd = Eigen::VectorXd; using MatrixXd = Eigen::MatrixXd; @@ -49,13 +49,11 @@ namespace Rfit { using Vector4f = Eigen::Vector4f; using Vector6f = Eigen::Matrix; - using u_int = unsigned int; - template __host__ __device__ void printIt(C* m, const char* prefix = "") { #ifdef RFIT_DEBUG - for (u_int r = 0; r < m->rows(); ++r) { - for (u_int c = 0; c < m->cols(); ++c) { + for (uint r = 0; r < m->rows(); ++r) { + for (uint c = 0; c < m->cols(); ++c) { printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r, c)); } } @@ -102,19 +100,19 @@ namespace Rfit { // | 3 4 5 | constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; for (uint32_t i = 0; i < hits_in_fit; ++i) { - auto ge_idx = 0; - auto j = 0; - auto l = 0; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 2; - j = 1; - l = 1; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 1; - j = 1; - l = 0; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = - ge.col(i)[ge_idx]; + { + constexpr uint32_t ge_idx = 0, j = 0, l = 0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 2, j = 1, l = 1; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 1, j = 1, l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } } } @@ -134,33 +132,33 @@ namespace Rfit { // | 3 4 5 | constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; for (uint32_t i = 0; i < hits_in_fit; ++i) { - auto ge_idx = 0; - auto j = 0; - auto l = 0; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 2; - j = 1; - l = 1; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 5; - j = 2; - l = 2; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 1; - j = 1; - l = 0; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = - ge.col(i)[ge_idx]; - ge_idx = 3; - j = 2; - l = 0; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = - ge.col(i)[ge_idx]; - ge_idx = 4; - j = 2; - l = 1; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = - ge.col(i)[ge_idx]; + { + constexpr uint32_t ge_idx = 0, j = 0, l = 0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 2, j = 1, l = 1; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 5, j = 2, l = 2; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 1, j = 1, l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * 
hits_in_fit) = + ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 3, j = 2, l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 4, j = 2, l = 1; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } } } @@ -172,19 +170,19 @@ namespace Rfit { \param B magnetic field in Gev/cm/c unit. \param error flag for errors computation. */ - __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, const bool error) { + __host__ __device__ inline void par_uvrtopak(CircleFit& circle, const double B, const bool error) { Vector3d par_pak; const double temp0 = circle.par.head(2).squaredNorm(); const double temp1 = sqrt(temp0); - par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)), - circle.par(2) * B; + par_pak << atan2(circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)), + circle.qCharge * (temp1 - circle.par(2)), circle.par(2) * B; if (error) { const double temp2 = sqr(circle.par(0)) * 1. / temp0; - const double temp3 = 1. / temp1 * circle.q; - Matrix3d J4; - J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, - circle.par(1) * temp3, -circle.q, 0., 0., B; - circle.cov = J4 * circle.cov * J4.transpose(); + const double temp3 = 1. / temp1 * circle.qCharge; + Matrix3d j4Mat; + j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.qCharge, 0., 0., B; + circle.cov = j4Mat * circle.cov * j4Mat.transpose(); } circle.par = par_pak; } @@ -195,19 +193,19 @@ namespace Rfit { \param circle_uvr parameter (X0,Y0,R), covariance matrix to be transformed and particle charge. */ - __host__ __device__ inline void fromCircleToPerigee(circle_fit& circle) { + __host__ __device__ inline void fromCircleToPerigee(CircleFit& circle) { Vector3d par_pak; const double temp0 = circle.par.head(2).squaredNorm(); const double temp1 = sqrt(temp0); - par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)), - circle.q / circle.par(2); + par_pak << atan2(circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)), + circle.qCharge * (temp1 - circle.par(2)), circle.qCharge / circle.par(2); const double temp2 = sqr(circle.par(0)) * 1. / temp0; - const double temp3 = 1. / temp1 * circle.q; - Matrix3d J4; - J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, - circle.par(1) * temp3, -circle.q, 0., 0., -circle.q / (circle.par(2) * circle.par(2)); - circle.cov = J4 * circle.cov * J4.transpose(); + const double temp3 = 1. / temp1 * circle.qCharge; + Matrix3d j4Mat; + j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. 
/ circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.qCharge, 0., 0., -circle.qCharge / (circle.par(2) * circle.par(2)); + circle.cov = j4Mat * circle.cov * j4Mat.transpose(); circle.par = par_pak; } @@ -228,18 +226,18 @@ namespace Rfit { op(3) = ip(1); op(4) = -ip(4); - Matrix5d J = Matrix5d::Zero(); + Matrix5d jMat = Matrix5d::Zero(); - J(0, 2) = sinTheta; - J(0, 3) = -sinTheta2 * cosTheta * ip(2); - J(1, 0) = 1.; - J(2, 3) = -1.; - J(3, 1) = 1.; - J(4, 4) = -1; + jMat(0, 2) = sinTheta; + jMat(0, 3) = -sinTheta2 * cosTheta * ip(2); + jMat(1, 0) = 1.; + jMat(2, 3) = -1.; + jMat(3, 1) = 1.; + jMat(4, 4) = -1; - ocov = J * icov * J.transpose(); + ocov = jMat * icov * jMat.transpose(); } -} // namespace Rfit +} // namespace riemannFit #endif // RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h index 4573205f9c11e..52cf4b637fb37 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h @@ -3,12 +3,12 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" -namespace Rfit { +namespace riemannFit { /*! Compute the Radiation length in the uniform hypothesis * - * The Pixel detector, barrel and forward, is considered as an omogeneous - * cilinder of material, whose radiation lengths has been derived from the TDR + * The Pixel detector, barrel and forward, is considered as an homogeneous + * cylinder of material, whose radiation lengths has been derived from the TDR * plot that shows that 16cm correspond to 0.06 radiation lengths. Therefore * one radiation length corresponds to 16cm/0.06 =~ 267 cm. All radiation * lengths are computed using this unique number, in both regions, barrel and @@ -16,12 +16,12 @@ namespace Rfit { * * NB: no angle corrections nor projections are computed inside this routine. * It is therefore the responsibility of the caller to supply the proper - * lengths in input. These lenghts are the path travelled by the particle along + * lengths in input. These lengths are the path traveled by the particle along * its trajectory, namely the so called S of the helix in 3D space. * * \param length_values vector of incremental distances that will be translated * into radiation length equivalent. Each radiation length i is computed - * incrementally with respect to the previous length i-1. The first lenght has + * incrementally with respect to the previous length i-1. The first length has * no reference point (i.e. it has the dca). * * \return incremental radiation lengths that correspond to each segment. 
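The uniform-material hypothesis described above reduces to a single constant; restating the comment as a formula, with s_j the 3D path length to hit j and l_X0(j) the incremental radiation-length equivalent the function returns:

    X_0 \simeq \frac{16\ \mathrm{cm}}{0.06} \approx 267\ \mathrm{cm}, \qquad
    \ell_{X_0}(0) = \frac{s_0}{X_0}, \qquad
    \ell_{X_0}(j) = \frac{|s_j - s_{j-1}|}{X_0} \quad (j \ge 1).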
@@ -31,11 +31,11 @@ namespace Rfit { __host__ __device__ inline void computeRadLenUniformMaterial(const VNd1& length_values, VNd2& rad_lengths) { // Radiation length of the pixel detector in the uniform assumption, with // 0.06 rad_len at 16 cm - constexpr double XX_0_inv = 0.06 / 16.; - u_int n = length_values.rows(); - rad_lengths(0) = length_values(0) * XX_0_inv; - for (u_int j = 1; j < n; ++j) { - rad_lengths(j) = std::abs(length_values(j) - length_values(j - 1)) * XX_0_inv; + constexpr double xx_0_inv = 0.06 / 16.; + uint n = length_values.rows(); + rad_lengths(0) = length_values(0) * xx_0_inv; + for (uint j = 1; j < n; ++j) { + rad_lengths(j) = std::abs(length_values(j) - length_values(j - 1)) * xx_0_inv; } } @@ -59,41 +59,41 @@ namespace Rfit { */ template - __host__ __device__ inline auto Scatter_cov_line(Matrix2d const* cov_sz, - const V4& fast_fit, - VNd1 const& s_arcs, - VNd2 const& z_values, - const double theta, - const double B, - MatrixNd& ret) { + __host__ __device__ inline auto scatterCovLine(Matrix2d const* cov_sz, + const V4& fast_fit, + VNd1 const& s_arcs, + VNd2 const& z_values, + const double theta, + const double bField, + MatrixNd& ret) { #ifdef RFIT_DEBUG - Rfit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); + riemannFit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); #endif - constexpr u_int n = N; - double p_t = std::min(20., fast_fit(2) * B); // limit pt to avoid too small error!!! - double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); + constexpr uint n = N; + double p_t = std::min(20., fast_fit(2) * bField); // limit pt to avoid too small error!!! + double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); VectorNd rad_lengths_S; // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html // Basically, to perform cwise operations on Matrices and Vectors, you need // to transform them into Array-like objects. - VectorNd S_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); - S_values = S_values.array().sqrt(); - computeRadLenUniformMaterial(S_values, rad_lengths_S); + VectorNd s_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); + s_values = s_values.array().sqrt(); + computeRadLenUniformMaterial(s_values, rad_lengths_S); VectorNd sig2_S; sig2_S = .000225 / p_2 * (1. + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); #ifdef RFIT_DEBUG - Rfit::printIt(cov_sz, "Scatter_cov_line - cov_sz: "); + riemannFit::printIt(cov_sz, "Scatter_cov_line - cov_sz: "); #endif Matrix2Nd tmp = Matrix2Nd::Zero(); - for (u_int k = 0; k < n; ++k) { + for (uint k = 0; k < n; ++k) { tmp(k, k) = cov_sz[k](0, 0); tmp(k + n, k + n) = cov_sz[k](1, 1); tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1); } - for (u_int k = 0; k < n; ++k) { - for (u_int l = k; l < n; ++l) { - for (u_int i = 0; i < std::min(k, l); ++i) { - tmp(k + n, l + n) += std::abs(S_values(k) - S_values(i)) * std::abs(S_values(l) - S_values(i)) * sig2_S(i); + for (uint k = 0; k < n; ++k) { + for (uint l = k; l < n; ++l) { + for (uint i = 0; i < std::min(k, l); ++i) { + tmp(k + n, l + n) += std::abs(s_values(k) - s_values(i)) * std::abs(s_values(l) - s_values(i)) * sig2_S(i); } tmp(l + n, k + n) = tmp(k + n, l + n); } @@ -101,7 +101,7 @@ namespace Rfit { // We are interested only in the errors orthogonal to the rotated s-axis // which, in our formalism, are in the lower square matrix. 
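The comment above is the closing step of scatterCovLine: after accumulating the multiple-scattering terms in a 2n x 2n matrix ordered as (s components, then orthogonal components), only the lower-right n x n block is kept. A minimal Eigen sketch of that extraction, using illustrative dynamic sizes rather than the fixed-size matrices the patch uses:

  #include <Eigen/Core>

  // Keep only the covariance of the components orthogonal to the rotated s-axis,
  // i.e. the lower-right n x n block of a 2n x 2n matrix.
  Eigen::MatrixXd orthogonalBlock(const Eigen::MatrixXd& cov2n) {
    const Eigen::Index n = cov2n.rows() / 2;
    return cov2n.block(n, n, n, n);
  }
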
#ifdef RFIT_DEBUG - Rfit::printIt(&tmp, "Scatter_cov_line - tmp: "); + riemannFit::printIt(&tmp, "Scatter_cov_line - tmp: "); #endif ret = tmp.block(n, n, n, n); } @@ -120,41 +120,41 @@ namespace Rfit { negligible). */ template - __host__ __device__ inline MatrixNd Scatter_cov_rad(const M2xN& p2D, + __host__ __device__ inline MatrixNd scatter_cov_rad(const M2xN& p2D, const V4& fast_fit, VectorNd const& rad, double B) { - constexpr u_int n = N; + constexpr uint n = N; double p_t = std::min(20., fast_fit(2) * B); // limit pt to avoid too small error!!! - double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); + double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); double theta = atan(fast_fit(3)); theta = theta < 0. ? theta + M_PI : theta; VectorNd s_values; VectorNd rad_lengths; - const Vector2d o(fast_fit(0), fast_fit(1)); + const Vector2d oVec(fast_fit(0), fast_fit(1)); // associated Jacobian, used in weights and errors computation - for (u_int i = 0; i < n; ++i) { // x - Vector2d p = p2D.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); - const double atan2_ = atan2(cross, dot); - s_values(i) = std::abs(atan2_ * fast_fit(2)); + for (uint i = 0; i < n; ++i) { // x + Vector2d pVec = p2D.block(0, i, 2, 1) - oVec; + const double cross = cross2D(-oVec, pVec); + const double dot = (-oVec).dot(pVec); + const double tempAtan2 = atan2(cross, dot); + s_values(i) = std::abs(tempAtan2 * fast_fit(2)); } - computeRadLenUniformMaterial(s_values * sqrt(1. + 1. / (fast_fit(3) * fast_fit(3))), rad_lengths); + computeRadLenUniformMaterial(s_values * sqrt(1. + 1. / sqr(fast_fit(3))), rad_lengths); MatrixNd scatter_cov_rad = MatrixNd::Zero(); VectorNd sig2 = (1. + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); sig2 *= 0.000225 / (p_2 * sqr(sin(theta))); - for (u_int k = 0; k < n; ++k) { - for (u_int l = k; l < n; ++l) { - for (u_int i = 0; i < std::min(k, l); ++i) { + for (uint k = 0; k < n; ++k) { + for (uint l = k; l < n; ++l) { + for (uint i = 0; i < std::min(k, l); ++i) { scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i); } scatter_cov_rad(l, k) = scatter_cov_rad(k, l); } } #ifdef RFIT_DEBUG - Rfit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); + riemannFit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); #endif return scatter_cov_rad; } @@ -175,12 +175,12 @@ namespace Rfit { printf("Address of p2D: %p\n", &p2D); #endif printIt(&p2D, "cov_radtocart - p2D:"); - constexpr u_int n = N; + constexpr uint n = N; Matrix2Nd cov_cart = Matrix2Nd::Zero(); VectorNd rad_inv = rad.cwiseInverse(); printIt(&rad_inv, "cov_radtocart - rad_inv:"); - for (u_int i = 0; i < n; ++i) { - for (u_int j = i; j < n; ++j) { + for (uint i = 0; i < n; ++i) { + for (uint j = i; j < n; ++j) { cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); @@ -208,10 +208,10 @@ namespace Rfit { __host__ __device__ inline VectorNd cov_carttorad(const M2xN& p2D, const Matrix2Nd& cov_cart, const VectorNd& rad) { - constexpr u_int n = N; + constexpr uint n = N; VectorNd cov_rad; const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); - for (u_int i = 0; i < n; ++i) { + for (uint i = 0; i < n; ++i) { //!< in case you have (0,0) to avoid dividing by 0 radius if (rad(i) < 1.e-4) cov_rad(i) = cov_cart(i, i); @@ 
-240,9 +240,9 @@ namespace Rfit { const Matrix2Nd& cov_cart, V4& fast_fit, const VectorNd& rad) { - constexpr u_int n = N; + constexpr uint n = N; VectorNd cov_rad; - for (u_int i = 0; i < n; ++i) { + for (uint i = 0; i < n; ++i) { //!< in case you have (0,0) to avoid dividing by 0 radius if (rad(i) < 1.e-4) cov_rad(i) = cov_cart(i, i); // TO FIX @@ -272,7 +272,7 @@ namespace Rfit { */ template - __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { + __host__ __device__ inline VectorNd weightCircle(const MatrixNd& cov_rad_inv) { return cov_rad_inv.colwise().sum().transpose(); } @@ -285,7 +285,7 @@ namespace Rfit { \return q int 1 or -1. */ template - __host__ __device__ inline int32_t Charge(const M2xN& p2D, const Vector3d& par_uvr) { + __host__ __device__ inline int32_t charge(const M2xN& p2D, const Vector3d& par_uvr) { return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > 0) ? -1 @@ -342,7 +342,7 @@ namespace Rfit { /*! \brief 2D version of min_eigen3D(). - \param A the Matrix you want to know eigenvector and eigenvalue. + \param aMat the Matrix you want to know eigenvector and eigenvalue. \param chi2 the double were the chi2-related quantity will be stored \return the eigenvector associated to the minimum eigenvalue. \detail The computedDirect() method of SelfAdjointEigenSolver for 2x2 Matrix @@ -350,9 +350,9 @@ namespace Rfit { significantly in single precision. */ - __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { + __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& aMat, double& chi2) { Eigen::SelfAdjointEigenSolver solver(2); - solver.computeDirect(A); + solver.computeDirect(aMat); int min_index; chi2 = solver.eigenvalues().minCoeff(&min_index); return solver.eigenvectors().col(min_index); @@ -372,48 +372,48 @@ namespace Rfit { */ template - __host__ __device__ inline void Fast_fit(const M3xN& hits, V4& result) { + __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) { constexpr uint32_t N = M3xN::ColsAtCompileTime; constexpr auto n = N; // get the number of hits printIt(&hits, "Fast_fit - hits: "); // CIRCLE FIT // Make segments between middle-to-first(b) and last-to-first(c) hits - const Vector2d b = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); - const Vector2d c = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); - printIt(&b, "Fast_fit - b: "); - printIt(&c, "Fast_fit - c: "); + const Vector2d bVec = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Vector2d cVec = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); + printIt(&bVec, "Fast_fit - b: "); + printIt(&cVec, "Fast_fit - c: "); // Compute their lengths - auto b2 = b.squaredNorm(); - auto c2 = c.squaredNorm(); + auto b2 = bVec.squaredNorm(); + auto c2 = cVec.squaredNorm(); // The algebra has been verified (MR). The usual approach has been followed: // * use an orthogonal reference frame passing from the first point. // * build the segments (chords) // * build orthogonal lines through mid points // * make a system and solve for X0 and Y0. // * add the initial point - bool flip = abs(b.x()) < abs(b.y()); - auto bx = flip ? b.y() : b.x(); - auto by = flip ? b.x() : b.y(); - auto cx = flip ? c.y() : c.x(); - auto cy = flip ? c.x() : c.y(); + bool flip = abs(bVec.x()) < abs(bVec.y()); + auto bx = flip ? bVec.y() : bVec.x(); + auto by = flip ? bVec.x() : bVec.y(); + auto cx = flip ? cVec.y() : cVec.x(); + auto cy = flip ? 
cVec.x() : cVec.y(); //!< in case b.x is 0 (2 hits with same x) auto div = 2. * (cx * by - bx * cy); // if aligned TO FIX - auto Y0 = (cx * b2 - bx * c2) / div; - auto X0 = (0.5 * b2 - Y0 * by) / bx; - result(0) = hits(0, 0) + (flip ? Y0 : X0); - result(1) = hits(1, 0) + (flip ? X0 : Y0); - result(2) = sqrt(sqr(X0) + sqr(Y0)); + auto y0 = (cx * b2 - bx * c2) / div; + auto x0 = (0.5 * b2 - y0 * by) / bx; + result(0) = hits(0, 0) + (flip ? y0 : x0); + result(1) = hits(1, 0) + (flip ? x0 : y0); + result(2) = sqrt(sqr(x0) + sqr(y0)); printIt(&result, "Fast_fit - result: "); // LINE FIT - const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); - const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - printIt(&e, "Fast_fit - e: "); - printIt(&d, "Fast_fit - d: "); + const Vector2d dVec = hits.block(0, 0, 2, 1) - result.head(2); + const Vector2d eVec = hits.block(0, n - 1, 2, 1) - result.head(2); + printIt(&eVec, "Fast_fit - e: "); + printIt(&dVec, "Fast_fit - d: "); // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) - auto dr = result(2) * atan2(cross2D(d, e), d.dot(e)); + auto dr = result(2) * atan2(cross2D(dVec, eVec), dVec.dot(eVec)); // Simple difference in Z between last and first hit auto dz = hits(2, n - 1) - hits(2, 0); @@ -432,7 +432,7 @@ namespace Rfit { \param hits_cov2D covariance matrix of 2D points. \param fast_fit pre-fit result in this form: (X0,Y0,R,tan(theta)). (tan(theta) is not used). - \param B magnetic field + \param bField magnetic field \param error flag for error computation. \param scattering flag for multiple scattering \return circle circle_fit: @@ -452,18 +452,18 @@ namespace Rfit { scattering. */ template - __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, - const Matrix2Nd& hits_cov2D, - const V4& fast_fit, - const VectorNd& rad, - const double B, - const bool error) { + __host__ __device__ inline CircleFit circleFit(const M2xN& hits2D, + const Matrix2Nd& hits_cov2D, + const V4& fast_fit, + const VectorNd& rad, + const double bField, + const bool error) { #ifdef RFIT_DEBUG printf("circle_fit - enter\n"); #endif // INITIALIZATION - Matrix2Nd V = hits_cov2D; - constexpr u_int n = N; + Matrix2Nd vMat = hits_cov2D; + constexpr uint n = N; printIt(&hits2D, "circle_fit - hits2D:"); printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); @@ -472,25 +472,25 @@ namespace Rfit { #endif // WEIGHT COMPUTATION VectorNd weight; - MatrixNd G; + MatrixNd gMat; double renorm; { - MatrixNd cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad).asDiagonal(); - MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); - printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); + MatrixNd cov_rad = cov_carttorad_prefit(hits2D, vMat, fast_fit, rad).asDiagonal(); + MatrixNd scatterCovRadMat = scatter_cov_rad(hits2D, fast_fit, rad, bField); + printIt(&scatterCovRadMat, "circle_fit - scatter_cov_rad:"); printIt(&hits2D, "circle_fit - hits2D bis:"); #ifdef RFIT_DEBUG printf("Address of hits2D: a) %p\n", &hits2D); #endif - V += cov_radtocart(hits2D, scatter_cov_rad, rad); - printIt(&V, "circle_fit - V:"); - cov_rad += scatter_cov_rad; + vMat += cov_radtocart(hits2D, scatterCovRadMat, rad); + printIt(&vMat, "circle_fit - V:"); + cov_rad += scatterCovRadMat; printIt(&cov_rad, "circle_fit - cov_rad:"); - math::cholesky::invert(cov_rad, G); - // G = cov_rad.inverse(); - renorm = G.sum(); - G *= 1. 
/ renorm; - weight = Weight_circle(G); + math::cholesky::invert(cov_rad, gMat); + // gMat = cov_rad.inverse(); + renorm = gMat.sum(); + gMat *= 1. / renorm; + weight = weightCircle(gMat); } printIt(&weight, "circle_fit - weight:"); @@ -503,19 +503,19 @@ namespace Rfit { #ifdef RFIT_DEBUG printf("Address of hits2D: b) %p\n", &hits2D); #endif - const Vector2d h_ = hits2D.rowwise().mean(); // centroid - printIt(&h_, "circle_fit - h_:"); + const Vector2d hCentroid = hits2D.rowwise().mean(); // centroid + printIt(&hCentroid, "circle_fit - h_:"); Matrix3xNd p3D; - p3D.block(0, 0, 2, n) = hits2D.colwise() - h_; + p3D.block(0, 0, 2, n) = hits2D.colwise() - hCentroid; printIt(&p3D, "circle_fit - p3D: a)"); Vector2Nd mc; // centered hits, used in error computation mc << p3D.row(0).transpose(), p3D.row(1).transpose(); printIt(&mc, "circle_fit - mc(centered hits):"); // scale - const double q = mc.squaredNorm(); - const double s = sqrt(n * 1. / q); // scaling factor - p3D *= s; + const double tempQ = mc.squaredNorm(); + const double tempS = sqrt(n * 1. / tempQ); // scaling factor + p3D *= tempS; // project on paraboloid p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); @@ -529,22 +529,22 @@ namespace Rfit { // compute Vector3d r0; r0.noalias() = p3D * weight; // center of gravity - const Matrix3xNd X = p3D.colwise() - r0; - Matrix3d A = X * G * X.transpose(); - printIt(&A, "circle_fit - A:"); + const Matrix3xNd xMat = p3D.colwise() - r0; + Matrix3d aMat = xMat * gMat * xMat.transpose(); + printIt(&aMat, "circle_fit - A:"); #ifdef RFIT_DEBUG printf("circle_fit - MINIMIZE\n"); #endif // minimize double chi2; - Vector3d v = min_eigen3D(A, chi2); + Vector3d vVec = min_eigen3D(aMat, chi2); #ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN\n"); #endif - printIt(&v, "v BEFORE INVERSION"); - v *= (v(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 - printIt(&v, "v AFTER INVERSION"); + printIt(&vVec, "v BEFORE INVERSION"); + vVec *= (vVec(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 + printIt(&vVec, "v AFTER INVERSION"); // This hack to be able to run on GPU where the automatic assignment to a // double from the vector multiplication is not working. #ifdef RFIT_DEBUG @@ -554,12 +554,11 @@ namespace Rfit { #ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN 2\n"); #endif - cm = -v.transpose() * r0; + cm = -vVec.transpose() * r0; #ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN 3\n"); #endif - const double c = cm(0, 0); - // const double c = -v.transpose() * r0; + const double tempC = cm(0, 0); #ifdef RFIT_DEBUG printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); @@ -567,20 +566,20 @@ namespace Rfit { // COMPUTE CIRCLE PARAMETER // auxiliary quantities - const double h = sqrt(1. - sqr(v(2)) - 4. * c * v(2)); - const double v2x2_inv = 1. / (2. * v(2)); - const double s_inv = 1. / s; - Vector3d par_uvr_; // used in error propagation - par_uvr_ << -v(0) * v2x2_inv, -v(1) * v2x2_inv, h * v2x2_inv; - - circle_fit circle; - circle.par << par_uvr_(0) * s_inv + h_(0), par_uvr_(1) * s_inv + h_(1), par_uvr_(2) * s_inv; - circle.q = Charge(hits2D, circle.par); - circle.chi2 = abs(chi2) * renorm * 1. / sqr(2 * v(2) * par_uvr_(2) * s); + const double tempH = sqrt(1. - sqr(vVec(2)) - 4. * tempC * vVec(2)); + const double v2x2_inv = 1. / (2. * vVec(2)); + const double s_inv = 1. 
/ tempS; + Vector3d par_uvr; // used in error propagation + par_uvr << -vVec(0) * v2x2_inv, -vVec(1) * v2x2_inv, tempH * v2x2_inv; + + CircleFit circle; + circle.par << par_uvr(0) * s_inv + hCentroid(0), par_uvr(1) * s_inv + hCentroid(1), par_uvr(2) * s_inv; + circle.qCharge = charge(hits2D, circle.par); + circle.chi2 = abs(chi2) * renorm / sqr(2 * vVec(2) * par_uvr(2) * tempS); printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); #ifdef RFIT_DEBUG - printf("circle_fit - CIRCLE CHARGE: %d\n", circle.q); + printf("circle_fit - CIRCLE CHARGE: %d\n", circle.qCharge); #endif #ifdef RFIT_DEBUG @@ -591,28 +590,28 @@ namespace Rfit { #ifdef RFIT_DEBUG printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); #endif - ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points - MatrixNd C[3][3]; // cov matrix of 3D transformed points + ArrayNd vcsMat[2][2]; // cov matrix of center & scaled points + MatrixNd cMat[3][3]; // cov matrix of 3D transformed points #ifdef RFIT_DEBUG printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); #endif { Eigen::Matrix cm; Eigen::Matrix cm2; - cm = mc.transpose() * V * mc; - const double c = cm(0, 0); - Matrix2Nd Vcs; - Vcs.template triangularView() = - (sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * - (2. * V.squaredNorm() + 4. * c) * // mc.transpose() * V * mc) * - (mc * mc.transpose())); - - printIt(&Vcs, "circle_fit - Vcs:"); - C[0][0] = Vcs.block(0, 0, n, n).template selfadjointView(); - Vcs_[0][1] = Vcs.block(0, n, n, n); - C[1][1] = Vcs.block(n, n, n, n).template selfadjointView(); - Vcs_[1][0] = Vcs_[0][1].transpose(); - printIt(&Vcs, "circle_fit - Vcs:"); + cm = mc.transpose() * vMat * mc; + const double tempC2 = cm(0, 0); + Matrix2Nd tempVcsMat; + tempVcsMat.template triangularView() = + (sqr(tempS) * vMat + sqr(sqr(tempS)) * 1. / (4. * tempQ * n) * + (2. * vMat.squaredNorm() + 4. * tempC2) * // mc.transpose() * V * mc) * + (mc * mc.transpose())); + + printIt(&tempVcsMat, "circle_fit - Vcs:"); + cMat[0][0] = tempVcsMat.block(0, 0, n, n).template selfadjointView(); + vcsMat[0][1] = tempVcsMat.block(0, n, n, n); + cMat[1][1] = tempVcsMat.block(n, n, n, n).template selfadjointView(); + vcsMat[1][0] = vcsMat[0][1].transpose(); + printIt(&tempVcsMat, "circle_fit - Vcs:"); } { @@ -622,137 +621,139 @@ namespace Rfit { const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); const ArrayNd t10 = t01.transpose(); - Vcs_[0][0] = C[0][0]; - ; - C[0][1] = Vcs_[0][1]; - C[0][2] = 2. * (Vcs_[0][0] * t0 + Vcs_[0][1] * t1); - Vcs_[1][1] = C[1][1]; - C[1][2] = 2. * (Vcs_[1][0] * t0 + Vcs_[1][1] * t1); + vcsMat[0][0] = cMat[0][0]; + cMat[0][1] = vcsMat[0][1]; + cMat[0][2] = 2. * (vcsMat[0][0] * t0 + vcsMat[0][1] * t1); + vcsMat[1][1] = cMat[1][1]; + cMat[1][2] = 2. * (vcsMat[1][0] * t0 + vcsMat[1][1] * t1); MatrixNd tmp; tmp.template triangularView() = - (2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + - Vcs_[1][1] * Vcs_[1][1]) + - 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11)) + (2. * (vcsMat[0][0] * vcsMat[0][0] + vcsMat[0][0] * vcsMat[0][1] + vcsMat[1][1] * vcsMat[1][0] + + vcsMat[1][1] * vcsMat[1][1]) + + 4. 
* (vcsMat[0][0] * t00 + vcsMat[0][1] * t01 + vcsMat[1][0] * t10 + vcsMat[1][1] * t11)) .matrix(); - C[2][2] = tmp.template selfadjointView(); + cMat[2][2] = tmp.template selfadjointView(); } - printIt(&C[0][0], "circle_fit - C[0][0]:"); + printIt(&cMat[0][0], "circle_fit - C[0][0]:"); - Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) - for (u_int i = 0; i < 3; ++i) { - for (u_int j = i; j < 3; ++j) { + Matrix3d c0Mat; // cov matrix of center of gravity (r0.x,r0.y,r0.z) + for (uint i = 0; i < 3; ++i) { + for (uint j = i; j < 3; ++j) { Eigen::Matrix tmp; - tmp = weight.transpose() * C[i][j] * weight; - const double c = tmp(0, 0); - C0(i, j) = c; //weight.transpose() * C[i][j] * weight; - C0(j, i) = C0(i, j); + tmp = weight.transpose() * cMat[i][j] * weight; + // Workaround to get things working in GPU + const double tempC = tmp(0, 0); + c0Mat(i, j) = tempC; //weight.transpose() * C[i][j] * weight; + c0Mat(j, i) = c0Mat(i, j); } } - printIt(&C0, "circle_fit - C0:"); + printIt(&c0Mat, "circle_fit - C0:"); - const MatrixNd W = weight * weight.transpose(); - const MatrixNd H = MatrixNd::Identity().rowwise() - weight.transpose(); - const MatrixNx3d s_v = H * p3D.transpose(); - printIt(&W, "circle_fit - W:"); - printIt(&H, "circle_fit - H:"); + const MatrixNd wMat = weight * weight.transpose(); + const MatrixNd hMat = MatrixNd::Identity().rowwise() - weight.transpose(); + const MatrixNx3d s_v = hMat * p3D.transpose(); + printIt(&wMat, "circle_fit - W:"); + printIt(&hMat, "circle_fit - H:"); printIt(&s_v, "circle_fit - s_v:"); - MatrixNd D_[3][3]; // cov(s_v) - { - D_[0][0] = (H * C[0][0] * H.transpose()).cwiseProduct(W); - D_[0][1] = (H * C[0][1] * H.transpose()).cwiseProduct(W); - D_[0][2] = (H * C[0][2] * H.transpose()).cwiseProduct(W); - D_[1][1] = (H * C[1][1] * H.transpose()).cwiseProduct(W); - D_[1][2] = (H * C[1][2] * H.transpose()).cwiseProduct(W); - D_[2][2] = (H * C[2][2] * H.transpose()).cwiseProduct(W); - D_[1][0] = D_[0][1].transpose(); - D_[2][0] = D_[0][2].transpose(); - D_[2][1] = D_[1][2].transpose(); - } - printIt(&D_[0][0], "circle_fit - D_[0][0]:"); - - constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; - - Matrix6d E; // cov matrix of the 6 independent elements of A - for (u_int a = 0; a < 6; ++a) { - const u_int i = nu[a][0], j = nu[a][1]; - for (u_int b = a; b < 6; ++b) { - const u_int k = nu[b][0], l = nu[b][1]; + MatrixNd dMat[3][3]; // cov(s_v) + dMat[0][0] = (hMat * cMat[0][0] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][1] = (hMat * cMat[0][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][2] = (hMat * cMat[0][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][1] = (hMat * cMat[1][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][2] = (hMat * cMat[1][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[2][2] = (hMat * cMat[2][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][0] = dMat[0][1].transpose(); + dMat[2][0] = dMat[0][2].transpose(); + dMat[2][1] = dMat[1][2].transpose(); + printIt(&dMat[0][0], "circle_fit - D_[0][0]:"); + + constexpr uint nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; + + Matrix6d eMat; // cov matrix of the 6 independent elements of A + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + for (uint b = a; b < 6; ++b) { + const uint k = nu[b][0], l = nu[b][1]; VectorNd t0(n); VectorNd t1(n); if (l == k) { - t0 = 2. * D_[j][l] * s_v.col(l); + t0 = 2. * dMat[j][l] * s_v.col(l); if (i == j) t1 = t0; else - t1 = 2. * D_[i][l] * s_v.col(l); + t1 = 2. 
* dMat[i][l] * s_v.col(l); } else { - t0 = D_[j][l] * s_v.col(k) + D_[j][k] * s_v.col(l); + t0 = dMat[j][l] * s_v.col(k) + dMat[j][k] * s_v.col(l); if (i == j) t1 = t0; else - t1 = D_[i][l] * s_v.col(k) + D_[i][k] * s_v.col(l); + t1 = dMat[i][l] * s_v.col(k) + dMat[i][k] * s_v.col(l); } if (i == j) { Eigen::Matrix cm; cm = s_v.col(i).transpose() * (t0 + t1); - const double c = cm(0, 0); - E(a, b) = 0. + c; + // Workaround to get things working in GPU + const double tempC = cm(0, 0); + eMat(a, b) = 0. + tempC; } else { Eigen::Matrix cm; cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); - const double c = cm(0, 0); - E(a, b) = 0. + c; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + // Workaround to get things working in GPU + const double tempC = cm(0, 0); + eMat(a, b) = 0. + tempC; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); } if (b != a) - E(b, a) = E(a, b); + eMat(b, a) = eMat(a, b); } } - printIt(&E, "circle_fit - E:"); - - Eigen::Matrix J2; // Jacobian of min_eigen() (numerically computed) - for (u_int a = 0; a < 6; ++a) { - const u_int i = nu[a][0], j = nu[a][1]; - Matrix3d Delta = Matrix3d::Zero(); - Delta(i, j) = Delta(j, i) = abs(A(i, j) * d); - J2.col(a) = min_eigen3D_fast(A + Delta); - const int sign = (J2.col(a)(2) > 0) ? 1 : -1; - J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); + printIt(&eMat, "circle_fit - E:"); + + Eigen::Matrix j2Mat; // Jacobian of min_eigen() (numerically computed) + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + Matrix3d delta = Matrix3d::Zero(); + delta(i, j) = delta(j, i) = abs(aMat(i, j) * epsilon); + j2Mat.col(a) = min_eigen3D_fast(aMat + delta); + const int sign = (j2Mat.col(a)(2) > 0) ? 1 : -1; + j2Mat.col(a) = (j2Mat.col(a) * sign - vVec) / delta(i, j); } - printIt(&J2, "circle_fit - J2:"); + printIt(&j2Mat, "circle_fit - J2:"); - Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) + Matrix4d cvcMat; // joint cov matrix of (v0,v1,v2,c) { - Matrix3d t0 = J2 * E * J2.transpose(); + Matrix3d t0 = j2Mat * eMat * j2Mat.transpose(); Vector3d t1 = -t0 * r0; - Cvc.block(0, 0, 3, 3) = t0; - Cvc.block(0, 3, 3, 1) = t1; - Cvc.block(3, 0, 1, 3) = t1.transpose(); + cvcMat.block(0, 0, 3, 3) = t0; + cvcMat.block(0, 3, 3, 1) = t1; + cvcMat.block(3, 0, 1, 3) = t1.transpose(); Eigen::Matrix cm1; Eigen::Matrix cm3; - cm1 = (v.transpose() * C0 * v); - // cm2 = (C0.cwiseProduct(t0)).sum(); + cm1 = (vVec.transpose() * c0Mat * vVec); + // cm2 = (c0Mat.cwiseProduct(t0)).sum(); cm3 = (r0.transpose() * t0 * r0); - const double c = cm1(0, 0) + (C0.cwiseProduct(t0)).sum() + cm3(0, 0); - Cvc(3, 3) = c; - // (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + // Workaround to get things working in GPU + const double tempC = cm1(0, 0) + (c0Mat.cwiseProduct(t0)).sum() + cm3(0, 0); + cvcMat(3, 3) = tempC; + // (v.transpose() * c0Mat * v) + (c0Mat.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); } - printIt(&Cvc, "circle_fit - Cvc:"); + printIt(&cvcMat, "circle_fit - Cvc:"); - Eigen::Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + Eigen::Matrix j3Mat; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) { - const double t = 1. / h; - J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, - v(0) * v2x2_inv * t, v(1) * v2x2_inv * t, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; + const double t = 1. 
/ tempH; + j3Mat << -v2x2_inv, 0, vVec(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, vVec(1) * sqr(v2x2_inv) * 2., 0, + vVec(0) * v2x2_inv * t, vVec(1) * v2x2_inv * t, + -tempH * sqr(v2x2_inv) * 2. - (2. * tempC + vVec(2)) * v2x2_inv * t, -t; } - printIt(&J3, "circle_fit - J3:"); + printIt(&j3Mat, "circle_fit - J3:"); - const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) + const RowVector2Nd Jq = mc.transpose() * tempS * 1. / n; // var(q) printIt(&Jq, "circle_fit - Jq:"); - Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) - + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); + Matrix3d cov_uvr = j3Mat * cvcMat * j3Mat.transpose() * sqr(s_inv) // cov(X0,Y0,R) + + (par_uvr * par_uvr.transpose()) * (Jq * vMat * Jq.transpose()); circle.cov = cov_uvr; } @@ -781,15 +782,15 @@ namespace Rfit { */ template - __host__ __device__ inline line_fit Line_fit(const M3xN& hits, - const M6xN& hits_ge, - const circle_fit& circle, - const V4& fast_fit, - const double B, - const bool error) { + __host__ __device__ inline LineFit lineFit(const M3xN& hits, + const M6xN& hits_ge, + const CircleFit& circle, + const V4& fast_fit, + const double bField, + const bool error) { constexpr uint32_t N = M3xN::ColsAtCompileTime; constexpr auto n = N; - double theta = -circle.q * atan(fast_fit(3)); + double theta = -circle.qCharge * atan(fast_fit(3)); theta = theta < 0. ? theta + M_PI : theta; // Prepare the Rotation Matrix to rotate the points @@ -805,10 +806,10 @@ namespace Rfit { // z values will be ordinary y-values Matrix2xNd p2D = Matrix2xNd::Zero(); - Eigen::Matrix Jx; + Eigen::Matrix jxMat; #ifdef RFIT_DEBUG - printf("Line_fit - B: %g\n", B); + printf("Line_fit - B: %g\n", bField); printIt(&hits, "Line_fit points: "); printIt(&hits_ge, "Line_fit covs: "); printIt(&rot, "Line_fit rot: "); @@ -818,41 +819,41 @@ namespace Rfit { // Slide 11 // a ==> -o i.e. the origin of the circle in XY plane, negative // b ==> p i.e. distances of the points wrt the origin of the circle. - const Vector2d o(circle.par(0), circle.par(1)); + const Vector2d oVec(circle.par(0), circle.par(1)); // associated Jacobian, used in weights and errors computation - Matrix6d Cov = Matrix6d::Zero(); + Matrix6d covMat = Matrix6d::Zero(); Matrix2d cov_sz[N]; - for (u_int i = 0; i < n; ++i) { - Vector2d p = hits.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); + for (uint i = 0; i < n; ++i) { + Vector2d pVec = hits.block(0, i, 2, 1) - oVec; + const double cross = cross2D(-oVec, pVec); + const double dot = (-oVec).dot(pVec); // atan2(cross, dot) give back the angle in the transverse plane so tha the // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) - const double atan2_ = -circle.q * atan2(cross, dot); + const double tempQAtan2 = -circle.qCharge * atan2(cross, dot); // p2D.coeffRef(1, i) = atan2_ * circle.par(2); - p2D(0, i) = atan2_ * circle.par(2); + p2D(0, i) = tempQAtan2 * circle.par(2); // associated Jacobian, used in weights and errors- computation - const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); + const double temp0 = -circle.qCharge * circle.par(2) * 1. 
/ (sqr(dot) + sqr(cross)); double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta if (error) { - d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); - d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); - d_R = atan2_; + d_X0 = -temp0 * ((pVec(1) + oVec(1)) * dot - (pVec(0) - oVec(0)) * cross); + d_Y0 = temp0 * ((pVec(0) + oVec(0)) * dot - (oVec(1) - pVec(1)) * cross); + d_R = tempQAtan2; } - const double d_x = temp0 * (o(1) * dot + o(0) * cross); - const double d_y = temp0 * (-o(0) * dot + o(1) * cross); - Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; - - Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = hits_ge.col(i)[0]; // x errors - Cov(4, 4) = hits_ge.col(i)[2]; // y errors - Cov(5, 5) = hits_ge.col(i)[5]; // z errors - Cov(3, 4) = Cov(4, 3) = hits_ge.col(i)[1]; // cov_xy - Cov(3, 5) = Cov(5, 3) = hits_ge.col(i)[3]; // cov_xz - Cov(4, 5) = Cov(5, 4) = hits_ge.col(i)[4]; // cov_yz - Matrix2d tmp = Jx * Cov * Jx.transpose(); + const double d_x = temp0 * (oVec(1) * dot + oVec(0) * cross); + const double d_y = temp0 * (-oVec(0) * dot + oVec(1) * cross); + jxMat << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; + + covMat.block(0, 0, 3, 3) = circle.cov; + covMat(3, 3) = hits_ge.col(i)[0]; // x errors + covMat(4, 4) = hits_ge.col(i)[2]; // y errors + covMat(5, 5) = hits_ge.col(i)[5]; // z errors + covMat(3, 4) = covMat(4, 3) = hits_ge.col(i)[1]; // cov_xy + covMat(3, 5) = covMat(5, 3) = hits_ge.col(i)[3]; // cov_xz + covMat(4, 5) = covMat(5, 4) = hits_ge.col(i)[4]; // cov_yz + Matrix2d tmp = jxMat * covMat * jxMat.transpose(); cov_sz[i].noalias() = rot * tmp * rot.transpose(); } // Math of d_{X0,Y0,R,x,y} all verified by hand @@ -861,7 +862,7 @@ namespace Rfit { // The following matrix will contain errors orthogonal to the rotated S // component only, with the Multiple Scattering properly treated!! MatrixNd cov_with_ms; - Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B, cov_with_ms); + scatterCovLine(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, bField, cov_with_ms); #ifdef RFIT_DEBUG printIt(cov_sz, "line_fit - cov_sz:"); printIt(&cov_with_ms, "line_fit - cov_with_ms: "); @@ -880,52 +881,54 @@ namespace Rfit { #endif // Build the A Matrix - Matrix2xNd A; - A << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values + Matrix2xNd aMat; + aMat << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values #ifdef RFIT_DEBUG - printIt(&A, "A Matrix:"); + printIt(&aMat, "A Matrix:"); #endif // Build A^T V-1 A, where V-1 is the covariance of only the Y components. - MatrixNd Vy_inv; - math::cholesky::invert(cov_with_ms, Vy_inv); - // MatrixNd Vy_inv = cov_with_ms.inverse(); - Eigen::Matrix Cov_params = A * Vy_inv * A.transpose(); + MatrixNd vyInvMat; + math::cholesky::invert(cov_with_ms, vyInvMat); + // MatrixNd vyInvMat = cov_with_ms.inverse(); + Eigen::Matrix covParamsMat = aMat * vyInvMat * aMat.transpose(); // Compute the Covariance Matrix of the fit parameters - math::cholesky::invert(Cov_params, Cov_params); + math::cholesky::invert(covParamsMat, covParamsMat); // Now Compute the Parameters in the form [2,1] // The first component is q. // The second component is m. - Eigen::Matrix sol = Cov_params * A * Vy_inv * p2D_rot.row(1).transpose(); + Eigen::Matrix sol = covParamsMat * aMat * vyInvMat * p2D_rot.row(1).transpose(); #ifdef RFIT_DEBUG printIt(&sol, "Rotated solutions:"); #endif // We need now to transfer back the results in the original s-z plane - auto common_factor = 1. 
/ (sin(theta) - sol(1, 0) * cos(theta)); - Eigen::Matrix J; - J << 0., common_factor * common_factor, common_factor, sol(0, 0) * cos(theta) * common_factor * common_factor; + const auto sinTheta = sin(theta); + const auto cosTheta = cos(theta); + auto common_factor = 1. / (sinTheta - sol(1, 0) * cosTheta); + Eigen::Matrix jMat; + jMat << 0., common_factor * common_factor, common_factor, sol(0, 0) * cosTheta * common_factor * common_factor; - double m = common_factor * (sol(1, 0) * sin(theta) + cos(theta)); - double q = common_factor * sol(0, 0); - auto cov_mq = J * Cov_params * J.transpose(); + double tempM = common_factor * (sol(1, 0) * sinTheta + cosTheta); + double tempQ = common_factor * sol(0, 0); + auto cov_mq = jMat * covParamsMat * jMat.transpose(); - VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; - double chi2 = res.transpose() * Vy_inv * res; + VectorNd res = p2D_rot.row(1).transpose() - aMat.transpose() * sol; + double chi2 = res.transpose() * vyInvMat * res; - line_fit line; - line.par << m, q; + LineFit line; + line.par << tempM, tempQ; line.cov << cov_mq; line.chi2 = chi2; #ifdef RFIT_DEBUG printf("Common_factor: %g\n", common_factor); - printIt(&J, "Jacobian:"); + printIt(&jMat, "Jacobian:"); printIt(&sol, "Rotated solutions:"); - printIt(&Cov_params, "Cov_params:"); + printIt(&covParamsMat, "Cov_params:"); printIt(&cov_mq, "Rotated Covariance Matrix:"); printIt(&(line.par), "Real Parameters:"); printIt(&(line.cov), "Real Covariance Matrix:"); @@ -959,7 +962,7 @@ namespace Rfit { |(x0,z0)|(x1,z0)|(x2,z0)|.|(y0,z0)|(y1,z0)|(y2,z0)|.|(z0,z0)|(z1,z0)|(z2,z0)| \n |(x0,z1)|(x1,z1)|(x2,z1)|.|(y0,z1)|(y1,z1)|(y2,z1)|.|(z0,z1)|(z1,z1)|(z2,z1)| \n |(x0,z2)|(x1,z2)|(x2,z2)|.|(y0,z2)|(y1,z2)|(y2,z2)|.|(z0,z2)|(z1,z2)|(z2,z2)| - \param B magnetic field in the center of the detector in Gev/cm/c + \param bField magnetic field in the center of the detector in Gev/cm/c unit, in order to perform pt calculation. \param error flag for error computation. \param scattering flag for multiple scattering treatment. @@ -969,37 +972,37 @@ namespace Rfit { */ template - inline helix_fit Helix_fit(const Matrix3xNd& hits, - const Eigen::Matrix& hits_ge, - const double B, - const bool error) { - constexpr u_int n = N; + inline HelixFit helixFit(const Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double bField, + const bool error) { + constexpr uint n = N; VectorNd<4> rad = (hits.block(0, 0, 2, n).colwise().norm()); // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. 
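Downstream callers only need the combined entry point. The sketch below shows how the renamed helixFit would be driven for a single quadruplet; the N = 4 template parameter and the (3xN double, 6xN float) hit layouts are assumptions taken from how this patch's own fitter and test code call it, and fitOneQuadruplet is a hypothetical helper.

  #include <Eigen/Core>
  #include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h"

  // Hypothetical driver: fit one quadruplet of hits with the renamed interface.
  void fitOneQuadruplet(const riemannFit::Matrix3xNd<4>& hits,      // x, y, z of the 4 hits
                        const Eigen::Matrix<float, 6, 4>& hits_ge,  // packed per-hit covariance
                        double bField) {
    riemannFit::HelixFit helix = riemannFit::helixFit(hits, hits_ge, bField, /*error=*/true);
    // helix.par stacks circle.par and line.par (see below), helix.qCharge is the fitted
    // charge, and helix.chi2_circle / helix.chi2_line hold the two chi2 values.
  }
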
Vector4d fast_fit; - Fast_fit(hits, fast_fit); - Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); - Rfit::loadCovariance2D(hits_ge, hits_cov); - circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), hits_cov, fast_fit, rad, B, error); - line_fit line = Line_fit(hits, hits_ge, circle, fast_fit, B, error); + fastFit(hits, fast_fit); + riemannFit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + riemannFit::loadCovariance2D(hits_ge, hits_cov); + CircleFit circle = circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit, rad, bField, error); + LineFit line = lineFit(hits, hits_ge, circle, fast_fit, bField, error); - par_uvrtopak(circle, B, error); + par_uvrtopak(circle, bField, error); - helix_fit helix; + HelixFit helix; helix.par << circle.par, line.par; if (error) { helix.cov = MatrixXd::Zero(5, 5); helix.cov.block(0, 0, 3, 3) = circle.cov; helix.cov.block(3, 3, 2, 2) = line.cov; } - helix.q = circle.q; + helix.qCharge = circle.qCharge; helix.chi2_circle = circle.chi2; helix.chi2_line = line.chi2; return helix; } -} // namespace Rfit +} // namespace riemannFit #endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc index 67ebe16e86840..f49d2f01f48c6 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc @@ -16,7 +16,7 @@ class PixelNtupletsFitterProducer : public edm::global::EDProducer<> { public: explicit PixelNtupletsFitterProducer(const edm::ParameterSet& iConfig) - : useRiemannFit_(iConfig.getParameter("useRiemannFit")) { + : useRiemannFit_(iConfig.getParameter("useRiemannFit")), idealMagneticFieldToken_(esConsumes()) { produces(); } ~PixelNtupletsFitterProducer() override {} @@ -29,14 +29,14 @@ class PixelNtupletsFitterProducer : public edm::global::EDProducer<> { private: bool useRiemannFit_; + const edm::ESGetToken idealMagneticFieldToken_; void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; }; void PixelNtupletsFitterProducer::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::ESHandle fieldESH; - iSetup.get().get(fieldESH); + auto const& idealField = iSetup.getData(idealMagneticFieldToken_); float bField = 1 / PixelRecoUtilities::fieldInInvGev(iSetup); - auto impl = std::make_unique(bField, fieldESH.product(), useRiemannFit_); + auto impl = std::make_unique(bField, &idealField, useRiemannFit_); auto prod = std::make_unique(std::move(impl)); iEvent.put(std::move(prod)); } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index cdea22c3a8a24..94c490e948575 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -53,10 +53,14 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { private: void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; - edm::EDGetTokenT tBeamSpot_; - edm::EDGetTokenT tokenTrack_; - edm::EDGetTokenT cpuHits_; - edm::EDGetTokenT hmsToken_; + // Event Data tokens + const edm::EDGetTokenT tBeamSpot_; + const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT cpuHits_; + const 
edm::EDGetTokenT hmsToken_; + // Event Setup tokens + const edm::ESGetToken idealMagneticFieldToken_; + const edm::ESGetToken ttTopoToken_; int32_t const minNumberOfHits_; }; @@ -66,6 +70,8 @@ PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iC tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), + idealMagneticFieldToken_(esConsumes()), + ttTopoToken_(esConsumes()), minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) { produces(); produces(); @@ -91,30 +97,22 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, auto indToEdmP = std::make_unique(); auto &indToEdm = *indToEdmP; - edm::ESHandle fieldESH; - iSetup.get().get(fieldESH); + auto const &idealField = iSetup.getData(idealMagneticFieldToken_); pixeltrackfitting::TracksWithRecHits tracks; - edm::ESHandle httopo; - iSetup.get().get(httopo); - edm::Handle bsHandle; - iEvent.getByToken(tBeamSpot_, bsHandle); - const auto &bsh = *bsHandle; - // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; + auto const &httopo = iSetup.getData(ttTopoToken_); + + const auto &bsh = iEvent.get(tBeamSpot_); GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); - edm::Handle gh; - iEvent.getByToken(cpuHits_, gh); - auto const &rechits = *gh; + auto const &rechits = iEvent.get(cpuHits_); std::vector hitmap; auto const &rcs = rechits.data(); auto nhits = rcs.size(); hitmap.resize(nhits, nullptr); - edm::Handle hhms; - iEvent.getByToken(hmsToken_, hhms); - auto const *hitsModuleStart = (*hhms).get(); + auto const *hitsModuleStart = iEvent.get(hmsToken_).get(); auto fc = hitsModuleStart; for (auto const &h : rcs) { @@ -147,8 +145,8 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, break; // this is a guard: maybe we need to move to nTracks... 
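Both producers touched here follow the same accessor migration: event-setup products are no longer fetched through edm::ESHandle inside produce(), but through ESGetToken members registered with esConsumes() in the constructor and read with iSetup.getData(). A condensed sketch of the pattern with a hypothetical module; the concrete tokens and records are the ones added in this diff.

  #include "FWCore/Framework/interface/global/EDProducer.h"
  #include "FWCore/Framework/interface/Event.h"
  #include "FWCore/Framework/interface/EventSetup.h"
  #include "FWCore/ParameterSet/interface/ParameterSet.h"
  #include "FWCore/Utilities/interface/ESGetToken.h"
  #include "MagneticField/Engine/interface/MagneticField.h"
  #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"

  // Hypothetical module illustrating the esConsumes()/getData() pattern used in this patch.
  class ExampleFieldUser : public edm::global::EDProducer<> {
  public:
    explicit ExampleFieldUser(edm::ParameterSet const&)
        : magFieldToken_(esConsumes()) {}  // token registered once, at construction time

    void produce(edm::StreamID, edm::Event&, edm::EventSetup const& iSetup) const override {
      MagneticField const& field = iSetup.getData(magFieldToken_);  // no ESHandle needed
      // ... use `field` ...
    }

  private:
    const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> magFieldToken_;
  };
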
indToEdm.push_back(-1); auto q = quality[it]; - if (q != trackQuality::loose) - continue; // FIXME + if (q != pixelTrack::Quality::loose) + continue; if (nHits < minNumberOfHits_) continue; indToEdm.back() = nt; @@ -164,10 +162,10 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, float chi2 = tsoa.chi2(it); float phi = tsoa.phi(it); - Rfit::Vector5d ipar, opar; - Rfit::Matrix5d icov, ocov; + riemannFit::Vector5d ipar, opar; + riemannFit::Matrix5d icov, ocov; fit.copyToDense(ipar, icov, it); - Rfit::transformToPerigeePlane(ipar, icov, opar, ocov); + riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); AlgebraicSymMatrix55 m; @@ -180,16 +178,14 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0); Plane impPointPlane(bs, rot); - GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), - impPointPlane.toGlobal(lpar.momentum()), - lpar.charge(), - fieldESH.product()); - JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, *fieldESH.product()); + GlobalTrajectoryParameters gp( + impPointPlane.toGlobal(lpar.position()), impPointPlane.toGlobal(lpar.momentum()), lpar.charge(), &idealField); + JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, idealField); AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m); int ndof = 2 * hits.size() - 5; - chi2 = chi2 * ndof; // FIXME + chi2 = chi2 * ndof; GlobalPoint vv = gp.position(); math::XYZPoint pos(vv.x(), vv.y(), vv.z()); GlobalVector pp = gp.momentum(); @@ -202,7 +198,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl; // store tracks - storeTracks(iEvent, tracks, *httopo); + storeTracks(iEvent, tracks, httopo); iEvent.put(std::move(indToEdmP)); } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index c8310bc645db3..2de8ec6c335b5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -17,6 +17,9 @@ #include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +// Switch on to enable checks and printout for found tracks +#undef PIXEL_DEBUG_PRODUCE + class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { public: explicit PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig); @@ -33,7 +36,7 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr m_soa; + cms::cuda::host::unique_ptr soa_; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) @@ -54,29 +57,30 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - m_soa = inputData.toHostAsync(ctx.stream()); + soa_ = inputData.toHostAsync(ctx.stream()); } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { - /* - auto const & tsoa = *m_soa; +#ifdef PIXEL_DEBUG_PRODUCE + auto const& tsoa = *soa_; auto maxTracks = tsoa.stride(); std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks 
<< std::endl; int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { auto nHits = tsoa.nHits(it); - assert(nHits==int(tsoa.hitIndices.size(it))); - if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... + assert(nHits == int(tsoa.hitIndices.size(it))); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... nt++; } std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl; - */ +#endif // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(soa_))); - assert(!m_soa); + assert(!soa_); } DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc index 32a9aeb982094..96f5d5fe03448 100644 --- a/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc +++ b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc @@ -26,7 +26,7 @@ PixelNtupletsFitter::PixelNtupletsFitter(float nominalB, const MagneticField* fi std::unique_ptr PixelNtupletsFitter::run(const std::vector& hits, const TrackingRegion& region, const edm::EventSetup&) const { - using namespace Rfit; + using namespace riemannFit; std::unique_ptr ret; @@ -47,7 +47,7 @@ std::unique_ptr PixelNtupletsFitter::run(const std::vector hits_gp; + riemannFit::Matrix3xNd<4> hits_gp; Eigen::Matrix hits_ge = Eigen::Matrix::Zero(); @@ -58,10 +58,10 @@ std::unique_ptr PixelNtupletsFitter::run(const std::vector - + - + @@ -55,14 +55,14 @@ - + - + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc similarity index 90% rename from RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc rename to RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc index 5395b93629f49..e5a652e9d43f8 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackRiemannFit.cc +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc @@ -18,15 +18,15 @@ using namespace std; using namespace Eigen; -using namespace Rfit; +using namespace riemannFit; using std::unique_ptr; -namespace Rfit { +namespace riemannFit { using Vector3i = Eigen::Matrix; using Vector4i = Eigen::Matrix; using Vector6d = Eigen::Matrix; using Vector8d = Eigen::Matrix; -}; // namespace Rfit +}; // namespace riemannFit // quadruplets... 
struct hits_gen { @@ -64,7 +64,7 @@ void smearing(const Vector5d& err, const bool& isbarrel, double& x, double& y, d if (isbarrel) { double dev_Rp = dist_Rp(generator); double dev_R = dist_R(generator); - double R = sqrt(Rfit::sqr(x) + Rfit::sqr(y)); + double R = sqrt(riemannFit::sqr(x) + riemannFit::sqr(y)); x += dev_Rp * +y / R + dev_R * -x / R; y += dev_Rp * -x / R + dev_R * -y / R; z += dist_z(generator); @@ -83,15 +83,19 @@ void Hits_cov(Eigen::Matrix& V, const Vector5d& err, bool isbarrel) { if (isbarrel) { - double R2 = Rfit::sqr(hits(0, i)) + Rfit::sqr(hits(1, i)); - V.col(i)[0] = (Rfit::sqr(err[1]) * Rfit::sqr(hits(1, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(0, i))) / R2; - V.col(i)[2] = (Rfit::sqr(err[1]) * Rfit::sqr(hits(0, i)) + Rfit::sqr(err[0]) * Rfit::sqr(hits(1, i))) / R2; - V.col(i)[1] = (Rfit::sqr(err[0]) - Rfit::sqr(err[1])) * hits(1, i) * hits(0, i) / R2; - V.col(i)[5] = Rfit::sqr(err[2]); + double R2 = riemannFit::sqr(hits(0, i)) + riemannFit::sqr(hits(1, i)); + V.col(i)[0] = (riemannFit::sqr(err[1]) * riemannFit::sqr(hits(1, i)) + + riemannFit::sqr(err[0]) * riemannFit::sqr(hits(0, i))) / + R2; + V.col(i)[2] = (riemannFit::sqr(err[1]) * riemannFit::sqr(hits(0, i)) + + riemannFit::sqr(err[0]) * riemannFit::sqr(hits(1, i))) / + R2; + V.col(i)[1] = (riemannFit::sqr(err[0]) - riemannFit::sqr(err[1])) * hits(1, i) * hits(0, i) / R2; + V.col(i)[5] = riemannFit::sqr(err[2]); } else { - V.col(i)[0] = Rfit::sqr(err[3]); - V.col(i)[2] = Rfit::sqr(err[3]); - V.col(i)[5] = Rfit::sqr(err[4]); + V.col(i)[0] = riemannFit::sqr(err[3]); + V.col(i)[2] = riemannFit::sqr(err[3]); + V.col(i)[5] = riemannFit::sqr(err[4]); } } @@ -118,15 +122,15 @@ hits_gen Hits_gen(const unsigned int& n, const Matrix& gen_par) { for (unsigned int i = 0; i < n; ++i) { const double a = gen_par(4); const double b = rad[i]; - const double c = sqrt(Rfit::sqr(x2) + Rfit::sqr(y2)); - const double beta = acos((Rfit::sqr(a) - Rfit::sqr(b) - Rfit::sqr(c)) / (-2. * b * c)); + const double c = sqrt(riemannFit::sqr(x2) + riemannFit::sqr(y2)); + const double beta = acos((riemannFit::sqr(a) - riemannFit::sqr(b) - riemannFit::sqr(c)) / (-2. * b * c)); const double gamma = alpha + beta; gen.hits(0, i) = rad[i] * cos(gamma); gen.hits(1, i) = rad[i] * sin(gamma); gen.hits(2, i) = gen_par(2) + 1 / tan(gen_par(5) * pi / 180) * 2. * - asin(sqrt(Rfit::sqr((gen_par(0) - gen.hits(0, i))) + Rfit::sqr((gen_par(1) - gen.hits(1, i)))) / + asin(sqrt(riemannFit::sqr((gen_par(0) - gen.hits(0, i))) + riemannFit::sqr((gen_par(1) - gen.hits(1, i)))) / (2. * gen_par(4))) * gen_par(4); // isbarrel(i) = ?? @@ -143,10 +147,10 @@ Vector5d True_par(const Matrix& gen_par, const int& charge, const Vector5d true_par; const double x0 = gen_par(0) + gen_par(4) * cos(gen_par(3) * pi / 180); const double y0 = gen_par(1) + gen_par(4) * sin(gen_par(3) * pi / 180); - circle_fit circle; + CircleFit circle; circle.par << x0, y0, gen_par(4); - circle.q = 1; - Rfit::par_uvrtopak(circle, B_field, false); + circle.qCharge = 1; + riemannFit::par_uvrtopak(circle, B_field, false); true_par.block(0, 0, 3, 1) = circle.par; true_par(3) = 1 / tan(gen_par(5) * pi / 180); const int dir = ((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1)) * (gen_par(1) - y0) - @@ -155,8 +159,8 @@ Vector5d True_par(const Matrix& gen_par, const int& charge, const ? 
-1 : 1; true_par(4) = gen_par(2) + 1 / tan(gen_par(5) * pi / 180) * dir * 2.f * - asin(sqrt(Rfit::sqr((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1))) + - Rfit::sqr((gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)))) / + asin(sqrt(riemannFit::sqr((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1))) + + riemannFit::sqr((gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)))) / (2.f * gen_par(4))) * gen_par(4); return true_par; @@ -365,7 +369,7 @@ void test_helix_fit(bool getcin) { const int iteration = 5000; gen_par = New_par(gen_par, 1, B_field); true_par = True_par(gen_par, 1, B_field); - std::array helixRiemann_fit; + std::array helixRiemann_fit; std::cout << "\nTrue parameters: " << "phi: " << true_par(0) << " " @@ -387,9 +391,9 @@ void test_helix_fit(bool getcin) { delta -= std::chrono::high_resolution_clock::now() - start; helixRiemann_fit[i % iteration] = #ifdef USE_BL - BrokenLine::BL_Helix_fit(gen.hits, gen.hits_ge, B_field); + brokenline::helixFit(gen.hits, gen.hits_ge, B_field); #else - Rfit::Helix_fit(gen.hits, gen.hits_ge, B_field, true); + riemannFit::helixFit(gen.hits, gen.hits_ge, B_field, true); #endif delta += std::chrono::high_resolution_clock::now() - start; @@ -406,7 +410,7 @@ void test_helix_fit(bool getcin) { << true_par(3) << endl << "Zip: " << helixRiemann_fit[i].par(4) << " +/- " << sqrt(helixRiemann_fit[i].cov(4, 4)) << " vs " << true_par(4) << endl - << "charge:" << helixRiemann_fit[i].q << " vs 1" << endl + << "charge:" << helixRiemann_fit[i].qCharge << " vs 1" << endl << "covariance matrix:" << endl << helixRiemann_fit[i].cov << endl << "Initial hits:\n" diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index f0b641361aee4..d5eba9be26594 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -16,7 +16,7 @@ using namespace Eigen; -namespace Rfit { +namespace riemannFit { constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; } constexpr uint32_t stride() { return maxNumberOfTracks(); } // hits @@ -32,32 +32,32 @@ namespace Rfit { // fast fit using Map4d = Eigen::Map>; -} // namespace Rfit +} // namespace riemannFit template __global__ void kernelPrintSizes(double* __restrict__ phits, float* __restrict__ phits_ge) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits + i, 3, 4); - Rfit::Map6xNf hits_ge(phits_ge + i, 6, 4); + riemannFit::Map3xNd hits(phits + i, 3, 4); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, 4); if (i != 0) return; printf("GPU sizes %lu %lu %lu %lu %lu\n", sizeof(hits[i]), sizeof(hits_ge[i]), sizeof(Vector4d), - sizeof(Rfit::line_fit), - sizeof(Rfit::circle_fit)); + sizeof(riemannFit::LineFit), + sizeof(riemannFit::CircleFit)); } template __global__ void kernelFastFit(double* __restrict__ phits, double* __restrict__ presults) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits + i, 3, N); - Rfit::Map4d result(presults + i, 4); + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map4d result(presults + i, 4); #ifdef USE_BL - BrokenLine::BL_Fast_fit(hits, result); + brokenline::fastFit(hits, result); #else - Rfit::Fast_fit(hits, result); + riemannFit::fastFit(hits, result); #endif } @@ -68,24 +68,24 @@ __global__ void kernelBrokenLineFit(double* __restrict__ phits, float* __restrict__ phits_ge, double* __restrict__ pfast_fit_input, double B, - Rfit::circle_fit* circle_fit, - Rfit::line_fit* line_fit) { + 
riemannFit::CircleFit* circle_fit, + riemannFit::LineFit* line_fit) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits + i, 3, N); - Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4); - Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, N); - BrokenLine::PreparedBrokenLineData data; - Rfit::Matrix3d Jacob; + brokenline::PreparedBrokenLineData data; + riemannFit::Matrix3d Jacob; auto& line_fit_results = line_fit[i]; auto& circle_fit_results = circle_fit[i]; - BrokenLine::prepareBrokenLineData(hits, fast_fit_input, B, data); - BrokenLine::BL_Line_fit(hits_ge, fast_fit_input, B, data, line_fit_results); - BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_input, B, data, circle_fit_results); + brokenline::prepareBrokenLineData(hits, fast_fit_input, B, data); + brokenline::lineFit(hits_ge, fast_fit_input, B, data, line_fit_results); + brokenline::circleFit(hits, hits_ge, fast_fit_input, B, data, circle_fit_results); Jacob << 1., 0, 0, 0, 1., 0, 0, 0, - -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); @@ -99,21 +99,21 @@ __global__ void kernelBrokenLineFit(double* __restrict__ phits, #else template -__global__ void kernelCircleFit(double* __restrict__ phits, - float* __restrict__ phits_ge, - double* __restrict__ pfast_fit_input, - double B, - Rfit::circle_fit* circle_fit_resultsGPU) { +__global__ void kernel_CircleFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double* __restrict__ pfast_fit_input, + double B, + riemannFit::CircleFit* circle_fit_resultsGPU) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits + i, 3, N); - Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4); - Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, N); constexpr auto n = N; - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); - Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); - Rfit::loadCovariance2D(hits_ge, hits_cov); + riemannFit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + riemannFit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + riemannFit::loadCovariance2D(hits_ge, hits_cov); #ifdef TEST_DEBUG if (0 == i) { @@ -133,7 +133,7 @@ __global__ void kernelCircleFit(double* __restrict__ phits, printf("B: %f\n", B); } #endif - circle_fit_resultsGPU[i] = Rfit::Circle_fit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true); + circle_fit_resultsGPU[i] = riemannFit::circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true); #ifdef TEST_DEBUG if (0 == i) { printf("Circle param %f,%f,%f\n", @@ -148,14 +148,14 @@ template __global__ void kernelLineFit(double* __restrict__ phits, float* __restrict__ phits_ge, double B, - Rfit::circle_fit* circle_fit, + riemannFit::CircleFit* circle_fit, double* __restrict__ pfast_fit_input, - Rfit::line_fit* line_fit) { + riemannFit::LineFit* line_fit) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits + i, 3, N); - Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4); - 
Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); - line_fit[i] = Rfit::Line_fit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true); + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, N); + line_fit[i] = riemannFit::lineFit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true); } #endif @@ -204,8 +204,8 @@ __device__ __host__ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) { template __global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phits_ge) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - Rfit::Map3xNd hits(phits + i, 3, N); - Rfit::Map6xNf hits_ge(phits_ge + i, 6, N); + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, N); hits_ge = MatrixXf::Zero(6, N); fillHitsAndHitsCov(hits, hits_ge); } @@ -213,22 +213,22 @@ __global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phit template void testFit() { constexpr double B = 0.0113921; - Rfit::Matrix3xNd hits; - Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); + riemannFit::Matrix3xNd hits; + riemannFit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); double* hitsGPU = nullptr; ; float* hits_geGPU = nullptr; double* fast_fit_resultsGPU = nullptr; - double* fast_fit_resultsGPUret = new double[Rfit::maxNumberOfTracks() * sizeof(Vector4d)]; - Rfit::circle_fit* circle_fit_resultsGPU = nullptr; - Rfit::circle_fit* circle_fit_resultsGPUret = new Rfit::circle_fit(); - Rfit::line_fit* line_fit_resultsGPU = nullptr; - Rfit::line_fit* line_fit_resultsGPUret = new Rfit::line_fit(); + double* fast_fit_resultsGPUret = new double[riemannFit::maxNumberOfTracks() * sizeof(Vector4d)]; + riemannFit::CircleFit* circle_fit_resultsGPU = nullptr; + riemannFit::CircleFit* circle_fit_resultsGPUret = new riemannFit::CircleFit(); + riemannFit::LineFit* line_fit_resultsGPU = nullptr; + riemannFit::LineFit* line_fit_resultsGPUret = new riemannFit::LineFit(); fillHitsAndHitsCov(hits, hits_ge); std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << ' ' - << sizeof(Rfit::line_fit) << ' ' << sizeof(Rfit::circle_fit) << std::endl; + << sizeof(riemannFit::LineFit) << ' ' << sizeof(riemannFit::CircleFit) << std::endl; std::cout << "Generated hits:\n" << hits << std::endl; std::cout << "Generated cov:\n" << hits_ge << std::endl; @@ -236,23 +236,23 @@ void testFit() { // FAST_FIT_CPU #ifdef USE_BL Vector4d fast_fit_results; - BrokenLine::BL_Fast_fit(hits, fast_fit_results); + brokenline::fastFit(hits, fast_fit_results); #else Vector4d fast_fit_results; - Rfit::Fast_fit(hits, fast_fit_results); + riemannFit::fastFit(hits, fast_fit_results); #endif std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; // for timing purposes we fit 4096 tracks constexpr uint32_t Ntracks = 4096; - cudaCheck(cudaMalloc(&hitsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::Matrix3xNd))); - cudaCheck(cudaMalloc(&hits_geGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::Matrix6xNf))); - cudaCheck(cudaMalloc(&fast_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Vector4d))); - cudaCheck(cudaMalloc(&line_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::line_fit))); - cudaCheck(cudaMalloc(&circle_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::circle_fit))); + cudaCheck(cudaMalloc(&hitsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix3xNd))); + cudaCheck(cudaMalloc(&hits_geGPU, 
riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix6xNf))); + cudaCheck(cudaMalloc(&fast_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(Vector4d))); + cudaCheck(cudaMalloc(&line_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit))); + cudaCheck(cudaMalloc(&circle_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::CircleFit))); - cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, Rfit::maxNumberOfTracks() * sizeof(Vector4d))); - cudaCheck(cudaMemset(line_fit_resultsGPU, 0, Rfit::maxNumberOfTracks() * sizeof(Rfit::line_fit))); + cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(Vector4d))); + cudaCheck(cudaMemset(line_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit))); kernelPrintSizes<<>>(hitsGPU, hits_geGPU); kernelFillHitsAndHitsCov<<>>(hitsGPU, hits_geGPU); @@ -263,23 +263,23 @@ void testFit() { cudaCheck(cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, - Rfit::maxNumberOfTracks() * sizeof(Vector4d), + riemannFit::maxNumberOfTracks() * sizeof(Vector4d), cudaMemcpyDeviceToHost)); - Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4); + riemannFit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4); std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl; assert(isEqualFuzzy(fast_fit_results, fast_fit)); #ifdef USE_BL // CIRCLE AND LINE FIT CPU - BrokenLine::PreparedBrokenLineData data; - BrokenLine::karimaki_circle_fit circle_fit_results; - Rfit::line_fit line_fit_results; - Rfit::Matrix3d Jacob; - BrokenLine::prepareBrokenLineData(hits, fast_fit_results, B, data); - BrokenLine::BL_Line_fit(hits_ge, fast_fit_results, B, data, line_fit_results); - BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); + brokenline::PreparedBrokenLineData data; + brokenline::karimaki_circle_fit circle_fit_results; + riemannFit::LineFit line_fit_results; + riemannFit::Matrix3d Jacob; + brokenline::prepareBrokenLineData(hits, fast_fit_results, B, data); + brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results); + brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); Jacob << 1., 0, 0, 0, 1., 0, 0, 0, - -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); @@ -290,19 +290,20 @@ void testFit() { #else // CIRCLE_FIT CPU - Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + riemannFit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); - Rfit::loadCovariance2D(hits_ge, hits_cov); - Rfit::circle_fit circle_fit_results = - Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); + riemannFit::Matrix2Nd hits_cov = riemannFit::Matrix2Nd::Zero(); + riemannFit::loadCovariance2D(hits_ge, hits_cov); + riemannFit::CircleFit circle_fit_results = + riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); // CIRCLE_FIT GPU - kernelCircleFit<<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU); + kernel_CircleFit<<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU); cudaDeviceSynchronize(); // LINE_FIT CPU - Rfit::line_fit line_fit_results = 
Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); + riemannFit::LineFit line_fit_results = + riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); kernelLineFit <<>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); @@ -311,14 +312,15 @@ void testFit() { std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; - cudaCheck( - cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy( + circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(riemannFit::CircleFit), cudaMemcpyDeviceToHost)); std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; // LINE_FIT GPU - cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost)); + cudaCheck( + cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(riemannFit::LineFit), cudaMemcpyDeviceToHost)); std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6)); // requires fma on CPU diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp index 709757a803884..a8e040fa0df38 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp @@ -1,8 +1,8 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include -using Rfit::Matrix5d; -using Rfit::Vector5d; +using riemannFit::Matrix5d; +using riemannFit::Vector5d; #include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" @@ -76,7 +76,7 @@ int main() { // Matrix5d covf = transfFast(cov0,par0); - Rfit::transformToPerigeePlane(par0, cov0, par1, cov1); + riemannFit::transformToPerigeePlane(par0, cov0, par1, cov1); std::cout << "cov1\n" << cov1 << std::endl; diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp similarity index 78% rename from RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp rename to RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp index 370828c4fcef9..7c0dab3be3e00 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testRiemannFit.cpp +++ b/RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp @@ -13,7 +13,7 @@ using namespace Eigen; -namespace Rfit { +namespace riemannFit { constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; } constexpr uint32_t stride() { return maxNumberOfTracks(); } // hits @@ -29,7 +29,7 @@ namespace Rfit { // fast fit using Map4d = Eigen::Map >; -} // namespace Rfit +} // namespace riemannFit /* Hit global: 641,0 2: 2.934787,0.773211,-10.980247 @@ -89,8 +89,8 @@ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) { template void testFit() { constexpr double B = 0.0113921; - Rfit::Matrix3xNd hits; - Rfit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); + riemannFit::Matrix3xNd hits; + riemannFit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); fillHitsAndHitsCov(hits, hits_ge); @@ -102,37 +102,38 @@ void testFit() { // FAST_FIT_CPU #ifdef USE_BL Vector4d fast_fit_results; - 
BrokenLine::BL_Fast_fit(hits, fast_fit_results); + brokenline::fastFit(hits, fast_fit_results); #else Vector4d fast_fit_results; - Rfit::Fast_fit(hits, fast_fit_results); + riemannFit::fastFit(hits, fast_fit_results); #endif std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; // CIRCLE_FIT CPU #ifdef USE_BL - BrokenLine::PreparedBrokenLineData data; - BrokenLine::karimaki_circle_fit circle_fit_results; - Rfit::Matrix3d Jacob; - - BrokenLine::prepareBrokenLineData(hits, fast_fit_results, B, data); - Rfit::line_fit line_fit_results; - BrokenLine::BL_Line_fit(hits_ge, fast_fit_results, B, data, line_fit_results); - BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); + brokenline::PreparedBrokenLineData data; + brokenline::karimaki_circle_fit circle_fit_results; + riemannFit::Matrix3d Jacob; + + brokenline::prepareBrokenLineData(hits, fast_fit_results, B, data); + riemannFit::LineFit line_fit_results; + brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results); + brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); Jacob << 1., 0, 0, 0, 1., 0, 0, 0, - -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); #else - Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); - Rfit::loadCovariance2D(hits_ge, hits_cov); - Rfit::circle_fit circle_fit_results = - Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); + riemannFit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + riemannFit::Matrix2Nd hits_cov = riemannFit::Matrix2Nd::Zero(); + riemannFit::loadCovariance2D(hits_ge, hits_cov); + riemannFit::CircleFit circle_fit_results = + riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); // LINE_FIT CPU - Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); - Rfit::par_uvrtopak(circle_fit_results, B, true); + riemannFit::LineFit line_fit_results = + riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); + riemannFit::par_uvrtopak(circle_fit_results, B, true); #endif diff --git a/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h index dfe7da010f99e..986fe2e2992b9 100644 --- a/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h +++ b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h @@ -80,16 +80,6 @@ constexpr void CircleEq::compute(T x1, T y1, T x2, T y2, T x3, T y3) { auto num = x1p * y3p - y1p * x3p; // num also gives correct sign for CT auto det = d12 * y3p - d32 * y1p; - /* - auto ct = num/det; - auto sn = det>0 ? T(1.) : T(-1.); - auto st2 = (d12*x3p-d32*x1p)/det; - auto seq = T(1.) +st2*st2; - auto al2 = sn/std::sqrt(seq); - auto be2 = -st2*al2; - ct *= T(2.)*al2; - */ - auto st2 = (d12 * x3p - d32 * x1p); auto seq = det * det + st2 * st2; auto al2 = T(1.) 
/ std::sqrt(seq); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index cc5865d97fd95..bebfe0e08008e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,67 +1,69 @@ #include "BrokenLineFitOnGPU.h" void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { - assert(tuples_d); + assert(tuples_); // Fit internals - auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hitsGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float)); auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernelBLFastFit<3>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + kernel_BLFastFit<3>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - kernelBLFit<3>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); + kernel_BLFit<3>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); // fit quads - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + kernel_BLFastFit<4>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + kernel_BLFit<4>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<4>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernel_BLFit<4>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); } else { // fit penta (all 5) - kernelBLFastFit<5>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<5>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - kernelBLFit<5>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + 
kernel_BLFit<5>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); } } // loop on concurrent fits diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 6fc537237286f..d2ca583e86bd0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -5,79 +5,79 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t stream) { - assert(tuples_d); + assert(tuples_); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals auto hitsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); auto hits_geGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernelBLFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + kernel_BLFastFit<3><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<3><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); + kernel_BLFit<3><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); cudaCheck(cudaGetLastError()); // fit quads - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + kernel_BLFastFit<4><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + kernel_BLFit<4><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<4><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernel_BLFit<4><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); 
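      // Note on the pair of launches above: the template parameter (N = 4) fixes how many
      // hits enter the fit, while the next-to-last runtime argument (5) selects the
      // tuple-multiplicity bin, so pentuplets are fitted using only their first four hits
      // when fit5as4_ is set; the kernels guard this with assert(hitsInFit <= nHits).
      // A minimal sketch of the same call pattern, with placeholder buffers and launch
      // parameters (blocks, threads, hits, hits_ge, fastFit are illustrative names only):
      //   kernel_BLFastFit<4><<<blocks, threads, 0, stream>>>(
      //       tuples_, tupleMultiplicity_, hv, hits, hits_ge, fastFit, /*nHits=*/5, offset);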
cudaCheck(cudaGetLastError()); } else { // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<5><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<5><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernel_BLFit<5><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 96a641829d797..ee5065e81fc45 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -23,14 +23,14 @@ using OutputSoA = pixelTrack::TrackSoA; // #define BL_DUMP_HITS template -__global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, - CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitsOnGPU const *__restrict__ hhp, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { +__global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, + caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { constexpr uint32_t hitsInFit = N; assert(hitsInFit <= nHits); @@ -50,7 +50,7 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, } #endif - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) @@ -62,9 +62,9 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, assert(foundNtuplets->size(tkid) == nHits); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); #ifdef BL_DUMP_HITS __shared__ int done; @@ -105,7 +105,7 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } - BrokenLine::BL_Fast_fit(hits, fast_fit); + brokenline::fastFit(hits, fast_fit); // no NaN here.... 
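    // The self-comparison below is a standard IEEE-754 NaN guard: NaN is the only value
    // for which x == x is false, so the assert fires only if the fast fit produced a NaN.
    // Minimal illustration of the idiom (assuming <limits> is available):
    //   double x = std::numeric_limits<double>::quiet_NaN();
    //   assert(x == x);  // fires, since NaN never compares equal to itself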
assert(fast_fit(0) == fast_fit(0)); @@ -116,14 +116,14 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, } template -__global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - double B, - OutputSoA *results, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { +__global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + double bField, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { assert(N <= nHits); assert(results); @@ -133,7 +133,7 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) @@ -142,22 +142,21 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t // get it for the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); - BrokenLine::PreparedBrokenLineData data; - Rfit::Matrix3d Jacob; + brokenline::PreparedBrokenLineData data; - BrokenLine::karimaki_circle_fit circle; - Rfit::line_fit line; + brokenline::karimaki_circle_fit circle; + riemannFit::LineFit line; - BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); - BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); - BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + brokenline::prepareBrokenLineData(hits, fast_fit, bField, data); + brokenline::lineFit(hits_ge, fast_fit, bField, data, line); + brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); - results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(B), tkid); - results->pt(tkid) = float(B) / float(std::abs(circle.par(2))); + results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); results->eta(tkid) = asinhf(line.par(0)); results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index d9c3ff70e35ed..5342141d2c9e4 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -12,54 +12,72 @@ //#define ONLY_PHICUT -namespace CAConstants { +// Cellular automaton constants +namespace caConstants { // constants #ifdef ONLY_PHICUT - constexpr uint32_t maxNumberOfTuples() { return 48 * 1024; } - constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } - constexpr uint32_t maxCellsPerHit() { return 8 * 128; } -#else + constexpr uint32_t maxCellNeighbors = 64; + constexpr uint32_t 
maxCellTracks = 64; + constexpr uint32_t maxNumberOfTuples = 48 * 1024; + constexpr uint32_t maxNumberOfDoublets = 2 * 1024 * 1024; + constexpr uint32_t maxCellsPerHit = 8 * 128; +#else // ONLY_PHICUT + constexpr uint32_t maxCellNeighbors = 36; + constexpr uint32_t maxCellTracks = 48; #ifdef GPU_SMALL_EVENTS // kept for testing and debugging - constexpr uint32_t maxNumberOfTuples() { return 3 * 1024; } - constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } - constexpr uint32_t maxCellsPerHit() { return 128 / 2; } -#else + constexpr uint32_t maxNumberOfTuples = 3 * 1024; + constexpr uint32_t maxNumberOfDoublets = 128 * 1024; + constexpr uint32_t maxCellsPerHit = 128 / 2; +#else // GPU_SMALL_EVENTS // tested on MC events with 55-75 pileup events - constexpr uint32_t maxNumberOfTuples() { return 24 * 1024; } - constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; } - constexpr uint32_t maxCellsPerHit() { return 128; } -#endif + constexpr uint32_t maxNumberOfTuples = 24 * 1024; + constexpr uint32_t maxNumberOfDoublets = 512 * 1024; + constexpr uint32_t maxCellsPerHit = 128; +#endif // GPU_SMALL_EVENTS #endif // ONLY_PHICUT - constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; } - constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } + constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; + constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; - constexpr uint32_t maxNumberOfLayerPairs() { return 20; } - constexpr uint32_t maxNumberOfLayers() { return 10; } - constexpr uint32_t maxTuples() { return maxNumberOfTuples(); } + constexpr uint32_t maxNumberOfLayerPairs = 20; + constexpr uint32_t maxNumberOfLayers = 10; + constexpr uint32_t maxTuples = maxNumberOfTuples; + + // Modules constants + constexpr uint32_t max_ladder_bpx0 = 12; + constexpr uint32_t first_ladder_bpx0 = 0; + constexpr float module_length_bpx0 = 6.7f; + constexpr float module_tolerance_bpx0 = 0.4f; // projection to cylinder is inaccurate on BPIX1 + constexpr uint32_t max_ladder_bpx4 = 64; + constexpr uint32_t first_ladder_bpx4 = 84; + constexpr float radius_even_ladder = 15.815f; + constexpr float radius_odd_ladder = 16.146f; + constexpr float module_length_bpx4 = 6.7f; + constexpr float module_tolerance_bpx4 = 0.2f; + constexpr float barrel_z_length = 26.f; + constexpr float forward_z_begin = 32.f; + + // Last indexes + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; // types using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct using tindex_type = uint16_t; // for tuples -#ifdef ONLY_PHICUT - using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; -#else - using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; -#endif + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; using CellNeighborsVector = cms::cuda::SimpleVector; using CellTracksVector = cms::cuda::SimpleVector; - using OuterHitOfCell = cms::cuda::VecArray; - using TuplesContainer = cms::cuda::OneToManyAssoc; + using OuterHitOfCell = cms::cuda::VecArray; + using TuplesContainer = cms::cuda::OneToManyAssoc; using HitToTuple = - cms::cuda::OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = cms::cuda::OneToManyAssoc; + cms::cuda::OneToManyAssoc; // 3.5 should be enough + using TupleMultiplicity = cms::cuda::OneToManyAssoc; -} // namespace CAConstants +} // namespace caConstants #endif // 
RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 3b1ea6fe158b2..beba54c33f513 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -32,7 +32,7 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { private: void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; - bool m_OnGPU; + bool onGPU_; edm::EDGetTokenT> tokenHitGPU_; edm::EDPutTokenT> tokenTrackGPU_; @@ -43,8 +43,8 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { }; CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) - : m_OnGPU(iConfig.getParameter("onGPU")), gpuAlgo_(iConfig, consumesCollector()) { - if (m_OnGPU) { + : onGPU_(iConfig.getParameter("onGPU")), gpuAlgo_(iConfig, consumesCollector()) { + if (onGPU_) { tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); tokenTrackGPU_ = produces>(); @@ -61,16 +61,14 @@ void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descript desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); CAHitNtupletGeneratorOnGPU::fillDescriptions(desc); - auto label = "caHitNtupletCUDA"; - descriptions.add(label, desc); + descriptions.add("caHitNtupletCUDA", desc); } void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { auto bf = 1. / PixelRecoUtilities::fieldInInvGev(es); - if (m_OnGPU) { - edm::Handle> hHits; - iEvent.getByToken(tokenHitGPU_, hHits); + if (onGPU_) { + auto hHits = iEvent.getHandle(tokenHitGPU_); cms::cuda::ScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 1646cb503ff81..c4b8a5a54847f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -25,12 +25,11 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); assert(device_isOuterHitOfCell_.get()); - cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + cellStorage_.reset((unsigned char *)malloc(caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks))); device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); - device_theCellTracksContainer_ = - (GPUCACell::CellTracks *)(cellStorage_.get() + - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * + sizeof(GPUCACell::CellNeighbors)); gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, @@ -40,15 +39,15 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_theCellTracksContainer_); // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); - 
device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); + device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * params_.maxNumberOfDoublets_)); if (0 == nhits) return; // protect against empty events // FIXME avoid magic numbers auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) + if (!params_.includeJumpingForwardDoublets_) nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { + if (params_.minHitsPerNtuplet_ > 3) { nActualPairs = 13; } @@ -60,17 +59,17 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); + params_.idealConditions_, + params_.doClusterCut_, + params_.doZ0Cut_, + params_.doPtCut_, + params_.maxNumberOfDoublets_); } template <> void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); + auto *quality_d = tracks_d->qualityData(); assert(tuples_d && quality_d); @@ -94,14 +93,14 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_nCells_, device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - - if (nhits > 1 && m_params.earlyFishbone_) { + params_.hardCurvCut_, + params_.ptmin_, + params_.CAThetaCutBarrel_, + params_.CAThetaCutForward_, + params_.dcaCutInnerTriplet_, + params_.dcaCutOuterTriplet_); + + if (nhits > 1 && params_.earlyFishbone_) { gpuPixelDoublets::fishbone( hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); } @@ -113,8 +112,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * tuples_d, device_hitTuple_apc_, quality_d, - m_params.minHitsPerNtuplet_); - if (m_params.doStats_) + params_.minHitsPerNtuplet_); + if (params_.doStats_) kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); @@ -126,12 +125,12 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - if (nhits > 1 && m_params.lateFishbone_) { + if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone( hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); } - if (m_params.doStats_) { + if (params_.doStats_) { kernel_checkOverflows(tuples_d, device_tupleMultiplicity_.get(), device_hitToTuple_.get(), @@ -142,7 +141,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, - m_params.maxNumberOfDoublets_, + params_.maxNumberOfDoublets_, counters_); } } @@ -150,12 +149,12 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * template <> void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality 
*)(&tracks_d->m_quality); + auto *quality_d = tracks_d->qualityData(); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, m_params.cuts_, quality_d); + kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d); - if (m_params.lateFishbone_) { + if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); } @@ -171,7 +170,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // remove duplicates (tracks that share a hit) kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - if (m_params.doStats_) { + if (params_.doStats_) { // counters (add flag???) kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); kernel_doStatsForTracks(tuples_d, quality_d, counters_); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index a8dac7992f4fa..96639e98939f9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -18,7 +18,7 @@ template <> void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); + auto *quality_d = tracks_d->qualityData(); // zero tuples cms::cuda::launchZero(tuples_d, cudaStream); @@ -36,10 +36,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto nthTot = 64; auto stride = 4; auto blockSize = nthTot / stride; - auto numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + auto numberOfBlocks = nDoubletBlocks(blockSize); auto rescale = numberOfBlocks / 65536; blockSize *= (rescale + 1); - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = nDoubletBlocks(blockSize); assert(numberOfBlocks < 65536); assert(blockSize > 0 && 0 == blockSize % 16); dim3 blks(1, numberOfBlocks, 1); @@ -53,15 +53,15 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_nCells_, device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); + params_.hardCurvCut_, + params_.ptmin_, + params_.CAThetaCutBarrel_, + params_.CAThetaCutForward_, + params_.dcaCutInnerTriplet_, + params_.dcaCutOuterTriplet_); cudaCheck(cudaGetLastError()); - if (nhits > 1 && m_params.earlyFishbone_) { + if (nhits > 1 && params_.earlyFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot / stride; @@ -74,7 +74,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * } blockSize = 64; - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; kernel_find_ntuplets<<>>(hh.view(), device_theCells_.get(), device_nCells_, @@ -82,10 +82,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * tuples_d, device_hitTuple_apc_, quality_d, - m_params.minHitsPerNtuplet_); + params_.minHitsPerNtuplet_); cudaCheck(cudaGetLastError()); - 
if (m_params.doStats_) + if (params_.doStats_) kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); cudaCheck(cudaGetLastError()); @@ -99,13 +99,13 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, quality_d); cudaCheck(cudaGetLastError()); blockSize = 128; - numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); @@ -113,7 +113,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * tuples_d, quality_d, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); - if (nhits > 1 && m_params.lateFishbone_) { + if (nhits > 1 && params_.lateFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot / stride; @@ -152,13 +152,12 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr assert(device_isOuterHitOfCell_.get()); cellStorage_ = cms::cuda::make_device_unique( - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks), + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks), stream); device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); - device_theCellTracksContainer_ = - (GPUCACell::CellTracks *)(cellStorage_.get() + - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * + sizeof(GPUCACell::CellNeighbors)); { int threadsPerBlock = 128; @@ -173,7 +172,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr cudaCheck(cudaGetLastError()); } - device_theCells_ = cms::cuda::make_device_unique(m_params.maxNumberOfDoublets_, stream); + device_theCells_ = cms::cuda::make_device_unique(params_.maxNumberOfDoublets_, stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -185,9 +184,9 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr // FIXME avoid magic numbers auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) + if (!params_.includeJumpingForwardDoublets_) nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { + if (params_.minHitsPerNtuplet_ > 3) { nActualPairs = 13; } @@ -204,11 +203,11 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); + params_.idealConditions_, + params_.doClusterCut_, + params_.doZ0Cut_, + params_.doPtCut_, + params_.maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -221,32 +220,32 @@ template <> void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const 
&hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); + auto *quality_d = tracks_d->qualityData(); auto blockSize = 64; // classify tracks based on kinematics - auto numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_classifyTracks<<>>(tuples_d, tracks_d, m_params.cuts_, quality_d); + auto numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_classifyTracks<<>>(tuples_d, tracks_d, params_.cuts_, quality_d); cudaCheck(cudaGetLastError()); - if (m_params.lateFishbone_) { + if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = nDoubletBlocks(blockSize); kernel_fishboneCleaner<<>>( device_theCells_.get(), device_nCells_, quality_d); cudaCheck(cudaGetLastError()); } // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, tracks_d); cudaCheck(cudaGetLastError()); - if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { + if (params_.minHitsPerNtuplet_ < 4 || params_.doStats_) { // fill hit->track "map" - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + numberOfBlocks = nQuadrupletBlocks(blockSize); kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); @@ -255,7 +254,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); } - if (m_params.minHitsPerNtuplet_ < 4) { + if (params_.minHitsPerNtuplet_ < 4) { // remove duplicates (tracks that share a hit) numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; kernel_tripletCleaner<<>>( @@ -263,9 +262,9 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA cudaCheck(cudaGetLastError()); } - if (m_params.doStats_) { + if (params_.doStats_) { auto nhits = hh.nHits(); - numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + numberOfBlocks = (std::max(nhits, params_.maxNumberOfDoublets_) + blockSize - 1) / blockSize; kernel_checkOverflows<<>>(tuples_d, device_tupleMultiplicity_.get(), device_hitToTuple_.get(), @@ -276,17 +275,17 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, - m_params.maxNumberOfDoublets_, + params_.maxNumberOfDoublets_, counters_); cudaCheck(cudaGetLastError()); } - if (m_params.doStats_) { + if (params_.doStats_) { // counters (add flag???) 
numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 7ab3ed010927e..d1a9f3d13a67f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -26,8 +26,8 @@ namespace cAHitNtupletGenerator { using HitsView = TrackingRecHit2DSOAView; using HitsOnGPU = TrackingRecHit2DSOAView; - using HitToTuple = CAConstants::HitToTuple; - using TupleMultiplicity = CAConstants::TupleMultiplicity; + using HitToTuple = caConstants::HitToTuple; + using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; @@ -39,14 +39,14 @@ namespace cAHitNtupletGenerator { float chi2MaxPt; // GeV float chi2Scale; - struct region { + struct Region { float maxTip; // cm float minPt; // GeV float maxZip; // cm }; - region triplet; - region quadruplet; + Region triplet; + Region quadruplet; }; // params @@ -152,14 +152,15 @@ class CAHitNtupletGeneratorKernels { using HitsOnGPU = TrackingRecHit2DSOAView; using HitsOnCPU = TrackingRecHit2DHeterogeneous; - using HitToTuple = CAConstants::HitToTuple; - using TupleMultiplicity = CAConstants::TupleMultiplicity; + using HitToTuple = caConstants::HitToTuple; + using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; - CAHitNtupletGeneratorKernels(Params const& params) : m_params(params) {} + CAHitNtupletGeneratorKernels(Params const& params) + : params_(params), paramsMaxDoubletes3Quarters_(3 * params.maxNumberOfDoublets_ / 4) {} ~CAHitNtupletGeneratorKernels() = default; TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } @@ -175,15 +176,17 @@ class CAHitNtupletGeneratorKernels { void cleanup(cudaStream_t cudaStream); static void printCounters(Counters const* counters); - Counters* counters_ = nullptr; + void setCounters(Counters* counters) { counters_ = counters; } private: + Counters* counters_ = nullptr; + // workspace unique_ptr cellStorage_; - unique_ptr device_theCellNeighbors_; - CAConstants::CellNeighbors* device_theCellNeighborsContainer_; - unique_ptr device_theCellTracks_; - CAConstants::CellTracks* device_theCellTracksContainer_; + unique_ptr device_theCellNeighbors_; + caConstants::CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + caConstants::CellTracks* device_theCellTracksContainer_; unique_ptr device_theCells_; unique_ptr device_isOuterHitOfCell_; @@ -198,7 +201,20 @@ class CAHitNtupletGeneratorKernels { unique_ptr device_storage_; // params - Params const& m_params; + Params const& params_; + /// Intermediate result avoiding repeated computations. 
+ const uint32_t paramsMaxDoubletes3Quarters_; + /// Compute the number of doublet blocks for block size + inline uint32_t nDoubletBlocks(uint32_t blockSize) { + // We want (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize, but first part is pre-computed. + return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize; + } + + /// Compute the number of quadruplet blocks for block size + inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { + // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 + return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + } }; using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index 1c34275d6bbe2..1d19aa43d6e1b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -12,8 +12,8 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - device_theCellNeighbors_ = Traits::template make_unique(stream); - device_theCellTracks_ = Traits::template make_unique(stream); + device_theCellNeighbors_ = Traits::template make_unique(stream); + device_theCellTracks_ = Traits::template make_unique(stream); device_hitToTuple_ = Traits::template make_unique(stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 3a935efbe2b4b..7c0cec51b8057 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -22,15 +22,15 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using HitsOnCPU = TrackingRecHit2DCUDA; -using HitToTuple = CAConstants::HitToTuple; -using TupleMultiplicity = CAConstants::TupleMultiplicity; +using HitToTuple = caConstants::HitToTuple; +using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, - CAConstants::TupleMultiplicity const *tupleMultiplicity, + caConstants::TupleMultiplicity const *tupleMultiplicity, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, GPUCACell const *__restrict__ cells, @@ -60,7 +60,7 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, apc->get().m, apc->get().n, nHits); - if (apc->get().m < CAConstants::maxNumberOfQuadruplets()) { + if (apc->get().m < caConstants::maxNumberOfQuadruplets()) { assert(foundNtuplets->size(apc->get().m) == 0); assert(foundNtuplets->size() == apc->get().n); } @@ -76,7 +76,7 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, #endif if (0 == first) { - if (apc->get().m >= CAConstants::maxNumberOfQuadruplets()) + if (apc->get().m >= caConstants::maxNumberOfQuadruplets) printf("Tuples overflow\n"); if (*nCells >= maxNumberOfDoublets) printf("Cells overflow\n"); @@ -89,14 +89,14 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, for (int idx 
= first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto const &thisCell = cells[idx]; if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; - printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theDoubletId < 0) + printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.isKilled()) atomicAdd(&c.nKilledCells, 1); - if (0 == thisCell.theUsed) + if (thisCell.unused()) atomicAdd(&c.nEmptyCells, 1); - if (0 == hitToTuple->size(thisCell.get_inner_hit_id()) && 0 == hitToTuple->size(thisCell.get_outer_hit_id())) + if (0 == hitToTuple->size(thisCell.inner_hit_id()) && 0 == hitToTuple->size(thisCell.outer_hit_id())) atomicAdd(&c.nZeroTrackCells, 1); } @@ -107,12 +107,12 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, } __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) { - constexpr auto bad = trackQuality::bad; + constexpr auto bad = pixelTrack::Quality::bad; auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId >= 0) + if (!thisCell.isKilled()) continue; for (auto it : thisCell.tracks()) @@ -125,7 +125,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, HitContainer *foundNtuplets, Quality *quality) { // constexpr auto bad = trackQuality::bad; - constexpr auto dup = trackQuality::dup; + constexpr auto dup = pixelTrack::Quality::dup; // constexpr auto loose = trackQuality::loose; assert(nCells); @@ -157,9 +157,9 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, HitContainer const *__restrict__ foundNtuplets, TkSoA *__restrict__ tracks) { - constexpr auto bad = trackQuality::bad; - constexpr auto dup = trackQuality::dup; - constexpr auto loose = trackQuality::loose; + constexpr auto bad = pixelTrack::Quality::bad; + constexpr auto dup = pixelTrack::Quality::dup; + constexpr auto loose = pixelTrack::Quality::loose; assert(nCells); @@ -178,7 +178,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, // return tracks->chi2(it); //chi2 }; - // find min socre + // find min score for (auto it : thisCell.tracks()) { if (tracks->quality(it) == loose && score(it) < mc) { mc = score(it); @@ -220,30 +220,22 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) { auto cellIndex = idx; auto &thisCell = cells[idx]; - //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1) - // continue; - auto innerHitId = thisCell.get_inner_hit_id(); + auto innerHitId = thisCell.inner_hit_id(); int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); auto vi = isOuterHitOfCell[innerHitId].data(); - constexpr uint32_t last_bpix1_detIndex = 96; - constexpr uint32_t last_barrel_detIndex = 1184; - auto ri = thisCell.get_inner_r(hh); - auto zi = thisCell.get_inner_z(hh); + auto ri = thisCell.inner_r(hh); + auto zi = thisCell.inner_z(hh); - auto ro = thisCell.get_outer_r(hh); - auto zo = thisCell.get_outer_z(hh); - auto isBarrel = 
thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; + auto ro = thisCell.outer_r(hh); + auto zo = thisCell.outer_z(hh); + auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex; for (int j = first; j < numberOfPossibleNeighbors; j += stride) { auto otherCell = __ldg(vi + j); auto &oc = cells[otherCell]; - // if (cells[otherCell].theDoubletId < 0 || - // cells[otherCell].theUsed>1 ) - // continue; - auto r1 = oc.get_inner_r(hh); - auto z1 = oc.get_inner_z(hh); - // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; + auto r1 = oc.inner_r(hh); + auto z1 = oc.inner_z(hh); bool aligned = GPUCACell::areAlignedRZ( r1, z1, @@ -253,14 +245,14 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, zo, ptmin, isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - if (aligned && - thisCell.dcaCut(hh, - oc, - oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, - hardCurvCut)) { // FIXME tune cuts + if (aligned && thisCell.dcaCut(hh, + oc, + oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet + : dcaCutOuterTriplet, + hardCurvCut)) { // FIXME tune cuts oc.addOuterNeighbor(cellIndex, *cellNeighbors); - thisCell.theUsed |= 1; - oc.theUsed |= 1; + thisCell.setUsedBit(1); + oc.setUsedBit(1); } } // loop on inner cells } // loop on outer cells @@ -280,10 +272,10 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId < 0) + if (thisCell.isKilled()) continue; // cut by earlyFishbone - auto pid = thisCell.theLayerPairId; + auto pid = thisCell.layerPairId(); auto doit = minHitsPerNtuplet > 3 ? 
pid < 3 : pid < 8 || pid > 12; if (doit) { GPUCACell::TmpTuple stack; @@ -298,26 +290,25 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, __global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, GPUCACell *__restrict__ cells, uint32_t const *nCells) { - // auto const &hh = *hhp; auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto &thisCell = cells[idx]; if (!thisCell.tracks().empty()) - thisCell.theUsed |= 2; + thisCell.setUsedBit(2); } } __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, - CAConstants::TupleMultiplicity *tupleMultiplicity) { + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); if (nhits < 3) continue; - if (quality[it] == trackQuality::dup) + if (quality[it] == pixelTrack::Quality::dup) continue; - assert(quality[it] == trackQuality::bad); + assert(quality[it] == pixelTrack::Quality::bad); if (nhits > 5) printf("wrong mult %d %d\n", it, nhits); assert(nhits < 8); @@ -327,15 +318,15 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, - CAConstants::TupleMultiplicity *tupleMultiplicity) { + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); if (nhits < 3) continue; - if (quality[it] == trackQuality::dup) + if (quality[it] == pixelTrack::Quality::dup) continue; - assert(quality[it] == trackQuality::bad); + assert(quality[it] == pixelTrack::Quality::bad); if (nhits > 5) printf("wrong mult %d %d\n", it, nhits); assert(nhits < 8); @@ -354,10 +345,10 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, break; // guard // if duplicate: not even fit - if (quality[it] == trackQuality::dup) + if (quality[it] == pixelTrack::Quality::dup) continue; - assert(quality[it] == trackQuality::bad); + assert(quality[it] == pixelTrack::Quality::bad); // mark doublets as bad if (nhits < 3) @@ -407,7 +398,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, (std::abs(tracks->zip(it)) < region.maxZip); if (isOk) - quality[it] = trackQuality::loose; + quality[it] = pixelTrack::Quality::loose; } } @@ -418,7 +409,7 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) break; //guard - if (quality[idx] != trackQuality::loose) + if (quality[idx] != pixelTrack::Quality::loose) continue; atomicAdd(&(counters->nGoodTracks), 1); } @@ -431,7 +422,7 @@ __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) break; // guard - if (quality[idx] != trackQuality::loose) + if (quality[idx] != pixelTrack::Quality::loose) continue; for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) hitToTuple->countDirect(*h); @@ -445,7 +436,7 @@ 
__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) break; // guard - if (quality[idx] != trackQuality::loose) + if (quality[idx] != pixelTrack::Quality::loose) continue; for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) hitToTuple->fillDirect(*h, idx); @@ -487,17 +478,14 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ TkSoA const *__restrict__ ptracks, Quality *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - constexpr auto bad = trackQuality::bad; - constexpr auto dup = trackQuality::dup; + constexpr auto bad = pixelTrack::Quality::bad; + constexpr auto dup = pixelTrack::Quality::dup; // constexpr auto loose = trackQuality::loose; auto &hitToTuple = *phitToTuple; auto const &foundNtuplets = *ptuples; auto const &tracks = *ptracks; - // auto const & hh = *hhp; - // auto l1end = hh.hitsLayerStart_d[1]; - int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { if (hitToTuple.size(idx) < 2) @@ -521,7 +509,6 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ if (maxNh > 3) continue; - // if (idx>=l1end) continue; // only for layer 1 // for triplets choose best tip! for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 464744594e9a6..c2c7c2b869752 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -140,7 +140,7 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc.add("idealConditions", true); desc.add("fillStatistics", false); desc.add("minHitsPerNtuplet", 4); - desc.add("maxNumberOfDoublets", CAConstants::maxNumberOfDoublets()); + desc.add("maxNumberOfDoublets", caConstants::maxNumberOfDoublets); desc.add("includeJumpingForwardDoublets", false); desc.add("fit5as4", true); desc.add("doClusterCut", true); @@ -176,7 +176,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH auto* soa = tracks.get(); CAHitNtupletGeneratorKernelsGPU kernels(m_params); - kernels.counters_ = m_counters; + kernels.setCounters(m_counters); kernels.allocateOnGPU(stream); @@ -187,9 +187,9 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH HelixFitOnGPU fitter(bfield, m_params.fit5as4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } kernels.classifyTuples(hits_d, soa, stream); @@ -203,7 +203,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC assert(soa); CAHitNtupletGeneratorKernelsCPU kernels(m_params); - 
kernels.counters_ = m_counters; + kernels.setCounters(m_counters); kernels.allocateOnGPU(nullptr); kernels.buildDoublets(hits_d, nullptr); @@ -218,9 +218,9 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } kernels.classifyTuples(hits_d, soa, nullptr); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index afb591744bf59..564a870f54796 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -13,7 +13,6 @@ #include "CAHitNtupletGeneratorKernels.h" #include "HelixFitOnGPU.h" -// FIXME (split header???) #include "GPUCACell.h" namespace edm { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 2a74d6a064e73..0fd514e26d223 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -19,14 +19,14 @@ class GPUCACell { public: - using ptrAsInt = unsigned long long; + using PtrAsInt = unsigned long long; - static constexpr int maxCellsPerHit = CAConstants::maxCellsPerHit(); - using OuterHitOfCell = CAConstants::OuterHitOfCell; - using CellNeighbors = CAConstants::CellNeighbors; - using CellTracks = CAConstants::CellTracks; - using CellNeighborsVector = CAConstants::CellNeighborsVector; - using CellTracksVector = CAConstants::CellTracksVector; + static constexpr auto maxCellsPerHit = caConstants::maxCellsPerHit; + using OuterHitOfCell = caConstants::OuterHitOfCell; + using CellNeighbors = caConstants::CellNeighbors; + using CellTracks = caConstants::CellTracks; + using CellNeighborsVector = caConstants::CellNeighborsVector; + using CellTracksVector = caConstants::CellTracksVector; using Hits = TrackingRecHit2DSOAView; using hindex_type = Hits::hindex_type; @@ -34,8 +34,8 @@ class GPUCACell { using TmpTuple = cms::cuda::VecArray; using HitContainer = pixelTrack::HitContainer; - using Quality = trackQuality::Quality; - static constexpr auto bad = trackQuality::bad; + using Quality = pixelTrack::Quality; + static constexpr auto bad = pixelTrack::Quality::bad; GPUCACell() = default; @@ -48,9 +48,9 @@ class GPUCACell { hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; - theDoubletId = doubletId; - theLayerPairId = layerPairId; - theUsed = 0; + theDoubletId_ = doubletId; + theLayerPairId_ = layerPairId; + theUsed_ = 0; // optimization that depends on access pattern theInnerZ = hh.zGlobal(innerHitId); @@ -66,14 +66,14 @@ class GPUCACell { __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { // use smart cache if (outerNeighbors().empty()) { - auto i = cellNeighbors.extend(); // maybe waisted.... + auto i = cellNeighbors.extend(); // maybe wasted.... 
if (i > 0) { cellNeighbors[i].reset(); #ifdef __CUDACC__ - auto zero = (ptrAsInt)(&cellNeighbors[0]); - atomicCAS((ptrAsInt*)(&theOuterNeighbors), + auto zero = (PtrAsInt)(&cellNeighbors[0]); + atomicCAS((PtrAsInt*)(&theOuterNeighbors), zero, - (ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... + (PtrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... #else theOuterNeighbors = &cellNeighbors[i]; #endif @@ -86,12 +86,12 @@ class GPUCACell { __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { if (tracks().empty()) { - auto i = cellTracks.extend(); // maybe waisted.... + auto i = cellTracks.extend(); // maybe wasted.... if (i > 0) { cellTracks[i].reset(); #ifdef __CUDACC__ - auto zero = (ptrAsInt)(&cellTracks[0]); - atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... + auto zero = (PtrAsInt)(&cellTracks[0]); + atomicCAS((PtrAsInt*)(&theTracks), zero, (PtrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... #else theTracks = &cellTracks[i]; #endif @@ -106,30 +106,30 @@ class GPUCACell { __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } - __device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } - __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } - __device__ __forceinline__ float get_outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_z(Hits const& hh) const { return theInnerZ; } + __device__ __forceinline__ float inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } + __device__ __forceinline__ float outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } + __device__ __forceinline__ float outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_z(Hits const& hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float get_outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_r(Hits const& hh) const { return theInnerR; } + __device__ __forceinline__ float outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_r(Hits const& hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float get_outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } + __device__ __forceinline__ float outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } - __device__ __forceinline__ auto get_inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } - __device__ __forceinline__ auto get_outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } + __device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } + __device__ __forceinline__ auto outer_iphi(Hits const& hh) const { 
return hh.iphi(theOuterHitId); } - __device__ __forceinline__ float get_inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } - __device__ __forceinline__ float get_outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } + __device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } + __device__ __forceinline__ float outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } - constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } - constexpr unsigned int get_outer_hit_id() const { return theOuterHitId; } + constexpr unsigned int inner_hit_id() const { return theInnerHitId; } + constexpr unsigned int outer_hit_id() const { return theOuterHitId; } __device__ void print_cell() const { printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d \n", - theDoubletId, - theLayerPairId, + theDoubletId_, + theLayerPairId_, theInnerHitId, theOuterHitId); } @@ -138,24 +138,22 @@ class GPUCACell { GPUCACell const& otherCell, const float ptmin, const float hardCurvCut, - const float CAThetaCutBarrel, - const float CAThetaCutForward, + const float caThetaCutBarrel, + const float caThetaCutForward, const float dcaCutInnerTriplet, const float dcaCutOuterTriplet) const { // detIndex of the layerStart for the Phase1 Pixel Detector: // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] - constexpr uint32_t last_bpix1_detIndex = 96; - constexpr uint32_t last_barrel_detIndex = 1184; - auto ri = get_inner_r(hh); - auto zi = get_inner_z(hh); + auto ri = inner_r(hh); + auto zi = inner_z(hh); - auto ro = get_outer_r(hh); - auto zo = get_outer_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); - auto r1 = otherCell.get_inner_r(hh); - auto z1 = otherCell.get_inner_z(hh); - auto isBarrel = otherCell.get_outer_detIndex(hh) < last_barrel_detIndex; + auto r1 = otherCell.inner_r(hh); + auto z1 = otherCell.inner_z(hh); + auto isBarrel = otherCell.outer_detIndex(hh) < caConstants::last_barrel_detIndex; bool aligned = areAlignedRZ(r1, z1, ri, @@ -163,12 +161,12 @@ class GPUCACell { ro, zo, ptmin, - isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - return (aligned && - dcaCut(hh, - otherCell, - otherCell.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, - hardCurvCut)); // FIXME tune cuts + isBarrel ? caThetaCutBarrel : caThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + return (aligned && dcaCut(hh, + otherCell, + otherCell.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? 
dcaCutInnerTriplet + : dcaCutOuterTriplet, + hardCurvCut)); // FIXME tune cuts } __device__ __forceinline__ static bool areAlignedRZ( @@ -187,14 +185,14 @@ class GPUCACell { GPUCACell const& otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { - auto x1 = otherCell.get_inner_x(hh); - auto y1 = otherCell.get_inner_y(hh); + auto x1 = otherCell.inner_x(hh); + auto y1 = otherCell.inner_y(hh); - auto x2 = get_inner_x(hh); - auto y2 = get_inner_y(hh); + auto x2 = inner_x(hh); + auto y2 = inner_y(hh); - auto x3 = get_outer_x(hh); - auto y3 = get_outer_y(hh); + auto x3 = outer_x(hh); + auto y3 = outer_y(hh); CircleEq eq(x1, y1, x2, y2, x3, y3); @@ -221,52 +219,48 @@ class GPUCACell { } __device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const { - constexpr uint32_t max_ladder_bpx0 = 12; - constexpr uint32_t first_ladder_bpx0 = 0; - constexpr float module_length = 6.7f; - constexpr float module_tolerance = 0.4f; // projection to cylinder is inaccurate on BPIX1 - int p = innerCell.get_inner_iphi(hh); + using caConstants::first_ladder_bpx0; + using caConstants::max_ladder_bpx0; + using caConstants::module_length_bpx0; + using caConstants::module_tolerance_bpx0; + int p = innerCell.inner_iphi(hh); if (p < 0) p += std::numeric_limits::max(); p = (max_ladder_bpx0 * p) / std::numeric_limits::max(); p %= max_ladder_bpx0; auto il = first_ladder_bpx0 + p; auto r0 = hh.averageGeometry().ladderR[il]; - auto ri = innerCell.get_inner_r(hh); - auto zi = innerCell.get_inner_z(hh); - auto ro = get_outer_r(hh); - auto zo = get_outer_z(hh); + auto ri = innerCell.inner_r(hh); + auto zi = innerCell.inner_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri); auto z_in_ladder = std::abs(z0 - hh.averageGeometry().ladderZ[il]); - auto z_in_module = z_in_ladder - module_length * int(z_in_ladder / module_length); - auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); + auto z_in_module = z_in_ladder - module_length_bpx0 * int(z_in_ladder / module_length_bpx0); + auto gap = z_in_module < module_tolerance_bpx0 || z_in_module > (module_length_bpx0 - module_tolerance_bpx0); return gap; } __device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const { - constexpr uint32_t max_ladder_bpx4 = 64; - constexpr uint32_t first_ladder_bpx4 = 84; - // constexpr float radius_even_ladder = 15.815f; - // constexpr float radius_odd_ladder = 16.146f; - constexpr float module_length = 6.7f; - constexpr float module_tolerance = 0.2f; - // constexpr float barrel_z_length = 26.f; - // constexpr float forward_z_begin = 32.f; - int p = get_outer_iphi(hh); + using caConstants::first_ladder_bpx4; + using caConstants::max_ladder_bpx4; + using caConstants::module_length_bpx4; + using caConstants::module_tolerance_bpx4; + int p = outer_iphi(hh); if (p < 0) p += std::numeric_limits::max(); p = (max_ladder_bpx4 * p) / std::numeric_limits::max(); p %= max_ladder_bpx4; auto il = first_ladder_bpx4 + p; auto r4 = hh.averageGeometry().ladderR[il]; - auto ri = innerCell.get_inner_r(hh); - auto zi = innerCell.get_inner_z(hh); - auto ro = get_outer_r(hh); - auto zo = get_outer_z(hh); + auto ri = innerCell.inner_r(hh); + auto zi = innerCell.inner_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri); auto z_in_ladder = std::abs(z4 - hh.averageGeometry().ladderZ[il]); - auto z_in_module = z_in_ladder - module_length * int(z_in_ladder / 
module_length); - auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); + auto z_in_module = z_in_ladder - module_length_bpx4 * int(z_in_ladder / module_length_bpx4); + auto gap = z_in_module < module_tolerance_bpx4 || z_in_module > (module_length_bpx4 - module_tolerance_bpx4); auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0]; auto holeN = z4 < hh.averageGeometry().ladderMinZ[il] && z4 > hh.averageGeometry().endCapZ[1]; return gap || holeP || holeN; @@ -289,12 +283,12 @@ class GPUCACell { // the ntuplets is then saved if the number of hits it contains is greater // than a threshold - tmpNtuplet.push_back_unsafe(theDoubletId); + tmpNtuplet.push_back_unsafe(theDoubletId_); assert(tmpNtuplet.size() <= 4); bool last = true; for (unsigned int otherCell : outerNeighbors()) { - if (cells[otherCell].theDoubletId < 0) + if (cells[otherCell].theDoubletId_ < 0) continue; // killed by earlyFishbone last = false; cells[otherCell].find_ntuplets( @@ -327,16 +321,23 @@ class GPUCACell { assert(tmpNtuplet.size() < 4); } + // Cell status management + __device__ __forceinline__ void kill() { theDoubletId_ = -1; } + __device__ __forceinline__ bool isKilled() const { return theDoubletId_ < 0; } + + __device__ __forceinline__ int16_t layerPairId() const { return theLayerPairId_; } + + __device__ __forceinline__ bool unused() const { return !theUsed_; } + __device__ __forceinline__ void setUsedBit(uint16_t bit) { theUsed_ |= bit; } + private: CellNeighbors* theOuterNeighbors; CellTracks* theTracks; -public: - int32_t theDoubletId; - int16_t theLayerPairId; - uint16_t theUsed; // tbd + int32_t theDoubletId_; + int16_t theLayerPairId_; + uint16_t theUsed_; // tbd -private: float theInnerZ; float theInnerR; hindex_type theInnerHitId; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index becbd0a1a8540..880bdb47dfb5c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -4,13 +4,13 @@ void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *helix_fit_results) { - tuples_d = tuples; - tupleMultiplicity_d = tupleMultiplicity; - outputSoa_d = helix_fit_results; + tuples_ = tuples; + tupleMultiplicity_ = tupleMultiplicity; + outputSoa_ = helix_fit_results; - assert(tuples_d); - assert(tupleMultiplicity_d); - assert(outputSoa_d); + assert(tuples_); + assert(tupleMultiplicity_); + assert(outputSoa_); } void HelixFitOnGPU::deallocateOnGPU() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 42f8f0e720b43..938994840f8c0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h -#define RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h +#ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h +#define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" @@ -7,29 +7,29 @@ #include "CAConstants.h" -namespace Rfit { +namespace riemannFit { // in case of memory issue can be made smaller - constexpr uint32_t 
maxNumberOfConcurrentFits() { return CAConstants::maxNumberOfTuples(); } - constexpr uint32_t stride() { return maxNumberOfConcurrentFits(); } + constexpr uint32_t maxNumberOfConcurrentFits = caConstants::maxNumberOfTuples; + constexpr uint32_t stride = maxNumberOfConcurrentFits; using Matrix3x4d = Eigen::Matrix; - using Map3x4d = Eigen::Map >; + using Map3x4d = Eigen::Map >; using Matrix6x4f = Eigen::Matrix; - using Map6x4f = Eigen::Map >; + using Map6x4f = Eigen::Map >; // hits template using Matrix3xNd = Eigen::Matrix; template - using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()> >; + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride, stride> >; // errors template using Matrix6xNf = Eigen::Matrix; template - using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()> >; + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride, stride> >; // fast fit - using Map4d = Eigen::Map >; + using Map4d = Eigen::Map >; -} // namespace Rfit +} // namespace riemannFit class HelixFitOnGPU { public: @@ -38,7 +38,7 @@ class HelixFitOnGPU { using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; - using TupleMultiplicity = CAConstants::TupleMultiplicity; + using TupleMultiplicity = caConstants::TupleMultiplicity; explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} ~HelixFitOnGPU() { deallocateOnGPU(); } @@ -54,15 +54,15 @@ class HelixFitOnGPU { void deallocateOnGPU(); private: - static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); + static constexpr uint32_t maxNumberOfConcurrentFits_ = riemannFit::maxNumberOfConcurrentFits; // fowarded - Tuples const *tuples_d = nullptr; - TupleMultiplicity const *tupleMultiplicity_d = nullptr; - OutputSoA *outputSoa_d; + Tuples const *tuples_ = nullptr; + TupleMultiplicity const *tupleMultiplicity_ = nullptr; + OutputSoA *outputSoa_; float bField_; const bool fit5as4_; }; -#endif // RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h deleted file mode 100644 index 3279587d2486e..0000000000000 --- a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h +++ /dev/null @@ -1,84 +0,0 @@ -//FIXME move it to a better place... - -#ifndef RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h -#define RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h - -#include -#include - -#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" -#include "DataFormats/SiStripCluster/interface/SiStripCluster.h" -#include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" -#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" -#include "FWCore/MessageLogger/interface/MessageLogger.h" - -// store T for each cluster -template -class RecHitsMap { -public: - explicit RecHitsMap(T const& d = T()) : dummy(d) {} - - void clear() { m_map.clear(); } - - void error(const GeomDetUnit& gd) const { edm::LogError("RecHitMap") << "hit not found in det " << gd.index(); } - void error(uint32_t ind) const { edm::LogError("RecHitMap") << "hit not found in det " << ind; } - - // does not work for matched hits... 
(easy to extend) - void add(TrackingRecHit const& hit, T const& v) { - auto const& thit = static_cast(hit); - auto const& clus = thit.firstClusterRef(); - - if (clus.isPixel()) - add(clus.pixelCluster(), *thit.detUnit(), v); - else - add(clus.stripCluster(), *thit.detUnit(), v); - } - - template - void add(const Cluster& cluster, const GeomDetUnit& gd, T const& v) { - m_map[encode(cluster, gd)] = v; - } - - template - T const& get(const Cluster& cluster, const GeomDetUnit& gd) const { - auto p = m_map.find(encode(cluster, gd)); - if (p != m_map.end()) { - return (*p).second; - } - error(gd); - return dummy; - } - - T const& get(uint32_t ind, uint16_t mr, uint16_t mc) const { - auto p = m_map.find(encode(ind, mr, mc)); - if (p != m_map.end()) { - return (*p).second; - } - error(ind); - return dummy; - } - - static uint64_t encode(uint32_t ind, uint16_t mr, uint16_t mc) { - uint64_t u1 = ind; - uint64_t u2 = mr; - uint64_t u3 = mc; - return (u1 << 32) | (u2 << 16) | u3; - } - - static uint64_t encode(const SiPixelCluster& cluster, const GeomDetUnit& det) { - uint64_t u1 = det.index(); - uint64_t u2 = cluster.minPixelRow(); - uint64_t u3 = cluster.minPixelCol(); - return (u1 << 32) | (u2 << 16) | u3; - } - static uint64_t encode(const SiStripCluster& cluster, const GeomDetUnit& det) { - uint64_t u1 = det.index(); - uint64_t u2 = cluster.firstStrip(); - return (u1 << 32) | u2; - } - - std::unordered_map m_map; - T dummy; -}; - -#endif // RecoPixelVertexing_PixelTriplets_plugins_RecHitsMap_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index 3476362864a79..491dd0df2004f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,110 +1,113 @@ #include "RiemannFitOnGPU.h" void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { - assert(tuples_d); + assert(tuples_); // Fit internals - auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - auto circle_fit_resultsGPU_holder = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + auto hitsGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float)); + auto fast_fit_resultsGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); + auto circle_fit_resultsGPU_holder = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit)); + riemannFit::CircleFit *circle_fit_resultsGPU = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get()); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernelFastFit<3>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<3>( + tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - 
kernelCircleFit<3>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<3>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernelLineFit<3>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<3>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); // quads - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<4>( + tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernelCircleFit<4>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernelLineFit<4>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); if (fit5as4_) { // penta - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<4>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernelCircleFit<4>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernelLineFit<4>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); } else { // penta all 5 - kernelFastFit<5>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<5>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernelCircleFit<5>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<5>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernelLineFit<5>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<5>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); } } } diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 1077bb7736667..90af2ac13730b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -5,126 +5,126 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t stream) { - assert(tuples_d); + assert(tuples_); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto hitsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto hitsGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); auto circle_fit_resultsGPU_holder = - cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit), stream); + riemannFit::CircleFit *circle_fit_resultsGPU_ = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get()); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernelFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<3><<>>( + tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<3><<>>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<3><<>>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); // quads - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<4><<>>( + tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + 
kernel_CircleFit<4><<>>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4><<>>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // penta - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<4><<>>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernelFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<5><<>>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<5><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<5><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index a16374278233a..5b661bc3be028 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -19,14 +19,14 @@ using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; template -__global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, - CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - uint32_t nHits, - HitsOnGPU const *__restrict__ hhp, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t offset) { +__global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, + 
caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t offset) { constexpr uint32_t hitsInFit = N; assert(hitsInFit <= nHits); @@ -43,7 +43,7 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); #endif - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) @@ -55,9 +55,9 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, assert(foundNtuplets->size(tkid) == nHits); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); // Prepare data structure auto const *hitId = foundNtuplets->begin(tkid); @@ -73,7 +73,7 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } - Rfit::Fast_fit(hits, fast_fit); + riemannFit::fastFit(hits, fast_fit); // no NaN here.... assert(fast_fit(0) == fast_fit(0)); @@ -84,14 +84,14 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, } template -__global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - uint32_t nHits, - double B, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit_input, - Rfit::circle_fit *circle_fit, - uint32_t offset) { +__global__ void kernel_CircleFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double bField, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + riemannFit::CircleFit *circle_fit, + uint32_t offset) { assert(circle_fit); assert(N <= nHits); @@ -99,22 +99,22 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) break; - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit_input + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); - Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + riemannFit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); - Rfit::loadCovariance2D(hits_ge, hits_cov); + riemannFit::Matrix2Nd hits_cov = riemannFit::Matrix2Nd::Zero(); + 
riemannFit::loadCovariance2D(hits_ge, hits_cov); - circle_fit[local_idx] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); + circle_fit[local_idx] = riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, bField, true); #ifdef RIEMANN_DEBUG // auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); @@ -125,15 +125,15 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict } template -__global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - uint32_t nHits, - double B, - OutputSoA *results, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit_input, - Rfit::circle_fit *__restrict__ circle_fit, - uint32_t offset) { +__global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double bField, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + riemannFit::CircleFit *__restrict__ circle_fit, + uint32_t offset) { assert(results); assert(circle_fit); assert(N <= nHits); @@ -142,7 +142,7 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ // look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) @@ -151,17 +151,17 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ // get it for the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit_input + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); - auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_idx], fast_fit, B, true); + auto const &line_fit = riemannFit::lineFit(hits, hits_ge, circle_fit[local_idx], fast_fit, bField, true); - Rfit::fromCircleToPerigee(circle_fit[local_idx]); + riemannFit::fromCircleToPerigee(circle_fit[local_idx]); results->stateAtBS.copyFromCircle( - circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(B), tkid); - results->pt(tkid) = B / std::abs(circle_fit[local_idx].par(2)); + circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid); + results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2)); results->eta(tkid) = asinhf(line_fit.par(0)); results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 336dbbc98521f..09cd5c18e65ae 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -1,5 +1,5 @@ -#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h -#define RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h +#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h +#define 
RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h #include #include @@ -27,7 +27,6 @@ namespace gpuPixelDoublets { constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; auto const& hh = *hhp; - // auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id); }; // x run faster... auto firstY = threadIdx.y + blockIdx.y * blockDim.y; @@ -39,28 +38,27 @@ namespace gpuPixelDoublets { for (int idy = firstY, nt = nHits; idy < nt; idy += gridDim.y * blockDim.y) { auto const& vc = isOuterHitOfCell[idy]; - auto s = vc.size(); - if (s < 2) + auto size = vc.size(); + if (size < 2) continue; // if alligned kill one of the two. // in principle one could try to relax the cut (only in r-z?) for jumping-doublets auto const& c0 = cells[vc[0]]; - auto xo = c0.get_outer_x(hh); - auto yo = c0.get_outer_y(hh); - auto zo = c0.get_outer_z(hh); + auto xo = c0.outer_x(hh); + auto yo = c0.outer_y(hh); + auto zo = c0.outer_z(hh); auto sg = 0; - for (int32_t ic = 0; ic < s; ++ic) { + for (int32_t ic = 0; ic < size; ++ic) { auto& ci = cells[vc[ic]]; - if (0 == ci.theUsed) + if (ci.unused()) continue; // for triplets equivalent to next if (checkTrack && ci.tracks().empty()) continue; cc[sg] = vc[ic]; - d[sg] = ci.get_inner_detIndex(hh); - // l[sg] = layer(d[sg]); - x[sg] = ci.get_inner_x(hh) - xo; - y[sg] = ci.get_inner_y(hh) - yo; - z[sg] = ci.get_inner_z(hh) - zo; + d[sg] = ci.inner_detIndex(hh); + x[sg] = ci.inner_x(hh) - xo; + y[sg] = ci.inner_y(hh) - yo; + z[sg] = ci.inner_z(hh) - zo; n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg]; ++sg; } @@ -78,10 +76,10 @@ namespace gpuPixelDoublets { if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) { // alligned: kill farthest (prefer consecutive layers) if (n[ic] > n[jc]) { - ci.theDoubletId = -1; + ci.kill(); break; } else { - cj.theDoubletId = -1; + cj.kill(); } } } //cj @@ -90,4 +88,4 @@ namespace gpuPixelDoublets { } } // namespace gpuPixelDoublets -#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 5b0d3e8833a52..6de3f1a51acaa 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -1,5 +1,5 @@ -#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h -#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h +#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h +#define RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h #include "RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h" @@ -8,7 +8,7 @@ namespace gpuPixelDoublets { constexpr int nPairs = 13 + 2 + 4; - static_assert(nPairs <= CAConstants::maxNumberOfLayerPairs()); + static_assert(nPairs <= caConstants::maxNumberOfLayerPairs); // start constants // clang-format off @@ -58,10 +58,10 @@ namespace gpuPixelDoublets { // end constants // clang-format on - using CellNeighbors = CAConstants::CellNeighbors; - using CellTracks = CAConstants::CellTracks; - using CellNeighborsVector = CAConstants::CellNeighborsVector; - using CellTracksVector = CAConstants::CellTracksVector; + using CellNeighbors = caConstants::CellNeighbors; + using CellTracks = caConstants::CellTracks; + using CellNeighborsVector = caConstants::CellNeighborsVector; + using CellTracksVector = caConstants::CellTracksVector; __global__ void 
initDoublets(GPUCACell::OuterHitOfCell* isOuterHitOfCell, int nHits, @@ -75,8 +75,8 @@ namespace gpuPixelDoublets { isOuterHitOfCell[i].reset(); if (0 == first) { - cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer); - cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer); + cellNeighbors->construct(caConstants::maxNumOfActiveDoublets, cellNeighborsContainer); + cellTracks->construct(caConstants::maxNumOfActiveDoublets, cellTracksContainer); auto i = cellNeighbors->extend(); assert(0 == i); (*cellNeighbors)[0].reset(); @@ -127,4 +127,4 @@ namespace gpuPixelDoublets { } // namespace gpuPixelDoublets -#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 5c0d5a252b684..a12dee0785b36 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -1,5 +1,5 @@ -#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoubletsAlgos_h -#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoubletsAlgos_h +#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h +#define RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h #include #include @@ -17,10 +17,10 @@ namespace gpuPixelDoublets { - using CellNeighbors = CAConstants::CellNeighbors; - using CellTracks = CAConstants::CellTracks; - using CellNeighborsVector = CAConstants::CellNeighborsVector; - using CellTracksVector = CAConstants::CellTracksVector; + using CellNeighbors = caConstants::CellNeighbors; + using CellTracks = caConstants::CellTracks; + using CellNeighborsVector = caConstants::CellNeighborsVector; + using CellTracksVector = caConstants::CellTracksVector; __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs, uint32_t nPairs, @@ -61,7 +61,7 @@ namespace gpuPixelDoublets { // nPairsMax to be optimized later (originally was 64). // If it should be much bigger, consider using a block-wide parallel prefix scan, // e.g. 
see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html
-    const int nPairsMax = CAConstants::maxNumberOfLayerPairs();
+    const int nPairsMax = caConstants::maxNumberOfLayerPairs;
     assert(nPairs <= nPairsMax);
     __shared__ uint32_t innerLayerCumulativeSize[nPairsMax];
     __shared__ uint32_t ntot;
@@ -142,8 +142,8 @@ namespace gpuPixelDoublets {
       // all cuts: true if fails
       constexpr float z0cut = 12.f;      // cm
       constexpr float hardPtCut = 0.5f;  // GeV
-      constexpr float minRadius =
-          hardPtCut * 87.78f;  // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field)
+      // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field)
+      constexpr float minRadius = hardPtCut * 87.78f;
       constexpr float minRadius2T4 = 4.f * minRadius * minRadius;
       auto ptcut = [&](int j, int16_t idphi) {
         auto r2t4 = minRadius2T4;
@@ -178,7 +178,6 @@ namespace gpuPixelDoublets {
         auto kl = PhiBinner::bin(int16_t(mep - iphicut));
         auto kh = PhiBinner::bin(int16_t(mep + iphicut));
         auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); };
-        // bool piWrap = std::abs(kh-kl) > PhiBinner::nbins()/2;
 #ifdef GPU_DEBUG
         int tot = 0;
@@ -241,4 +240,4 @@ namespace gpuPixelDoublets {
 }  // namespace gpuPixelDoublets
-#endif  // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h
diff --git a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp
index 5c57eb7005691..5cf2e6526b860 100644
--- a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp
+++ b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp
@@ -9,7 +9,7 @@ void print() {
 }
 int main() {
-  using namespace CAConstants;
+  using namespace caConstants;
   print();
   print();
diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
index 5c5e5fecf41cf..0e5823fc46c46 100644
--- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
+++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
@@ -43,15 +43,22 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> {
 private:
   void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
-  edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
-  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
-
+  // Event data tokens
+  const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
+  const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+  // Event setup tokens
+  const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> idealMagneticFieldToken_;
+  const edm::ESGetToken<TrackerGeometry, TrackerDigiGeometryRecord> trackerDigiGeometryToken_;
+  const edm::ESGetToken<Propagator, TrackingComponentsRecord> trackerPropagatorToken_;
   int32_t minNumberOfHits_;
 };
 SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig)
     : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
       tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("src"))),
+      idealMagneticFieldToken_(esConsumes()),
+      trackerDigiGeometryToken_(esConsumes()),
+      trackerPropagatorToken_(esConsumes(edm::ESInputTag("PropagatorWithMaterial"))),
       minNumberOfHits_(iConfig.getParameter<int>("minNumberOfHits")) {
@@ -71,20 +78,13 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co
   // std::cout << "Converting gpu helix to trajectory seed" << std::endl;
   auto result = std::make_unique<TrajectorySeedCollection>();
-  edm::ESHandle<MagneticField> fieldESH;
-  iSetup.get<IdealMagneticFieldRecord>().get(fieldESH);
-
-  edm::ESHandle<TrackerGeometry> tracker;
-  iSetup.get<TrackerDigiGeometryRecord>().get(tracker);
+  auto const& fieldESH = iSetup.getHandle(idealMagneticFieldToken_);
+  auto const& tracker = iSetup.getHandle(trackerDigiGeometryToken_);
   auto const& dus = tracker->detUnits();
-  edm::ESHandle<Propagator> propagatorHandle;
-  iSetup.get<TrackingComponentsRecord>().get("PropagatorWithMaterial", propagatorHandle);
+  auto const& propagatorHandle = iSetup.getHandle(trackerPropagatorToken_);
   const Propagator* propagator = &(*propagatorHandle);
-  edm::ESHandle<TrackerTopology> httopo;
-  iSetup.get<TrackerTopologyRcd>().get(httopo);
-
   const auto& bsh = iEvent.get(tBeamSpot_);
   // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl;
   GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
@@ -103,7 +103,7 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co
       break;  // this is a guard: maybe we need to move to nTracks...
     auto q = quality[it];
-    if (q != trackQuality::loose)
+    if (q != pixelTrack::Quality::loose)
      continue;  // FIXME
     if (nHits < minNumberOfHits_)
       continue;
@@ -122,10 +122,10 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co
     float phi = tsoa.phi(it);
-    Rfit::Vector5d ipar, opar;
-    Rfit::Matrix5d icov, ocov;
+    riemannFit::Vector5d ipar, opar;
+    riemannFit::Matrix5d icov, ocov;
     fit.copyToDense(ipar, icov, it);
-    Rfit::transformToPerigeePlane(ipar, icov, opar, ocov);
+    riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
     LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
     AlgebraicSymMatrix55 m;

From 8d19af69c012d1411a83619fe373da626461b782 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Wed, 24 Mar 2021 16:29:47 +0100
Subject: [PATCH 100/102] Further clean up the CA ntuplet generator (cms-patatrack#610)

---
 .../plugins/CAHitNtupletGeneratorKernels.cc  | 33 +++++++++++--------
 .../plugins/CAHitNtupletGeneratorKernels.cu  | 11 ++++---
 .../PixelTriplets/plugins/gpuPixelDoublets.h |  6 ++--
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
index c4b8a5a54847f..041254e78aa36 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -18,15 +18,18 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
     std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
 #endif
-  // in principle we can use "nhits" to heuristically dimension the workspace...
-  // overkill to use template here (std::make_unique would suffice)
-  // device_isOuterHitOfCell_ = Traits:: template make_unique<GPUCACell::OuterHitOfCell[]>(cs, std::max(1U,nhits), stream);
-  device_isOuterHitOfCell_.reset(
-      (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell)));
+  // use "nhits" to heuristically dimension the workspace
+
+  // no need to use the Traits allocations, since we know this is being compiled for the CPU
+  //device_isOuterHitOfCell_ = Traits::template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
+  device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits));
   assert(device_isOuterHitOfCell_.get());
-  cellStorage_.reset((unsigned char *)malloc(caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
-                                             caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks)));
+  auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
+                         caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks);
+  // no need to use the Traits allocations, since we know this is being compiled for the CPU
+  //cellStorage_ = Traits::template make_unique<unsigned char[]>(cellStorageSize, stream);
+  cellStorage_ = std::make_unique<unsigned char[]>(cellStorageSize);
   device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
   device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() +
                                                              caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors));
@@ -38,17 +41,21 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
                                   device_theCellTracks_.get(), device_theCellTracksContainer_);
-  // device_theCells_ = Traits:: template make_unique<GPUCACell[]>(cs, m_params.maxNumberOfDoublets_, stream);
-  device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * params_.maxNumberOfDoublets_));
+  // no need to use the Traits allocations, since we know this is being compiled for the CPU
+  //device_theCells_ = Traits::template make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
+  device_theCells_ = std::make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_);
   if (0 == nhits)
     return;  // protect against empty events
-  // FIXME avoid magic numbers
+  // take all layer pairs into account
   auto nActualPairs = gpuPixelDoublets::nPairs;
-  if (!params_.includeJumpingForwardDoublets_)
-    nActualPairs = 15;
+  if (not params_.includeJumpingForwardDoublets_) {
+    // exclude forward "jumping" layer pairs
+    nActualPairs = gpuPixelDoublets::nPairsForTriplets;
+  }
   if (params_.minHitsPerNtuplet_ > 3) {
-    nActualPairs = 13;
+    // for quadruplets, exclude all "jumping" layer pairs
+    nActualPairs = gpuPixelDoublets::nPairsForQuadruplets;
   }
   assert(nActualPairs <= gpuPixelDoublets::nPairs);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
index 96639e98939f9..179e73f3f23f8 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -182,12 +182,15 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
   if (0 == nhits)
     return;  // protect against empty events
-  // FIXME avoid magic numbers
+  // take all layer pairs into account
   auto nActualPairs = gpuPixelDoublets::nPairs;
-  if (!params_.includeJumpingForwardDoublets_)
-    nActualPairs = 15;
+  if (not params_.includeJumpingForwardDoublets_) {
+    // exclude forward "jumping" layer pairs
+    nActualPairs = gpuPixelDoublets::nPairsForTriplets;
+  }
   if
(params_.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; + // for quadruplets, exclude all "jumping" layer pairs + nActualPairs = gpuPixelDoublets::nPairsForQuadruplets; } assert(nActualPairs <= gpuPixelDoublets::nPairs); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 6de3f1a51acaa..4e2e241e92605 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -7,7 +7,9 @@ namespace gpuPixelDoublets { - constexpr int nPairs = 13 + 2 + 4; + constexpr int nPairsForQuadruplets = 13; // quadruplets require hits in all layers + constexpr int nPairsForTriplets = nPairsForQuadruplets + 2; // include barrel "jumping" layer pairs + constexpr int nPairs = nPairsForTriplets + 4; // include forward "jumping" layer pairs static_assert(nPairs <= caConstants::maxNumberOfLayerPairs); // start constants @@ -15,7 +17,7 @@ namespace gpuPixelDoublets { CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = { 0, 1, 0, 4, 0, 7, // BPIX1 (3) - 1, 2, 1, 4, 1, 7, // BPIX2 (5) + 1, 2, 1, 4, 1, 7, // BPIX2 (6) 4, 5, 7, 8, // FPIX1 (8) 2, 3, 2, 4, 2, 7, 5, 6, 8, 9, // BPIX3 & FPIX2 (13) 0, 2, 1, 3, // Jumping Barrel (15) From 5bde441bc06c26ee820c0c7067be71ca78861dc1 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 25 Mar 2021 22:04:02 +0100 Subject: [PATCH 101/102] Minor fixes and clean up for the pixel track reconstruction code (cms-patatrack#611) Fix RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml following file renames. Remove unnecessary customisation from RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py . --- .../python/customizePixelTracksSoAonCPU.py | 4 ++-- .../PixelTrackFitting/test/BuildFile.xml | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py index 909959f2d81be..75cfe205027f8 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py @@ -51,11 +51,11 @@ def customizePixelTracksForTriplets(process): def customizePixelTracksSoAonCPUForProfiling(process): - process.MessageLogger.cerr.FwkReport.reportEvery = 100 - process = customizePixelTracksSoAonCPU(process) + process.siPixelRecHitSoAFromLegacy.convertToLegacy = False process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA + process.pixelVertexSoA) process.schedule = cms.Schedule(process.TkSoA) + return process diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index 98dc3d9b282f1..f45da7a1880de 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -9,7 +9,7 @@ - + @@ -21,13 +21,13 @@ - + - + @@ -48,7 +48,7 @@ - + @@ -62,7 +62,7 @@ - + From 16e7fdbac0421964904e27710ba79314cab614cd Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 25 Mar 2021 22:43:22 +0100 Subject: [PATCH 102/102] Temporarily revert the python customisation for the Pixel vertices --- .../python/customizePixelTracksSoAonCPU.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git 
a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py index 75cfe205027f8..24cc16e02b463 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py @@ -20,21 +20,12 @@ def customizePixelTracksSoAonCPU(process): pixelRecHitSrc = 'siPixelRecHitsPreSplitting' ) - from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA - process.pixelVertexSoA = pixelVertexCUDA.clone( - onGPU = False, - pixelTrackSrc = 'pixelTrackSoA' - ) - from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA process.pixelTracks = pixelTrackProducerFromSoA.clone( pixelRecHitLegacySrc = 'siPixelRecHitsPreSplitting' ) - from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA - process.pixelVertices = pixelVertexFromSoA.clone() - - process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA + process.pixelVertexSoA + process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA return process @@ -55,7 +46,8 @@ def customizePixelTracksSoAonCPUForProfiling(process): process.siPixelRecHitSoAFromLegacy.convertToLegacy = False - process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA + process.pixelVertexSoA) + process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA) + process.schedule = cms.Schedule(process.TkSoA) return process
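
Note on the layer-pair constants introduced by cms-patatrack#610 earlier in this series: the selection logic that the CPU and GPU buildDoublets() implementations now share can be summarised in a small standalone sketch. The constants below are copied from gpuPixelDoublets.h as modified by that patch; the helper function selectActivePairs() and the main() driver are illustrative only and do not exist in CMSSW.

    #include <cassert>
    #include <cstdio>

    namespace gpuPixelDoublets {
      // As defined in gpuPixelDoublets.h after cms-patatrack#610:
      // 13 pairs of adjacent layers are enough for quadruplets (a hit on every layer);
      // 2 extra barrel "jumping" pairs and 4 extra forward "jumping" pairs recover
      // triplets with a missing hit.
      constexpr int nPairsForQuadruplets = 13;
      constexpr int nPairsForTriplets = nPairsForQuadruplets + 2;
      constexpr int nPairs = nPairsForTriplets + 4;
    }  // namespace gpuPixelDoublets

    // Illustrative helper (not part of CMSSW): restate the choice of active layer
    // pairs made from the ntuplet-generator configuration in buildDoublets().
    int selectActivePairs(bool includeJumpingForwardDoublets, int minHitsPerNtuplet) {
      int nActualPairs = gpuPixelDoublets::nPairs;  // take all layer pairs into account
      if (!includeJumpingForwardDoublets)
        nActualPairs = gpuPixelDoublets::nPairsForTriplets;  // drop the forward "jumping" pairs
      if (minHitsPerNtuplet > 3)
        nActualPairs = gpuPixelDoublets::nPairsForQuadruplets;  // quadruplets: drop all "jumping" pairs
      assert(nActualPairs <= gpuPixelDoublets::nPairs);
      return nActualPairs;
    }

    int main() {
      std::printf("triplets, with jumping pairs: %d layer pairs\n", selectActivePairs(true, 3));   // 19
      std::printf("quadruplets only:             %d layer pairs\n", selectActivePairs(false, 4));  // 13
      return 0;
    }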