From f761369a6d9195748dab2c68202456a41edd1578 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 12:43:13 +0100 Subject: [PATCH 01/22] more cudacompat --- .../CUDAUtilities/interface/cudaCompat.h | 6 +++- .../CUDAUtilities/interface/launch.h | 8 +++++ .../CUDAUtilities/test/BuildFile.xml | 7 +++++ .../CUDAUtilities/test/Launch_t.cpp | 31 +++++++++++++++++++ .../CUDAUtilities/test/Launch_t.cu | 10 ++++++ 5 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/Launch_t.cu diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 32c7d7b481a50..602e876a12e15 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -6,6 +6,10 @@ */ #ifndef __CUDACC__ + #define CUDA_KERNELS_ON_CPU +#endif + +#ifdef CUDA_KERNELS_ON_CPU #include #include @@ -98,6 +102,6 @@ namespace cudaCompat { using namespace cudaCompat; #endif -#endif +#endif // CUDA_KERNELS_ON_CPU #endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/launch.h b/HeterogeneousCore/CUDAUtilities/interface/launch.h index 24282d690005e..a624f0262c34d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/launch.h +++ b/HeterogeneousCore/CUDAUtilities/interface/launch.h @@ -96,8 +96,12 @@ namespace cudautils { // wrappers for cudaLaunchKernel void launch(void (*kernel)(), LaunchParameters config) { +#ifdef CUDA_KERNELS_ON_CPU + kernel(); +#else cudaCheck(cudaLaunchKernel( (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); +#endif } template @@ -107,6 +111,9 @@ namespace cudautils { std::enable_if_t >::value> #endif launch(F* kernel, LaunchParameters config, Args&&... 
args) { +#ifdef CUDA_KERNELS_ON_CPU + kernel(args...); +#else using function_type = detail::kernel_traits; typename function_type::argument_type_tuple args_copy(args...); @@ -116,6 +123,7 @@ namespace cudautils { detail::pointer_setter()(pointers, args_copy); cudaCheck(cudaLaunchKernel( (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); +#endif } // wrappers for cudaLaunchCooperativeKernel diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index a700c0865f0f2..d9edcba07c182 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -7,6 +7,13 @@ + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp new file mode 100644 index 0000000000000..11a6876ff3665 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp @@ -0,0 +1,31 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" + + + +#include +#include + +#undef __global__ +#define __global__ inline __attribute__((always_inline)) + + +__global__ +void hello(float k) { + + printf("hello %f\n",k); + +} + + + + +int main() { + + requireCUDADevices(); + + cudautils::launch(hello,{1, 1},3.14); + cudaDeviceSynchronize(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu new file mode 100644 index 0000000000000..7983512739b60 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu @@ -0,0 +1,10 @@ +#include + +__global__ +void hello(float k) { + + printf("hello %f\n",k); + +} + + From 466b1aec8cb02ae849d4192e1e3dd9d04647bb0a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 13:56:11 +0100 Subject: [PATCH 02/22] more cudacompat --- .../CUDAUtilities/interface/cpu_unique_ptr.h | 85 +++++++++++++++++++ .../CUDAUtilities/test/cpu_unique_ptr_t.cpp | 31 +++++++ 2 files changed, 116 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h create mode 100644 HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp diff --git a/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h new file mode 100644 index 0000000000000..f4e97f9e47523 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h @@ -0,0 +1,85 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h + +#include +#include + +#include + +namespace cudautils { + namespace cpu { + namespace impl { + // Additional layer of types to distinguish from device:: and host::unique_ptr + class CPUDeleter { + public: + CPUDeleter() = default; + + void operator()(void *ptr) { + ::free(ptr); + } + }; + } // namespace impl + + template + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_cpu_unique_selector { + using non_array = cudautils::cpu::unique_ptr; + }; + template + struct make_cpu_unique_selector { + using unbounded_array = cudautils::cpu::unique_ptr; + }; + template + struct make_cpu_unique_selector { + struct bounded_array {}; + }; + } // namespace impl + } // namespace cpu + + template + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique() { + 
static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the cpu memory is not supported"); + void *mem = ::malloc(sizeof(T)); + return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique(size_t n) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the cpu memory is not supported"); + void *mem = ::malloc(n * sizeof(element_type)); + return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::bounded_array make_cpu_unique(Args &&...) = delete; + + // No check for the trivial constructor, make it clear in the interface + template + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique_uninitialized() { + void *mem = ::malloc(sizeof(T)); + return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique_uninitialized(size_t n) { + using element_type = typename std::remove_extent::type; + void *mem = ::malloc(n * sizeof(element_type)); + return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::bounded_array make_cpu_unique_uninitialized(Args &&...) = + delete; +} // namespace cudautils + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp new file mode 100644 index 0000000000000..e7f3077e999a0 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -0,0 +1,31 @@ +#include "catch.hpp" + +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" + +TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { + + SECTION("Single element") { + auto ptr = cudautils::make_cpu_unique(); + REQUIRE(ptr != nullptr); + } + + SECTION("Reset") { + auto ptr = cudautils::make_cpu_unique(); + REQUIRE(ptr != nullptr); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); + } + + SECTION("Multiple elements") { + auto ptr = cudautils::make_host_unique(10); + REQUIRE(ptr != nullptr); + } + + SECTION("Allocating too much") { + constexpr size_t maxSize = 1 << 30; // 8**10 + auto ptr = cudautils::make_cpu_unique(maxSize+1); + ptr.reset(); + REQUIRE(ptr != nullptr); + } +} From 69b6daac5b895f76db1c52f54c28096a4706b03f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 14:45:58 +0100 Subject: [PATCH 03/22] improve test --- .../PixelVertexFinding/test/VertexFinder_t.h | 125 +++++++++++++++--- 1 file changed, 108 insertions(+), 17 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5261069a6b283..039135efa852f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -56,7 +56,7 @@ struct Event { struct ClusterGenerator { explicit ClusterGenerator(float nvert, float ntrack) - : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) 
{} + : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(0.001,1.) {} void operator()(Event& ev) { int nclus = clusGen(reng); @@ -68,19 +68,28 @@ struct ClusterGenerator { ev.ztrack.clear(); ev.eztrack.clear(); - ev.ivert.clear(); + ev.pttrack.clear(); + ev.ivert.clear(); + float ptMax=0; float pt5=0; for (int iv = 0; iv < nclus; ++iv) { auto nt = trackGen(reng); - ev.itrack[nclus] = nt; + if (iv == 5) nt *= 3; + ev.itrack[iv] = nt; + float ptSum=0; for (int it = 0; it < nt; ++it) { auto err = errgen(reng); // reality is not flat.... ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng)); ev.eztrack.push_back(err * err); ev.ivert.push_back(iv); - ev.pttrack.push_back((iv == 5 ? 1.f : 0.5f) + ptGen(reng)); + ev.pttrack.push_back(std::pow(ptGen(reng),iv==5 ?-1.0f:-0.5f)); ev.pttrack.back() *= ev.pttrack.back(); + ptSum += ev.pttrack.back(); } + ptMax = std::max(ptMax,ptSum); + if (iv == 5) pt5 = ptSum; } + std::cout << "PV, ptMax " << std::sqrt(pt5) << ' ' << std::sqrt(ptMax) << std::endl; + // add noise auto nt = 2 * trackGen(reng); for (int it = 0; it < nt; ++it) { @@ -88,7 +97,7 @@ struct ClusterGenerator { ev.ztrack.push_back(rgen(reng)); ev.eztrack.push_back(err * err); ev.ivert.push_back(9999); - ev.pttrack.push_back(0.5f + ptGen(reng)); + ev.pttrack.push_back(std::pow(ptGen(reng),-0.5f)); ev.pttrack.back() *= ev.pttrack.back(); } } @@ -99,7 +108,7 @@ struct ClusterGenerator { std::poisson_distribution clusGen; std::poisson_distribution trackGen; std::normal_distribution gauss; - std::exponential_distribution ptGen; + std::uniform_real_distribution ptGen; // becomes a power low }; // a macro SORRY @@ -112,6 +121,12 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) { printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); } +__global__ void linit(ZVertices * pdata, WorkSpace * pws, int nt) { + auto & __restrict__ data = *pdata; + auto & __restrict__ ws = *pws; + for (int i=0; i par{{eps, 0.01f, 9.0f}}; - for (int nav = 30; nav < 80; nav += 20) { + for (int nav = 30; nav < 100; nav += 20) { ClusterGenerator gen(nav, 10); - for (int i = 8; i < 20; ++i) { - auto kk = i / 4; // M param + for (int iii = 8; iii < 20; ++iii) { + auto kk = iii / 4; // M param gen(ev); + std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; + int nt = ev.ztrack.size(); + int nvori = ev.zvert.size(); + int ntori = nt; + assert(ntori>>(onGPU_d.get(), ws_d.get()); + linit<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get(),ntori); #else onGPU_d->init(); ws_d->init(); + for (int16_t i=0; iitrk[i]=i; onGPU_d->idv[i] = -1;} // FIXME do the same on GPU.... 
#endif - std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; - auto nt = ev.ztrack.size(); #ifdef __CUDACC__ cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + #else ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); @@ -156,15 +179,15 @@ int main() { ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif - std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (iii % 4) << std::endl; - if ((i % 4) == 0) + if ((iii % 4) == 0) par = {{eps, 0.02f, 12.0f}}; - if ((i % 4) == 1) + if ((iii % 4) == 1) par = {{eps, 0.02f, 9.0f}}; - if ((i % 4) == 2) + if ((iii % 4) == 2) par = {{eps, 0.01f, 9.0f}}; - if ((i % 4) == 3) + if ((iii % 4) == 3) par = {{0.7f * eps, 0.01f, 9.0f}}; uint32_t nv = 0; @@ -200,6 +223,7 @@ int main() { continue; } + int16_t * idv = nullptr; float* zv = nullptr; float* wv = nullptr; float* ptv2 = nullptr; @@ -210,18 +234,21 @@ int main() { float chi2[2 * nv]; // make space for splitting... #ifdef __CUDACC__ + int16_t hidv[16000]; float hzv[2 * nv]; float hwv[2 * nv]; float hptv2[2 * nv]; int32_t hnn[2 * nv]; uint16_t hind[2 * nv]; + idv = hidv; zv = hzv; wv = hwv; ptv2 = hptv2; nn = hnn; ind = hind; #else + idv = onGPU_d->idv; zv = onGPU_d->zv; wv = onGPU_d->wv; ptv2 = onGPU_d->ptv2; @@ -236,6 +263,64 @@ int main() { memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif + auto verifyMatch = [&]() { + + // matching-merging metrics + constexpr int MAXMA = 32; + struct Match { Match() {for (auto&e:vid)e=-1; for (auto&e:nt)e=0;} std::array vid; std::array nt; }; + + auto nnn=0; + Match matches[nv]; for (auto kv = 0U; kv < nv; ++kv) { matches[kv] = Match();} + auto iPV = ind[nv - 1]; + for (int it=0; it9990) continue; + assert(iv9990) continue; + ++nnn; + for (int i=0; i=0 && mx0.75f) ++nok; + if (frac[kv]<0.5f) ++nmess; + auto ldz = std::abs(zv[kv] - ev.zvert[itv]); + dz = std::max(dz,ldz); + int nm5=0; int nm7=0; + int ntt=0; + for (int i=0; i0.5f) ++nm5; + if (f>0.75f) ++nm7; + } + if (nm5>1) ++merged50; + if (nm7>1) ++merged75; + if (kv == iPV ) std::cout << "PV " << itv << ' ' << std::sqrt(ptv2[kv]) << ' ' << float(ntt)/float(ev.itrack[itv]) << '/' << frac[kv] << '/' << nm5 << '/' << nm7 << ' ' << dz << std::endl; + } + // for (auto f: frac) std::cout << f << ' '; + // std::cout << std::endl; + std::cout << "ori/tot/matched/merged5//merged7/random/dz " + << nvori << '/' << nv << '/' << nok << '/' << merged50 << '/' << merged75 << '/' << nmess + << '/' << dz << std::endl; + }; // verifyMatch + + for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) chi2[j] /= float(nn[j]); @@ -302,7 +387,9 @@ int main() { cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(idv, LOC_ONGPU(idv), nt * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif + for (auto j = 0U; j < 
nv; ++j) if (nn[j] > 0) chi2[j] /= float(nn[j]); @@ -333,7 +420,7 @@ int main() { } dd[kv] = md; } - if (i == 6) { + if (iii == 6) { for (auto d : dd) std::cout << d << ' '; std::cout << std::endl; @@ -345,6 +432,10 @@ int main() { rms = std::sqrt(rms) / (nv - 1); std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl; + + verifyMatch(); + + } // loop on events } // lopp on ave vert From f398b454e94ee3cb3f9b792315d8fb52a392c6b1 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 15:46:02 +0100 Subject: [PATCH 04/22] compiles --- .../Common/interface/HeterogeneousSoA.h | 32 +++++++++++++------ .../CUDAUtilities/interface/launch.h | 4 +-- .../plugins/CAHitNtupletGeneratorKernels.cc | 8 ++--- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- .../src/gpuVertexFinderImpl.h | 30 ++++++++--------- .../PixelVertexFinding/test/VertexFinder_t.h | 25 ++++++--------- 6 files changed, 50 insertions(+), 51 deletions(-) diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index 907b7647a3452..e0b6d330ce27a 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -5,6 +5,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" @@ -21,15 +22,15 @@ class HeterogeneousSoA { explicit HeterogeneousSoA(cudautils::device::unique_ptr &&p) : dm_ptr(std::move(p)) {} explicit HeterogeneousSoA(cudautils::host::unique_ptr &&p) : hm_ptr(std::move(p)) {} - explicit HeterogeneousSoA(std::unique_ptr &&p) : std_ptr(std::move(p)) {} + explicit HeterogeneousSoA(cudautils::cpu::unique_ptr &&p) : cm_ptr(std::move(p)) {} - auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } + auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); } auto const &operator*() const { return *get(); } auto const *operator-> () const { return get(); } - auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } + auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); } auto &operator*() { return *get(); } @@ -47,7 +48,7 @@ class HeterogeneousSoA { // a union wan't do it, a variant will not be more efficienct cudautils::device::unique_ptr dm_ptr; //! cudautils::host::unique_ptr hm_ptr; //! - std::unique_ptr std_ptr; //! + cudautils::cpu::unique_ptr cm_ptr; //! 
}; namespace cudaCompat { @@ -109,31 +110,42 @@ namespace cudaCompat { struct CPUTraits { template - using unique_ptr = std::unique_ptr; + using unique_ptr = cudautils::cpu::unique_ptr;; + + + template + static auto make_unique() { + return cudautils::make_cpu_unique(); + } + + template + static auto make_unique(size_t size) { + return cudautils::make_cpu_unique(size); + } template static auto make_unique(cudaStream_t) { - return std::make_unique(); + return cudautils::make_cpu_unique(); } template static auto make_unique(size_t size, cudaStream_t) { - return std::make_unique(size); + return cudautils::make_cpu_unique(size); } template static auto make_host_unique(cudaStream_t) { - return std::make_unique(); + return cudautils::make_cpu_unique(); } template static auto make_device_unique(cudaStream_t) { - return std::make_unique(); + return cudautils::make_cpu_unique(); } template static auto make_device_unique(size_t size, cudaStream_t) { - return std::make_unique(size); + return cudautils::make_cpu_unique(size); } }; diff --git a/HeterogeneousCore/CUDAUtilities/interface/launch.h b/HeterogeneousCore/CUDAUtilities/interface/launch.h index a624f0262c34d..c8c95b0744300 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/launch.h +++ b/HeterogeneousCore/CUDAUtilities/interface/launch.h @@ -94,7 +94,7 @@ namespace cudautils { } // namespace detail // wrappers for cudaLaunchKernel - + inline void launch(void (*kernel)(), LaunchParameters config) { #ifdef CUDA_KERNELS_ON_CPU kernel(); @@ -127,7 +127,7 @@ namespace cudautils { } // wrappers for cudaLaunchCooperativeKernel - + inline void launch_cooperative(void (*kernel)(), LaunchParameters config) { cudaCheck(cudaLaunchCooperativeKernel( (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 75066458dc170..b4cd1f8aa1abc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -19,10 +19,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... 
- // overkill to use template here (std::make_unique would suffice) - // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); - device_isOuterHitOfCell_.reset( - (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); + device_isOuterHitOfCell_ = Traits:: template make_unique(std::max(1U,nhits),stream); assert(device_isOuterHitOfCell_.get()); gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, @@ -31,8 +28,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_theCellTracks_, device_theCellTracksContainer_.get()); - // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); - device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); + device_theCells_ = Traits:: template make_unique(m_params.maxNumberOfDoublets_,stream); if (0 == nhits) return; // protect against empty events diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 2e875caba7130..9801d17de8118 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -187,7 +187,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH } PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); + PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique()); auto* soa = tracks.get(); assert(soa); diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index d6e63227ccf85..6c72937ae590c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -1,12 +1,14 @@ -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h" -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h" -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksIterative.h" +#include "gpuClusterTracksByDensity.h" +#include "gpuClusterTracksDBSCAN.h" +#include "gpuClusterTracksIterative.h" #include "gpuFitVertices.h" #include "gpuSortByPt2.h" #include "gpuSplitVertices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + namespace gpuVertexFinder { @@ -84,38 +86,32 @@ namespace gpuVertexFinder { } #endif -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { // std::cout << "producing Vertices on GPU" << std::endl; ZVertexHeterogeneous vertices(cudautils::make_device_unique(stream)); #else ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { + cudaStream_t stream = 0; // std::cout << "producing Vertices on CPU" << std::endl; - ZVertexHeterogeneous vertices(std::make_unique()); + ZVertexHeterogeneous vertices(cudautils::make_cpu_unique()); #endif assert(tksoa); auto* soa = vertices.get(); assert(soa); -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU auto ws_d = cudautils::make_device_unique(stream); #else - auto ws_d = std::make_unique(); + auto ws_d = cudautils::make_cpu_unique(); #endif -#ifdef 
__CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); auto blockSize = 128; auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); - cudaCheck(cudaGetLastError()); -#else - cudaCompat::resetGrid(); - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin); -#endif + cudautils::launch(init,{1,1,0,stream},soa, ws_d.get()); + cudautils::launch(loadTracks, {numberOfBlocks, blockSize, 0, stream}, tksoa, soa, ws_d.get(), ptMin); -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 039135efa852f..1bf4e223ab52f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -22,7 +22,6 @@ #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h" #ifdef ONE_KERNEL -#ifdef __CUDACC__ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws, int minT, // min number of neighbours to be "seed" @@ -41,7 +40,6 @@ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, sortByPt2(pdata, pws); } #endif -#endif using namespace gpuVertexFinder; @@ -128,7 +126,7 @@ __global__ void linit(ZVertices * pdata, WorkSpace * pws, int nt) { } int main() { -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU requireCUDADevices(); auto onGPU_d = cudautils::make_device_unique(1, nullptr); @@ -157,7 +155,7 @@ int main() { assert(ntori>>(onGPU_d.get(), ws_d.get()); linit<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get(),ntori); #else @@ -166,7 +164,7 @@ int main() { for (int16_t i=0; iitrk[i]=i; onGPU_d->idv[i] = -1;} // FIXME do the same on GPU.... #endif -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); @@ -191,7 +189,7 @@ int main() { par = {{0.7f * eps, 0.01f, 9.0f}}; uint32_t nv = 0; -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); @@ -233,7 +231,7 @@ int main() { // keep chi2 separated... float chi2[2 * nv]; // make space for splitting... 
-#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU int16_t hidv[16000]; float hzv[2 * nv]; float hwv[2 * nv]; @@ -256,7 +254,7 @@ int main() { ind = onGPU_d->sortInd; #endif -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else @@ -329,7 +327,7 @@ int main() { std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); @@ -348,7 +346,7 @@ int main() { std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU // one vertex per block!!! cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); @@ -361,12 +359,9 @@ int main() { #endif std::cout << "after split " << nv << std::endl; -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); - cudaCheck(cudaGetLastError()); - cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); - cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); @@ -380,7 +375,7 @@ int main() { continue; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); From f7ec457013d94a5d30b074051175d02e36cd835f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 16:37:57 +0100 Subject: [PATCH 05/22] compiles --- .../Common/interface/HeterogeneousSoA.h | 24 ++++---- .../CUDAUtilities/interface/cpu_unique_ptr.h | 9 +-- .../CUDAUtilities/test/cpu_unique_ptr_t.cpp | 8 +-- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- .../src/PixelVertexProducerCUDA.cc | 3 +- .../PixelVertexFinding/src/gpuVertexFinder.h | 2 +- .../src/gpuVertexFinderImpl.h | 59 ++++++------------- 7 files changed, 42 insertions(+), 65 deletions(-) diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index e0b6d330ce27a..f4b70aa7188fd 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -115,37 +115,37 @@ namespace cudaCompat { template static auto make_unique() { - return cudautils::make_cpu_unique(); + return cudautils::make_cpu_unique(0); } template static auto make_unique(size_t size) { - return cudautils::make_cpu_unique(size); + return cudautils::make_cpu_unique(size,0); } template - static auto make_unique(cudaStream_t) { - return cudautils::make_cpu_unique(); + static auto make_unique(cudaStream_t stream) { + return cudautils::make_cpu_unique(stream); } template - static auto make_unique(size_t size, cudaStream_t) { - return 
cudautils::make_cpu_unique(size); + static auto make_unique(size_t size, cudaStream_t stream) { + return cudautils::make_cpu_unique(size, stream); } template - static auto make_host_unique(cudaStream_t) { - return cudautils::make_cpu_unique(); + static auto make_host_unique(cudaStream_t stream) { + return cudautils::make_cpu_unique(stream); } template - static auto make_device_unique(cudaStream_t) { - return cudautils::make_cpu_unique(); + static auto make_device_unique(cudaStream_t stream) { + return cudautils::make_cpu_unique(stream); } template - static auto make_device_unique(size_t size, cudaStream_t) { - return cudautils::make_cpu_unique(size); + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cudautils::make_cpu_unique(size, stream); } }; diff --git a/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h index f4e97f9e47523..dab7d15f40f92 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h @@ -5,6 +5,7 @@ #include #include +#include namespace cudautils { namespace cpu { @@ -40,7 +41,7 @@ namespace cudautils { } // namespace cpu template - typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique() { + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique(cudaStream_t) { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the cpu memory is not supported"); void *mem = ::malloc(sizeof(T)); @@ -49,7 +50,7 @@ namespace cudautils { } template - typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique(size_t n) { + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique(size_t n, cudaStream_t) { using element_type = typename std::remove_extent::type; static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the cpu memory is not supported"); @@ -63,14 +64,14 @@ namespace cudautils { // No check for the trivial constructor, make it clear in the interface template - typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique_uninitialized() { + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique_uninitialized(cudaStream_t) { void *mem = ::malloc(sizeof(T)); return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), cpu::impl::CPUDeleter()}; } template - typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique_uninitialized(size_t n) { + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique_uninitialized(size_t n, cudaStream_t) { using element_type = typename std::remove_extent::type; void *mem = ::malloc(n * sizeof(element_type)); return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp index e7f3077e999a0..35577dd1e508c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -5,12 +5,12 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Single element") { - auto ptr = cudautils::make_cpu_unique(); + auto ptr = cudautils::make_cpu_unique(0); REQUIRE(ptr != nullptr); } SECTION("Reset") { - auto ptr = cudautils::make_cpu_unique(); + auto ptr = cudautils::make_cpu_unique(0); REQUIRE(ptr != nullptr); 
ptr.reset(); @@ -18,13 +18,13 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { } SECTION("Multiple elements") { - auto ptr = cudautils::make_host_unique(10); + auto ptr = cudautils::make_host_unique(10,0); REQUIRE(ptr != nullptr); } SECTION("Allocating too much") { constexpr size_t maxSize = 1 << 30; // 8**10 - auto ptr = cudautils::make_cpu_unique(maxSize+1); + auto ptr = cudautils::make_cpu_unique(maxSize+1,0); ptr.reset(); REQUIRE(ptr != nullptr); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 9801d17de8118..4d8ecd96815c5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -187,7 +187,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH } PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique()); + PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique(0)); auto* soa = tracks.get(); assert(soa); diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index e959abcfef949..acf7a07808add 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -117,7 +117,8 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; */ - iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin)); + cudaStream_t stream=0; + iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin)); } } diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 6cd86c93a6737..351a7a47bdbf1 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -64,7 +64,7 @@ namespace gpuVertexFinder { ~Producer() = default; ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const; + ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index 6c72937ae590c..ba3644e09d8bb 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -85,16 +85,17 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #endif - #ifndef CUDA_KERNELS_ON_CPU ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { +#else + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { +#endif +#ifndef CUDA_KERNELS_ON_CPU // std::cout << "producing Vertices on GPU" << std::endl; ZVertexHeterogeneous vertices(cudautils::make_device_unique(stream)); #else - ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { - cudaStream_t stream = 0; // std::cout << "producing Vertices on CPU" << 
std::endl; - ZVertexHeterogeneous vertices(cudautils::make_cpu_unique()); + ZVertexHeterogeneous vertices(cudautils::make_cpu_unique(stream)); #endif assert(tksoa); auto* soa = vertices.get(); @@ -103,7 +104,7 @@ namespace gpuVertexFinder { #ifndef CUDA_KERNELS_ON_CPU auto ws_d = cudautils::make_device_unique(stream); #else - auto ws_d = cudautils::make_cpu_unique(); + auto ws_d = cudautils::make_cpu_unique(stream); #endif auto blockSize = 128; @@ -111,56 +112,30 @@ namespace gpuVertexFinder { cudautils::launch(init,{1,1,0,stream},soa, ws_d.get()); cudautils::launch(loadTracks, {numberOfBlocks, blockSize, 0, stream}, tksoa, soa, ws_d.get(), ptMin); -#ifndef CUDA_KERNELS_ON_CPU if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(vertexFinderOneKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - cudaCheck(cudaGetLastError()); + cudautils::launch(vertexFinderKernel1,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); // one block per vertex... - splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); + cudautils::launch(splitVerticesKernel,{1024, 128, 0, stream},soa, ws_d.get(), 9.f); + cudautils::launch(vertexFinderKernel2,{1, 1024 - 256, 0, stream},soa, ws_d.get()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksByDensityKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksDBSCAN,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksIterative,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.); - cudaCheck(cudaGetLastError()); + cudautils::launch(fitVerticesKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), 50.); // one block per vertex... 
- splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.); - cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); - } - cudaCheck(cudaGetLastError()); -#else // __CUDACC__ - if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(splitVerticesKernel,{1024, 128, 0, stream},soa, ws_d.get(), 9.f); + cudautils::launch(fitVerticesKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), 5000.); + cudautils::launch(sortByPt2Kernel,{1, 1024 - 256, 0, stream},soa, ws_d.get()); } - // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; - fitVertices(soa, ws_d.get(), 50.); - // one block per vertex! - blockIdx.x = 0; - gridDim.x = 1; - splitVertices(soa, ws_d.get(), 9.f); - resetGrid(); - fitVertices(soa, ws_d.get(), 5000.); - sortByPt2(soa, ws_d.get()); -#endif return vertices; } From 332cb4559b36d2c67f9d65f7989d03f58ff6de49 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 5 Dec 2019 10:05:26 +0100 Subject: [PATCH 06/22] compiles --- .../Common/interface/HeterogeneousSoA.h | 6 ++++ .../src/PixelVertexProducerCUDA.cc | 4 +-- ...gpuVertexFinder.cpp => gpuVertexFinder.cc} | 0 .../PixelVertexFinding/src/gpuVertexFinder.h | 6 ++-- .../src/gpuVertexFinderImpl.h | 30 ++++++++----------- 5 files changed, 25 insertions(+), 21 deletions(-) rename RecoPixelVertexing/PixelVertexFinding/src/{gpuVertexFinder.cpp => gpuVertexFinder.cc} (100%) diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index f4b70aa7188fd..8edc45f8671b2 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -54,6 +54,8 @@ class HeterogeneousSoA { namespace cudaCompat { struct GPUTraits { + static constexpr const char * name = "GPU"; + template using unique_ptr = cudautils::device::unique_ptr; @@ -84,6 +86,8 @@ namespace cudaCompat { }; struct HostTraits { + static constexpr const char * name = "HOST"; + template using unique_ptr = cudautils::host::unique_ptr; @@ -109,6 +113,8 @@ namespace cudaCompat { }; struct CPUTraits { + static constexpr const char * name = "CPU"; + template using unique_ptr = cudautils::cpu::unique_ptr;; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index acf7a07808add..f49b0d7b6dced 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -96,7 +96,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent assert(tracks); - ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin)); + ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make(ctx.stream(), tracks, m_ptMin,m_OnGPU)); } else { auto const* tracks = iEvent.get(tokenCPUTrack_).get(); @@ -118,7 +118,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent */ cudaStream_t stream=0; - iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin)); + iEvent.emplace(tokenCPUVertex_, 
m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); } } diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc similarity index 100% rename from RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp rename to RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 351a7a47bdbf1..9f21fa4b650a7 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -63,8 +63,10 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; - ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + template + ZVertexHeterogeneous makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + + ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index ba3644e09d8bb..98103e89ab012 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -85,27 +85,16 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #endif -#ifndef CUDA_KERNELS_ON_CPU - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { -#else - ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { -#endif -#ifndef CUDA_KERNELS_ON_CPU - // std::cout << "producing Vertices on GPU" << std::endl; - ZVertexHeterogeneous vertices(cudautils::make_device_unique(stream)); -#else - // std::cout << "producing Vertices on CPU" << std::endl; - ZVertexHeterogeneous vertices(cudautils::make_cpu_unique(stream)); -#endif + + template + ZVertexHeterogeneous Producer::makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { + std::cout << "producing Vertices on " << Traits::name << std::endl; + ZVertexHeterogeneous vertices(Traits::template make_unique(stream)); assert(tksoa); auto* soa = vertices.get(); assert(soa); -#ifndef CUDA_KERNELS_ON_CPU - auto ws_d = cudautils::make_device_unique(stream); -#else - auto ws_d = cudautils::make_cpu_unique(stream); -#endif + auto ws_d = Traits::template make_unique(stream); auto blockSize = 128; auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; @@ -140,6 +129,13 @@ namespace gpuVertexFinder { return vertices; } + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + return onGPU ? 
+ makeImpl(stream,tksoa,ptMin) : + makeImpl(stream,tksoa,ptMin); + } + + } // namespace gpuVertexFinder #undef FROM From 069615973f50b64bc42d11c574e1436f28b43f8a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 5 Dec 2019 10:33:47 +0100 Subject: [PATCH 07/22] now runs properly --- .../PixelVertexFinding/src/PixelVertexProducerCUDA.cc | 4 ++-- .../PixelVertexFinding/src/gpuVertexFinder.cc | 8 ++++++++ .../PixelVertexFinding/src/gpuVertexFinder.cu | 7 +++++++ .../PixelVertexFinding/src/gpuVertexFinder.h | 1 + .../PixelVertexFinding/src/gpuVertexFinderImpl.h | 7 ------- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index f49b0d7b6dced..5d81407d00f5b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -96,7 +96,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent assert(tracks); - ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make(ctx.stream(), tracks, m_ptMin,m_OnGPU)); + ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make(ctx.stream(), tracks, m_ptMin,m_OnGPU)); } else { auto const* tracks = iEvent.get(tokenCPUTrack_).get(); @@ -118,7 +118,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent */ cudaStream_t stream=0; - iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); + iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); } } diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc index 084763385bdb4..e8c6a812d86d2 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc @@ -1 +1,9 @@ +#define CUDA_KERNELS_ON_CPU // not needed yet #include "gpuVertexFinderImpl.h" +namespace gpuVertexFinder { +template<> + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + assert(!onGPU); + return makeImpl(stream,tksoa,ptMin); + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu index 084763385bdb4..96739a42c3b0a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu @@ -1 +1,8 @@ #include "gpuVertexFinderImpl.h" +namespace gpuVertexFinder { +template<> + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + assert(onGPU); + return makeImpl(stream,tksoa,ptMin); + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 9f21fa4b650a7..4dc4572da6e7e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -66,6 +66,7 @@ namespace gpuVertexFinder { template ZVertexHeterogeneous makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + template ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const; private: diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h 
b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index 98103e89ab012..480cfe3a1b0e0 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -129,13 +129,6 @@ namespace gpuVertexFinder { return vertices; } - ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { - return onGPU ? - makeImpl(stream,tksoa,ptMin) : - makeImpl(stream,tksoa,ptMin); - } - - } // namespace gpuVertexFinder #undef FROM From 34f370434bbf2938d08b35b2f0641c3803968b85 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 5 Dec 2019 10:35:14 +0100 Subject: [PATCH 08/22] remove cout --- RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index 480cfe3a1b0e0..6fd6e985f25b9 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -88,7 +88,7 @@ namespace gpuVertexFinder { template ZVertexHeterogeneous Producer::makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { - std::cout << "producing Vertices on " << Traits::name << std::endl; + // std::cout << "producing Vertices on " << Traits::name << std::endl; ZVertexHeterogeneous vertices(Traits::template make_unique(stream)); assert(tksoa); auto* soa = vertices.get(); From c87c860bfb5381437f5979559f5cc79b32e39c01 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 10:10:42 +0100 Subject: [PATCH 09/22] a more complete test --- .../Common/interface/HeterogeneousSoA.h | 4 +- .../CUDAUtilities/test/BuildFile.xml | 5 ++ .../CUDAUtilities/test/Launch_t.cpp | 31 +++++-------- .../CUDAUtilities/test/Launch_t.cu | 22 +++++---- .../CUDAUtilities/test/Launch_t.h | 46 +++++++++++++++++++ .../CUDAUtilities/test/cpu_unique_ptr_t.cpp | 8 ++-- .../src/PixelVertexProducerCUDA.cc | 2 +- 7 files changed, 82 insertions(+), 36 deletions(-) create mode 100644 HeterogeneousCore/CUDAUtilities/test/Launch_t.h diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index 8edc45f8671b2..08bdb42b28a98 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -121,12 +121,12 @@ namespace cudaCompat { template static auto make_unique() { - return cudautils::make_cpu_unique(0); + return cudautils::make_cpu_unique(cudaStreamDefault); } template static auto make_unique(size_t size) { - return cudautils::make_cpu_unique(size,0); + return cudautils::make_cpu_unique(size,cudaStreamDefault); } template diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index d9edcba07c182..893d1ab1745eb 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -14,6 +14,11 @@ + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp index 11a6876ff3665..77c824fc833c4 100644 --- a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp @@ -1,31 +1,24 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "Launch_t.h" - -#include -#include - -#undef __global__ -#define __global__ inline __attribute__((always_inline)) - - -__global__ -void hello(float k) { - - printf("hello %f\n",k); - -} - - +#ifdef LaunchInCU + void wrapperInCU(); +#endif int main() { requireCUDADevices(); - cudautils::launch(hello,{1, 1},3.14); - cudaDeviceSynchronize(); + printf("in Main\n"); + printEnv(); + + wrapper(); + +#ifdef LaunchInCU + wrapperInCU(); +#endif return 0; } diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu index 7983512739b60..90b24fb6fec4f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu @@ -1,10 +1,12 @@ -#include - -__global__ -void hello(float k) { - - printf("hello %f\n",k); - -} - - +#ifdef CUDA_KERNELS_ON_CPU +#undef CUDA_KERNELS_ON_CPU +#endif + +#include "Launch_t.h" + +#ifdef LaunchInCU + void wrapperInCU() { + printf("in cu wrapper\n"); + wrapper(); + } +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.h b/HeterogeneousCore/CUDAUtilities/test/Launch_t.h new file mode 100644 index 0000000000000..3237f6383f47d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.h @@ -0,0 +1,46 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include +#include + +// these are in cudaCompat: here we do just a minimal test of "launch" +#ifndef __CUDACC__ +#undef __global__ +#define __global__ inline __attribute__((always_inline)) +#endif + +inline +constexpr void printEnv() { + printf("ENV\n"); +#ifdef CUDA_KERNELS_ON_CPU + printf("CUDA_KERNELS_ON_CPU defined\n"); +#endif +#ifdef __CUDACC__ + printf("__CUDACC__ defined\n"); +#endif +#ifdef __CUDA_RUNTIME_H__ + printf("__CUDA_RUNTIME_H__ defined\n"); +#endif +#ifdef __CUDA_ARCH__ + printf("__CUDA_ARCH__ defined\n"); +#endif + printf("---\n"); +} + +__global__ +void hello(float k) { + + printf("hello from kernel %f\n",k); + printEnv(); +} + + +inline void wrapper() { + + printf("in Wrapper\n"); + printEnv(); + + cudautils::launch(hello,{1, 1},3.14); + cudaDeviceSynchronize(); + +} + diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp index 35577dd1e508c..828b7912ed6c4 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -5,12 +5,12 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Single element") { - auto ptr = cudautils::make_cpu_unique(0); + auto ptr = cudautils::make_cpu_unique(cudaStreamDefault); REQUIRE(ptr != nullptr); } SECTION("Reset") { - auto ptr = cudautils::make_cpu_unique(0); + auto ptr = cudautils::make_cpu_unique(cudaStreamDefault); REQUIRE(ptr != nullptr); ptr.reset(); @@ -18,13 +18,13 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { } SECTION("Multiple elements") { - auto ptr = cudautils::make_host_unique(10,0); + auto ptr = cudautils::make_cpu_unique(10,cudaStreamDefault); REQUIRE(ptr != nullptr); } SECTION("Allocating too much") { constexpr size_t maxSize = 1 << 30; // 8**10 - auto ptr = cudautils::make_cpu_unique(maxSize+1,0); + auto ptr = cudautils::make_cpu_unique(maxSize+1,cudaStreamDefault); ptr.reset(); REQUIRE(ptr != nullptr); } diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index 5d81407d00f5b..347476cf1b6a3 
100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -117,7 +117,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; */ - cudaStream_t stream=0; + cudaStream_t stream=cudaStreamDefault; iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); } } From b26b7f4204974a6f444b679d3c10aff6d3242bb7 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 10:19:05 +0100 Subject: [PATCH 10/22] make sure cuda kernels are seen as inline functions when not compiled as such --- .../CUDAUtilities/interface/cudaCompat.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 602e876a12e15..892cb7cf5271a 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -90,18 +90,20 @@ namespace cudaCompat { #define __forceinline__ #endif -// make sure function are inlined to avoid multiple definition #ifndef __CUDA_ARCH__ +using namespace cudaCompat; +#endif + +#endif // CUDA_KERNELS_ON_CPU + + +// make sure function are inlined to avoid multiple definition +#ifndef __CUDACC__ #undef __global__ #define __global__ inline __attribute__((always_inline)) #undef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) #endif -#ifndef __CUDA_ARCH__ -using namespace cudaCompat; -#endif - -#endif // CUDA_KERNELS_ON_CPU #endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h From e89d8864cea88b0a213b57d7f5327806683133d3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 11:32:21 +0100 Subject: [PATCH 11/22] move to single source --- .../Common/interface/HeterogeneousSoA.h | 4 +- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 4 +- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 58 ------------------- .../plugins/CAHitNtupletGeneratorOnGPU.h | 51 +++++++++++++++- 4 files changed, 53 insertions(+), 64 deletions(-) diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index 08bdb42b28a98..0c3e8bd37cb6a 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -55,6 +55,7 @@ namespace cudaCompat { struct GPUTraits { static constexpr const char * name = "GPU"; + static constexpr bool runOnDevice = true; template using unique_ptr = cudautils::device::unique_ptr; @@ -87,6 +88,7 @@ namespace cudaCompat { struct HostTraits { static constexpr const char * name = "HOST"; + static constexpr bool runOnDevice = false; template using unique_ptr = cudautils::host::unique_ptr; @@ -114,11 +116,11 @@ namespace cudaCompat { struct CPUTraits { static constexpr const char * name = "CPU"; + static constexpr bool runOnDevice = false; template using unique_ptr = cudautils::cpu::unique_ptr;; - template static auto make_unique() { return cudautils::make_cpu_unique(cudaStreamDefault); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 11b644d466768..a87b45f47cbe5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -74,10 +74,10 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const CUDAScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuples(hits, bf, ctx.stream())); } else { auto const& hits = iEvent.get(tokenHitCPU_); - iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf,cudaStreamDefault)); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 4d8ecd96815c5..e6525bd16a49b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -159,61 +159,3 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& "cuts\" based on the fit results (pT, Tip, Zip)."); } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, - float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cudautils::make_device_unique(stream)); - - auto* soa = tracks.get(); - - CAHitNtupletGeneratorKernelsGPU kernels(m_params); - kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - - kernels.allocateOnGPU(stream); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); - - kernels.buildDoublets(hits_d, stream); - kernels.launchKernels(hits_d, soa, stream); - kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" - if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } - kernels.classifyTuples(hits_d, soa, stream); - - return tracks; -} - -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique(0)); - - auto* soa = tracks.get(); - assert(soa); - - CAHitNtupletGeneratorKernelsCPU kernels(m_params); - kernels.counters_ = m_counters; - kernels.allocateOnGPU(nullptr); - - kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); - kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" - - if (0 == hits_d.nHits()) - return tracks; - - // now fit - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); - - if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } - - kernels.classifyTuples(hits_d, soa, nullptr); - - return tracks; -} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index de2e1913dd18b..82ccb6b835d68 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -47,9 +47,8 @@ class CAHitNtupletGeneratorOnGPU { static void fillDescriptions(edm::ParameterSetDescription& desc); static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; - - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + template + PixelTrackHeterogeneous makeTuples(Hits2D const& hits_d, float bfield, cudaStream_t stream) const; private: void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; @@ -63,4 +62,50 @@ class CAHitNtupletGeneratorOnGPU { Counters* m_counters = nullptr; }; +template +inline +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hits_d, + float bfield, + cudaStream_t stream) const { + PixelTrackHeterogeneous tracks(Traits:: template make_unique(stream)); + + using NTupletGeneratorKernels = CAHitNtupletGeneratorKernels; + + auto* soa = tracks.get(); + + NTupletGeneratorKernels kernels(m_params); + kernels.counters_ = m_counters; + kernels.allocateOnGPU(stream); + + kernels.buildDoublets(hits_d, stream); + kernels.launchKernels(hits_d, soa, stream); + kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + if constexpr (!Traits::runOnDevice) + if (0 == hits_d.nHits()) + return tracks; + + // now fit + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + if (m_params.useRiemannFit_) { + if constexpr (Traits::runOnDevice) { + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + }else { + fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + } + } else { + if constexpr (Traits::runOnDevice) { + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } else { + fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + } + } + kernels.classifyTuples(hits_d, soa, stream); + + return tracks; +} + + + #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h From a7d242e9ed163f74ca9bc9533fb8127809f672b2 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 12:22:56 +0100 Subject: [PATCH 12/22] explicit instantiation --- .../plugins/BrokenLineFitOnGPU.cc | 71 ++-------------- .../plugins/BrokenLineFitOnGPU.cu | 84 +------------------ .../plugins/BrokenLineFitOnGPU.h | 81 ++++++++++++++++++ .../plugins/CAHitNtupletGeneratorKernels.h | 2 +- .../CAHitNtupletGeneratorKernelsAlloc.h | 4 +- .../plugins/CAHitNtupletGeneratorOnGPU.h | 10 +-- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 4 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 9 +- 8 files changed, 102 insertions(+), 163 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index cc5865d97fd95..6c90ab1dd8f6f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,68 +1,7 @@ #include "BrokenLineFitOnGPU.h" -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { - assert(tuples_d); - - // Fit internals - 
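The space in Traits:: template make_unique above is not accidental: inside a function template, make_unique is a dependent member template, so the template keyword is needed for the call to parse as a template argument list. A reduced, self-contained example of the rule, with hypothetical names:

    #include <memory>

    struct SomeTraits {  // stand-in for the GPU/Host/CPU traits
      template <typename T>
      static std::unique_ptr<T[]> make_unique(int n) {
        return std::unique_ptr<T[]>(new T[n]);
      }
    };

    template <typename Traits>
    void user() {
      // without 'template', the '<' would be parsed as a less-than operator
      auto p = Traits::template make_unique<float>(10);
      p[0] = 1.f;
    }

    // e.g. user<SomeTraits>();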
auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // fit triplets - kernelBLFastFit<3>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - - kernelBLFit<3>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - - // fit quads - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); - - if (fit5as4_) { - // fit penta (only first 4) - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - } else { - // fit penta (all 5) - kernelBLFastFit<5>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - - kernelBLFit<5>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - } - - } // loop on concurrent fits -} +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 660cf75e1f460..2eba3e9af5965 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,85 +1,7 @@ #include "BrokenLineFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, - cudaStream_t stream) { - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // fit triplets - kernelBLFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<3><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - 
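The scheme of this patch is to keep one templated definition and let each translation unit instantiate only the flavour its toolchain can build: the .cu file, compiled by nvcc, emits the GPU instantiation, while the .cc file, seen only by the host compiler, emits the CPU one. The exact template arguments are not shown here, so the sketch below uses an assumed Traits tag:

    // sketch.h : single definition, visible to both toolchains
    #include <cuda_runtime.h>
    template <typename Traits>
    void launchSketch(cudaStream_t stream);  // body would dispatch through cudautils::launch

    // sketch.cc : host compiler only, instantiate the CPU flavour
    //   template void launchSketch<cudaCompat::CPUTraits>(cudaStream_t);

    // sketch.cu : nvcc only, instantiate the GPU flavour
    //   template void launchSketch<cudaCompat::GPUTraits>(cudaStream_t);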
cudaCheck(cudaGetLastError()); - - // fit quads - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); - cudaCheck(cudaGetLastError()); - - if (fit5as4_) { - // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); - } else { - // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<5><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); - } - - } // loop on concurrent fits -} + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 82a5bee443f88..aa7c1e680929e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -185,3 +185,84 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t #endif } } + + +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + int blockSize = 64; + int numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + cudautils::launch(kernelBLFastFit<3>,{numberOfBlocks, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelBLFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + + // fit quads + cudautils::launch(kernelBLFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelBLFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + + if (fit5as4_) { + // fit penta (only first 4) + 
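Throughout these files the conversion follows one fixed shape: the brace-initialised second argument of cudautils::launch carries the same four values as the triple chevrons it replaces. A self-contained sketch with a hypothetical kernel (when the code is built for the CPU, the wrapper is expected to reduce to a direct call):

    #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

    __global__ void scaleKernel(float* x, int n, float a) {  // hypothetical kernel
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        x[i] *= a;
    }

    void scale(float* x, int n, float a, cudaStream_t stream) {
      int blockSize = 128;
      int blocks = (n + blockSize - 1) / blockSize;
      // raw form:  scaleKernel<<<blocks, blockSize, 0, stream>>>(x, n, a);
      // wrapped:   {gridDim, blockDim, sharedMemBytes, stream} mirrors the chevrons
      cudautils::launch(scaleKernel, {blocks, blockSize, 0, stream}, x, n, a);
    }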
cudautils::launch(kernelBLFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + cudautils::launch(kernelBLFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } else { + // fit penta (all 5) + cudautils::launch(kernelBLFastFit<5>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + cudautils::launch(kernelBLFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } + + } // loop on concurrent fits +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 0140958aa9ef2..8213cc9c2af03 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -171,7 +171,7 @@ class CAHitNtupletGeneratorKernels { void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); - void allocateOnGPU(cudaStream_t stream); + void allocate(cudaStream_t stream); void cleanup(cudaStream_t cudaStream); static void printCounters(Counters const* counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index b91911c66924e..e688052f1faf0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -4,9 +4,9 @@ template <> #ifdef __CUDACC__ -void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::allocate(cudaStream_t stream) { #else -void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::allocate(cudaStream_t stream) { #endif ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 82ccb6b835d68..dac6faa5188b9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -75,7 +75,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hit NTupletGeneratorKernels kernels(m_params); kernels.counters_ = m_counters; - kernels.allocateOnGPU(stream); + kernels.allocate(stream); kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); @@ -87,7 +87,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hit // now fit HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocate(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { if constexpr (Traits::runOnDevice) { 
fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); @@ -95,11 +95,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hit fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); } } else { - if constexpr (Traits::runOnDevice) { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } kernels.classifyTuples(hits_d, soa, stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index becbd0a1a8540..050a5c83431ce 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,7 +1,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, +void HelixFitOnGPU::allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *helix_fit_results) { tuples_d = tuples; @@ -13,4 +13,4 @@ void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, assert(outputSoa_d); } -void HelixFitOnGPU::deallocateOnGPU() {} +void HelixFitOnGPU::deallocate() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 05b399e870f58..11ec0a663aed3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -41,17 +41,18 @@ class HelixFitOnGPU { using TupleMultiplicity = CAConstants::TupleMultiplicity; explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} - ~HelixFitOnGPU() { deallocateOnGPU(); } + ~HelixFitOnGPU() { deallocate(); } void setBField(double bField) { bField_ = bField; } void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + + template void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); - void deallocateOnGPU(); + void allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void deallocate(); private: static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); From 3a530f19a860dd567efd3103af9541e04302bf45 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 13:51:53 +0100 Subject: [PATCH 13/22] all ported but one --- .../plugins/CAHitNtupletGeneratorKernels.cu | 81 ++++++++----------- 1 file changed, 33 insertions(+), 48 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index aaf882633f17d..bf1461b2119c5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,13 +1,15 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + + template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; - auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + int numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; - kernel_fillHitDetIndices<<>>( + cudautils::launch(kernel_fillHitDetIndices,{numberOfBlocks, blockSize, 0, cudaStream}, &tracks_d->hitIndices, hv, &tracks_d->detIndices); - cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -36,7 +38,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto nthTot = 64; auto stride = 4; auto blockSize = nthTot / stride; - auto numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + int numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; auto rescale = numberOfBlocks / 65536; blockSize *= (rescale + 1); numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; @@ -45,7 +47,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - kernel_connect<<>>( + cudautils::launch(kernel_connect,{blks, thrs, 0, cudaStream}, device_hitTuple_apc_, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel hh.view(), @@ -59,23 +61,21 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * m_params.CAThetaCutForward_, m_params.dcaCutInnerTriplet_, m_params.dcaCutOuterTriplet_); - cudaCheck(cudaGetLastError()); if (nhits > 1 && m_params.earlyFishbone_) { auto nthTot = 128; auto stride = 16; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + int blockSize = nthTot / stride; + int numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - cudaCheck(cudaGetLastError()); } blockSize = 64; numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), + cudautils::launch(kernel_find_ntuplets,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_, device_theCellTracks_, @@ -83,35 +83,32 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, quality_d, m_params.minHitsPerNtuplet_); - cudaCheck(cudaGetLastError()); if (m_params.doStats_) - kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); - cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_mark_used,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - blockSize = 128; numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; +// cudautils::launch(cudautils::finalizeBulk,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); 
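The rescaling kept in launchKernels above deserves a remark: these launches place the block count in gridDim.y, which CUDA caps at 65535, so the block size is inflated until the count fits. The same arithmetic, factored into a standalone helper for illustration (the helper itself is not CMSSW code):

    #include <cassert>
    #include <utility>

    inline std::pair<int, int> fitGridY(int workItems, int blockSize) {
      int numberOfBlocks = (workItems + blockSize - 1) / blockSize;  // ceil division
      int rescale = numberOfBlocks / 65536;
      blockSize *= (rescale + 1);                                    // grow the blocks instead
      numberOfBlocks = (workItems + blockSize - 1) / blockSize;
      assert(numberOfBlocks < 65536);
      return {numberOfBlocks, blockSize};
    }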
cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_earlyDuplicateRemover<<>>( + cudautils::launch(kernel_earlyDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, device_theCells_.get(), device_nCells_, tuples_d, quality_d); - cudaCheck(cudaGetLastError()); blockSize = 128; numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( + cudautils::launch(kernel_countMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, tuples_d, quality_d, device_tupleMultiplicity_.get()); cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - kernel_fillMultiplicity<<>>( + cudautils::launch(kernel_fillMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudaCheck(cudaGetLastError()); if (nhits > 1 && m_params.lateFishbone_) { auto nthTot = 128; @@ -120,14 +117,13 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - cudaCheck(cudaGetLastError()); } if (m_params.doStats_) { numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, + cudautils::launch(kernel_checkOverflows,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, device_tupleMultiplicity_.get(), device_hitTuple_apc_, device_theCells_.get(), @@ -138,7 +134,6 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * nhits, m_params.maxNumberOfDoublets_, counters_); - cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -166,13 +161,12 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr int threadsPerBlock = 128; // at least one block! 
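Note that the explicit cudaCheck(cudaGetLastError()) after every launch disappears in this conversion, presumably because the wrapper performs its own status check. With a raw chevron launch the check has to stay next to the call, as in this generic sketch (hypothetical kernel, not CMSSW code):

    #include <cstdio>

    __global__ void pingKernel(int* out) { *out = 1; }

    void launchAndCheck(int* d_out, cudaStream_t stream) {
      pingKernel<<<1, 1, 0, stream>>>(d_out);
      cudaError_t err = cudaGetLastError();  // reports bad launch configurations
      if (err != cudaSuccess)
        printf("launch failed: %s\n", cudaGetErrorString(err));
    }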
int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; - gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), + cudautils::launch(gpuPixelDoublets::initDoublets,{blocks, threadsPerBlock, 0, stream},device_isOuterHitOfCell_.get(), nhits, device_theCellNeighbors_, device_theCellNeighborsContainer_.get(), device_theCellTracks_, device_theCellTracksContainer_.get()); - cudaCheck(cudaGetLastError()); } device_theCells_ = cudautils::make_device_unique(m_params.maxNumberOfDoublets_, stream); @@ -199,7 +193,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; dim3 blks(1, blocks, 1); dim3 thrs(stride, threadsPerBlock, 1); - gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), + cudautils::launch(gpuPixelDoublets::getDoubletsFromHisto,{blks, thrs, 0, stream},device_theCells_.get(), device_nCells_, device_theCellNeighbors_, device_theCellTracks_, @@ -211,7 +205,6 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr m_params.doZ0Cut_, m_params.doPtCut_, m_params.maxNumberOfDoublets_); - cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -225,54 +218,46 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = (Quality *)(&tracks_d->m_quality); - auto blockSize = 64; + int blockSize = 64; // classify tracks based on kinematics - auto numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_classifyTracks<<>>(tuples_d, tracks_d, m_params.cuts_, quality_d); - cudaCheck(cudaGetLastError()); + int numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_classifyTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, tracks_d, m_params.cuts_, quality_d); if (m_params.lateFishbone_) { // apply fishbone cleaning to good tracks numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_fishboneCleaner<<>>( + cudautils::launch(kernel_fishboneCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, device_theCells_.get(), device_nCells_, quality_d); - cudaCheck(cudaGetLastError()); } // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_fastDuplicateRemover<<>>( + cudautils::launch(kernel_fastDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - cudaCheck(cudaGetLastError()); if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { // fill hit->track "map" numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_countHitInTracks<<>>( + cudautils::launch(kernel_countHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream}, tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_fillHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, device_hitToTuple_.get()); } if (m_params.minHitsPerNtuplet_ < 4) { // remove duplicates (tracks that share a hit) numberOfBlocks = (HitToTuple::capacity() + 
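Several of these kernels use a two-dimensional configuration, dim3 blks(1, blocks, 1) against dim3 thrs(stride, threadsPerBlock, 1): the y components enumerate the work items while the small x extent gives each item a few cooperating lanes. A hypothetical skeleton of how such a launch is typically consumed; the indexing in the real kernels may differ:

    __global__ void stridedKernel(int nItems) {
      int item = blockIdx.y * blockDim.y + threadIdx.y;  // outer work item (y)
      int lane = threadIdx.x;                            // inner lane, 0 .. blockDim.x-1
      if (item >= nItems)
        return;
      for (int j = lane; j < 64; j += blockDim.x) {
        // the blockDim.x lanes share this item's inner work
      }
    }

    // launched as:  dim3 blks(1, (nItems + T - 1) / T, 1);  dim3 thrs(S, T, 1);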
blockSize - 1) / blockSize; - kernel_tripletCleaner<<>>( + cudautils::launch(kernel_tripletCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); } if (m_params.doStats_) { // counters (add flag???) numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); - cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_doStatsForHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},device_hitToTuple_.get(), counters_); numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); - cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_doStatsForTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, counters_); } #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -282,12 +267,12 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA #ifdef DUMP_GPU_TK_TUPLES static std::atomic iev(0); ++iev; - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( + cudautils::launch(kernel_print_found_ntuplets,{1, 32, 0, cudaStream}, hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); #endif } template <> void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { - kernel_printCounters<<<1, 1>>>(counters); + cudautils::launch(kernel_printCounters,{1, 1},counters); } From 324667ee67e349e6fd7729c7bdbddd930635017a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 14:31:26 +0100 Subject: [PATCH 14/22] compiles --- .../plugins/CAHitNtupletGeneratorKernels.cc | 173 +---------- .../plugins/CAHitNtupletGeneratorKernels.cu | 281 +---------------- .../CAHitNtupletGeneratorKernelsImpl.h | 284 ++++++++++++++++++ 3 files changed, 304 insertions(+), 434 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index b4cd1f8aa1abc..03a8b927289b8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -1,170 +1,17 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template <> -void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { - kernel_printCounters(counters); -} -template <> -void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) { - kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); -} +template +void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream); -template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); +template +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); -#ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; -#endif +template +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream); - // in principle we can use "nhits" to heuristically dimension the workspace... 
- device_isOuterHitOfCell_ = Traits:: template make_unique(std::max(1U,nhits),stream); - assert(device_isOuterHitOfCell_.get()); - gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); +template +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - device_theCells_ = Traits:: template make_unique(m_params.maxNumberOfDoublets_,stream); - if (0 == nhits) - return; // protect against empty events - - // FIXME avoid magic numbers - auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - assert(tuples_d && quality_d); - - // zero tuples - cudautils::launchZero(tuples_d, cudaStream); - - auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); - - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; - - // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive - // - - kernel_connect(device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - - if (nhits > 1 && m_params.earlyFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - } - - kernel_find_ntuplets(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_, - tuples_d, - device_hitTuple_apc_, - quality_d, - m_params.minHitsPerNtuplet_); - if (m_params.doStats_) - kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); - - cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); - - // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); - - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - - if (nhits > 1 && m_params.lateFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - } - - if (m_params.doStats_) { - kernel_checkOverflows(tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - 
device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - } -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, m_params.cuts_, quality_d); - - if (m_params.lateFishbone_) { - // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); - } - - // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - - // fill hit->track "map" - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - - // remove duplicates (tracks that share a hit) - kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - - if (m_params.doStats_) { - // counters (add flag???) - kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); - } - -#ifdef DUMP_GPU_TK_TUPLES - static std::atomic iev(0); - ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); -#endif -} +template +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index bf1461b2119c5..8939acff2ebc2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,278 +1,17 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +template +void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream); -template <> -void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto blockSize = 128; - int numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; +template +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - cudautils::launch(kernel_fillHitDetIndices,{numberOfBlocks, blockSize, 0, cudaStream}, - &tracks_d->hitIndices, hv, &tracks_d->detIndices); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} +template +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream); -template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - // these are pointer on GPU! 
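The change in these two files is easy to misread: the old code carried full specializations, template <> followed by a body repeated for each backend, while the new code keeps one definition in CAHitNtupletGeneratorKernelsImpl.h and merely asks each translation unit to instantiate it. In reduced form, with assumed names:

    // old: full specialization, the body is written out here per backend
    //   template <>
    //   void Kernels<CPUTraits>::buildDoublets(Hits const& hh, cudaStream_t s) { /* CPU body */ }

    // new: explicit instantiation, the single body from the Impl header is reused;
    // this line only forces this translation unit to emit that instantiation
    //   template
    //   void Kernels<CPUTraits>::buildDoublets(Hits const& hh, cudaStream_t s);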
- auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); +template +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - // zero tuples - cudautils::launchZero(tuples_d, cudaStream); - - auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); - - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; - - // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive - // - - auto nthTot = 64; - auto stride = 4; - auto blockSize = nthTot / stride; - int numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - auto rescale = numberOfBlocks / 65536; - blockSize *= (rescale + 1); - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - assert(numberOfBlocks < 65536); - assert(blockSize > 0 && 0 == blockSize % 16); - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - - cudautils::launch(kernel_connect,{blks, thrs, 0, cudaStream}, - device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - - if (nhits > 1 && m_params.earlyFishbone_) { - auto nthTot = 128; - auto stride = 16; - int blockSize = nthTot / stride; - int numberOfBlocks = (nhits + blockSize - 1) / blockSize; - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - } - - blockSize = 64; - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_find_ntuplets,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_, - tuples_d, - device_hitTuple_apc_, - quality_d, - m_params.minHitsPerNtuplet_); - - if (m_params.doStats_) - cudautils::launch(kernel_mark_used,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - blockSize = 128; - numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; -// cudautils::launch(cudautils::finalizeBulk,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); - cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); - - - // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_earlyDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, - device_theCells_.get(), device_nCells_, tuples_d, quality_d); - - blockSize = 128; - numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_countMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - cudautils::launch(kernel_fillMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, - 
tuples_d, quality_d, device_tupleMultiplicity_.get()); - - if (nhits > 1 && m_params.lateFishbone_) { - auto nthTot = 128; - auto stride = 16; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - } - - if (m_params.doStats_) { - numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; - cudautils::launch(kernel_checkOverflows,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - } -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); - -#ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; -#endif - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cudautils::make_device_unique(std::max(1U, nhits), stream); - assert(device_isOuterHitOfCell_.get()); - { - int threadsPerBlock = 128; - // at least one block! - int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; - cudautils::launch(gpuPixelDoublets::initDoublets,{blocks, threadsPerBlock, 0, stream},device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); - } - - device_theCells_ = cudautils::make_device_unique(m_params.maxNumberOfDoublets_, stream); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - if (0 == nhits) - return; // protect against empty events - - // FIXME avoid magic numbers - auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - int stride = 4; - int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; - int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; - dim3 blks(1, blocks, 1); - dim3 thrs(stride, threadsPerBlock, 1); - cudautils::launch(gpuPixelDoublets::getDoubletsFromHisto,{blks, thrs, 0, stream},device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - // these are pointer on GPU! 
- auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - int blockSize = 64; - - // classify tracks based on kinematics - int numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_classifyTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, tracks_d, m_params.cuts_, quality_d); - - if (m_params.lateFishbone_) { - // apply fishbone cleaning to good tracks - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_fishboneCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, - device_theCells_.get(), device_nCells_, quality_d); - } - - // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_fastDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, - device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - - if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { - // fill hit->track "map" - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_countHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream}, - tuples_d, quality_d, device_hitToTuple_.get()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); - cudaCheck(cudaGetLastError()); - cudautils::launch(kernel_fillHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, device_hitToTuple_.get()); - } - if (m_params.minHitsPerNtuplet_ < 4) { - // remove duplicates (tracks that share a hit) - numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - cudautils::launch(kernel_tripletCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - } - - if (m_params.doStats_) { - // counters (add flag???) 
- numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - cudautils::launch(kernel_doStatsForHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},device_hitToTuple_.get(), counters_); - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_doStatsForTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, counters_); - } -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - -#ifdef DUMP_GPU_TK_TUPLES - static std::atomic iev(0); - ++iev; - cudautils::launch(kernel_print_found_ntuplets,{1, 32, 0, cudaStream}, - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { - cudautils::launch(kernel_printCounters,{1, 1},counters); -} +template +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 6888e6725bc39..02e5de7f112ae 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -601,3 +601,287 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun c.nEmptyCells / double(c.nCells), c.nZeroTrackCells / double(c.nCells)); } + + + +// launchers + +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + + +template +void CAHitNtupletGeneratorKernels::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto blockSize = 128; + int numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + + cudautils::launch(kernel_fillHitDetIndices,{numberOfBlocks, blockSize, 0, cudaStream}, + &tracks_d->hitIndices, hv, &tracks_d->detIndices); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! 
+ auto *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + // zero tuples + cudautils::launchZero(tuples_d, cudaStream); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + auto nthTot = 64; + auto stride = 4; + auto blockSize = nthTot / stride; + int numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + auto rescale = numberOfBlocks / 65536; + blockSize *= (rescale + 1); + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + assert(numberOfBlocks < 65536); + assert(blockSize > 0 && 0 == blockSize % 16); + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + + cudautils::launch(kernel_connect,{blks, thrs, 0, cudaStream}, + device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + + if (nhits > 1 && m_params.earlyFishbone_) { + auto nthTot = 128; + auto stride = 16; + int blockSize = nthTot / stride; + int numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + } + + blockSize = 64; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_find_ntuplets,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_, + tuples_d, + device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); + + if (m_params.doStats_) + cudautils::launch(kernel_mark_used,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + blockSize = 128; + numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; +// cudautils::launch(cudautils::finalizeBulk,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); +#ifdef __CUDACC__ + cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); +#else + cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); +#endif + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_earlyDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, tuples_d, quality_d); + + blockSize = 128; + numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_countMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cudautils::launch(kernel_fillMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, 
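finalizeBulk is the one kernel left on a raw chevron launch; the commented-out wrapper call above suggests a direct port was tried and set aside. A reconstruction of the guarded launch, with the chevron arguments taken from that commented-out line: under nvcc it is a real device launch, otherwise the function is simply called on the host.

    #ifdef __CUDACC__
      cudautils::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_, tuples_d);
    #else
      cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d);
    #endif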
device_tupleMultiplicity_.get()); + + if (nhits > 1 && m_params.lateFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + } + + if (m_params.doStats_) { + numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + cudautils::launch(kernel_checkOverflows,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + device_isOuterHitOfCell_.get(), + nhits, + m_params.maxNumberOfDoublets_, + counters_); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + device_isOuterHitOfCell_ = TTraits:: template make_unique(std::max(1U, nhits), stream); + assert(device_isOuterHitOfCell_.get()); + { + int threadsPerBlock = 128; + // at least one block! + int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; + cudautils::launch(gpuPixelDoublets::initDoublets,{blocks, threadsPerBlock, 0, stream},device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_, + device_theCellNeighborsContainer_.get(), + device_theCellTracks_, + device_theCellTracksContainer_.get()); + } + + device_theCells_ = TTraits:: template make_unique(m_params.maxNumberOfDoublets_, stream); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (m_params.minHitsPerNtuplet_ > 3) { + nActualPairs = 13; + } + + assert(nActualPairs <= gpuPixelDoublets::nPairs); + int stride = 4; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; + int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; + dim3 blks(1, blocks, 1); + dim3 thrs(stride, threadsPerBlock, 1); + cudautils::launch(gpuPixelDoublets::getDoubletsFromHisto,{blks, thrs, 0, stream},device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZ0Cut_, + m_params.doPtCut_, + m_params.maxNumberOfDoublets_); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! 
+ auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + int blockSize = 64; + + // classify tracks based on kinematics + int numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_classifyTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, tracks_d, m_params.cuts_, quality_d); + + if (m_params.lateFishbone_) { + // apply fishbone cleaning to good tracks + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_fishboneCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, quality_d); + } + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_fastDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + + if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { + // fill hit->track "map" + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_countHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_hitToTuple_.get()); + cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_fillHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, device_hitToTuple_.get()); + } + if (m_params.minHitsPerNtuplet_ < 4) { + // remove duplicates (tracks that share a hit) + numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + cudautils::launch(kernel_tripletCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + } + + if (m_params.doStats_) { + // counters (add flag???) 
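+    // fill the monitoring counters from the hit->track map and from the tuple/quality arrays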
+ numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + cudautils::launch(kernel_doStatsForHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},device_hitToTuple_.get(), counters_); + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_doStatsForTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, counters_); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + cudautils::launch(kernel_print_found_ntuplets,{1, 32, 0, cudaStream}, + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); +#endif +} + +template +void CAHitNtupletGeneratorKernels::printCounters(Counters const *counters) { + cudautils::launch(kernel_printCounters,{1, 1,0, cudaStreamDefault},counters); +} From 0740845c25a07ee0818e278179622692542c630a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 15:51:25 +0100 Subject: [PATCH 15/22] fully ported to launch and traits --- .../plugins/CAHitNtupletGeneratorOnGPU.h | 6 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 4 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cc | 113 +-------------- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 130 +---------------- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 131 ++++++++++++++++++ 5 files changed, 142 insertions(+), 242 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index dac6faa5188b9..6e92c1b217cfb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -89,11 +89,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hit HelixFitOnGPU fitter(bfield, m_params.fit5as4_); fitter.allocate(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - if constexpr (Traits::runOnDevice) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - }else { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } else { fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 11ec0a663aed3..cf877dbd11363 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -44,13 +44,13 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocate(); } void setBField(double bField) { bField_ = bField; } + + template void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); template void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); void deallocate(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc 
b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index 3476362864a79..f77f9a4fb9a41 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,110 +1,7 @@ #include "RiemannFitOnGPU.h" -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { - assert(tuples_d); - - // Fit internals - auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - auto circle_fit_resultsGPU_holder = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // triplets - kernelFastFit<3>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<3>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<3>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - // quads - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<4>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<4>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - if (fit5as4_) { - // penta - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<4>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<4>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - } else { - // penta all 5 - kernelFastFit<5>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<5>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<5>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - } - } -} +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index cb5d32b47aea3..011b7842f9681 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ 
-1,131 +1,7 @@ #include "RiemannFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, - cudaStream_t stream) { - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); - auto circle_fit_resultsGPU_holder = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // triplets - kernelFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - // quads - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - if (fit5as4_) { - // penta - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - } else { - // penta all 5 - kernelFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - 
fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - } - } -} + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index b46de519034d5..c6345ba6eab3b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -187,3 +187,134 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ #endif } } +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + int blockSize = 64; + int numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto circle_fit_resultsGPU_holder = + Traits:: template make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); + Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets + cudautils::launch(kernelFastFit<3>,{numberOfBlocks, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + 3, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + // quads + cudautils::launch(kernelFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 4, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // penta + cudautils::launch(kernelFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, 
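+                        // nHits is 5 here, but with fit5as4_ the <4> kernels are used: 5-hit tuples are fitted as quadruplets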
+ bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } else { + // penta all 5 + cudautils::launch(kernelFastFit<5>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } + } +} From 9391df8a78717754e24aa2cd064b55f431dd0717 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 18:08:40 +0100 Subject: [PATCH 16/22] fix the decltype --- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 02e5de7f112ae..19df8f5e249a2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -699,12 +699,7 @@ void CAHitNtupletGeneratorKernels::launchKernels(HitsOnCPU const &hh, T #endif blockSize = 128; numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; -// cudautils::launch(cudautils::finalizeBulk,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); -#ifdef __CUDACC__ - cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); -#else - cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); -#endif + cudautils::launch(cudautils::finalizeBulk::type>,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; From 70d36561147f4f746a79ef4d16486beb264cf3b8 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 18:18:56 +0100 Subject: [PATCH 17/22] revert to nullptr --- HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 6fbaced1858dd..5f29ad3b81ea8 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -72,7 +72,7 @@ namespace cudautils { inline void launchFinalize(Histo *__restrict__ h, uint8_t *__restrict__ ws #ifndef __CUDACC__ - = cudaStreamDefault + = nullptr #endif , cudaStream_t stream From 51f5342fb55994784b85ba89253ab6ac3dff850c Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 19:15:45 +0100 Subject: [PATCH 
18/22] migrate to malloc --- CUDADataFormats/Common/interface/HostProduct.h | 7 ++++--- .../SiPixelCluster/interface/SiPixelClustersCUDA.h | 1 - CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc | 1 - .../SiPixelRecHits/plugins/SiPixelRecHitConverter.cc | 2 +- .../SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc | 2 +- .../SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc | 2 +- 6 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h index 17ad98ba403a4..0a706b6c57927 100644 --- a/CUDADataFormats/Common/interface/HostProduct.h +++ b/CUDADataFormats/Common/interface/HostProduct.h @@ -2,6 +2,7 @@ #define CUDADataFormatsCommonHostProduct_H #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" // a heterogeneous unique pointer... template @@ -13,9 +14,9 @@ class HostProduct { HostProduct& operator=(HostProduct&&) = default; explicit HostProduct(cudautils::host::unique_ptr&& p) : hm_ptr(std::move(p)) {} - explicit HostProduct(std::unique_ptr&& p) : std_ptr(std::move(p)) {} + explicit HostProduct(cudautils::cpu::unique_ptr&& p) : cm_ptr(std::move(p)) {} - auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); } + auto const* get() const { return hm_ptr ? hm_ptr.get() : cm_ptr.get(); } auto const& operator*() const { return *get(); } @@ -23,7 +24,7 @@ class HostProduct { private: cudautils::host::unique_ptr hm_ptr; //! - std::unique_ptr std_ptr; //! + cudautils::cpu::unique_ptr cm_ptr; //! }; #endif diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index d3650e164d44e..afb7c5aa37833 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -2,7 +2,6 @@ #define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc index e6f223bfec4e3..60d2693fa45b0 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc @@ -1,7 +1,6 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" template <> diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc index 945bbb28a3262..608c65eff83d1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc @@ -105,7 +105,7 @@ namespace cms { const edmNew::DetSetVector& input = *inputhandle; // yes a unique ptr of a unique ptr so edm is happy and the pointer stay still... 
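+  // use cudautils::make_cpu_unique (plain malloc/free) so the buffer matches the cpu::unique_ptr branch of HostProduct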
- auto hmsp = std::make_unique(gpuClustering::MaxNumModules + 1); + auto hmsp = cudautils::make_cpu_unique(gpuClustering::MaxNumModules + 1,cudaStreamDefault); auto hitsModuleStart = hmsp.get(); auto hms = std::make_unique(std::move(hmsp)); // hmsp is gone iEvent.put(std::move(hms)); // hms is gone! hitsModuleStart still alive and kicking... diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc index a4f19ac276a7a..7e840e1b266e1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc @@ -81,7 +81,7 @@ void SiPixelRecHitFromSOA::acquire(edm::Event const& iEvent, void SiPixelRecHitFromSOA::produce(edm::Event& iEvent, edm::EventSetup const& es) { // yes a unique ptr of a unique ptr so edm is happy auto sizeOfHitModuleStart = gpuClustering::MaxNumModules + 1; - auto hmsp = std::make_unique(sizeOfHitModuleStart); + auto hmsp = cudautils::make_cpu_unique(sizeOfHitModuleStart,cudaStreamDefault); std::copy(m_hitsModuleStart.get(), m_hitsModuleStart.get() + sizeOfHitModuleStart, hmsp.get()); auto hms = std::make_unique(std::move(hmsp)); // hmsp is gone iEvent.put(std::move(hms)); // hms is gone! diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index 7900cf8b2289a..d286583cbc822 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -103,7 +103,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv auto const& input = *hclusters; // yes a unique ptr of a unique ptr so edm is happy and the pointer stay still... - auto hmsp = std::make_unique(gpuClustering::MaxNumModules + 1); + auto hmsp = cudautils::make_cpu_unique(gpuClustering::MaxNumModules + 1,cudaStreamDefault); auto hitsModuleStart = hmsp.get(); auto hms = std::make_unique(std::move(hmsp)); // hmsp is gone iEvent.put(tokenModuleStart_, std::move(hms)); // hms is gone! hitsModuleStart still alive and kicking... From 04465cf8329d0e6f4e2f87638b1497f87602e00c Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 8 Dec 2019 11:15:57 +0100 Subject: [PATCH 19/22] add memset for cpu as well (just for symmetry) --- .../CUDAUtilities/interface/memsetAsync.h | 13 +++++++++++++ .../CUDAUtilities/test/cpu_unique_ptr_t.cpp | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index 203aac78a165c..819db79d62497 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -3,8 +3,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" #include +#include namespace cudautils { template @@ -15,6 +17,11 @@ namespace cudautils { cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); } + template + inline void memsetAsync(cudautils::cpu::unique_ptr& ptr, T value, cudaStream_t) { + ::memset(ptr.get(), value, sizeof(T)); + } + /** * The type of `value` is `int` because of `cudaMemsetAsync()` takes * it as an `int`. 
Note that `cudaMemsetAsync()` sets the value of @@ -25,6 +32,12 @@ namespace cudautils { inline void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cudaStream_t stream) { cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream)); } + template + inline void memsetAsync(cudautils::cpu::unique_ptr& ptr, int value, size_t nelements, cudaStream_t) { + ::memset(ptr.get(), value, nelements * sizeof(T)); + } + + } // namespace cudautils #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp index 828b7912ed6c4..4a293c6760b25 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -1,12 +1,16 @@ #include "catch.hpp" #include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Single element") { auto ptr = cudautils::make_cpu_unique(cudaStreamDefault); REQUIRE(ptr != nullptr); + *ptr = 1; + cudautils::memsetAsync(ptr,0,cudaStreamDefault); + REQUIRE(0==*ptr); } SECTION("Reset") { @@ -20,6 +24,10 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Multiple elements") { auto ptr = cudautils::make_cpu_unique(10,cudaStreamDefault); REQUIRE(ptr != nullptr); + for (int i=0; i<10; ++i) ptr[i]=1; + cudautils::memsetAsync(ptr,0,10,cudaStreamDefault); + int s=0; for (int i=0; i<10; ++i) s+=ptr[i]; + REQUIRE(0==s); } SECTION("Allocating too much") { From 36b834324edd0eb850c05c5d49f2b2c4e89855f3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 8 Dec 2019 11:19:46 +0100 Subject: [PATCH 20/22] oops forgot to compile test --- HeterogeneousCore/CUDAUtilities/test/BuildFile.xml | 2 +- HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 893d1ab1745eb..88a12c789ed8f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -77,7 +77,7 @@ - + diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp index 4a293c6760b25..e945464c26b74 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -33,7 +33,6 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Allocating too much") { constexpr size_t maxSize = 1 << 30; // 8**10 auto ptr = cudautils::make_cpu_unique(maxSize+1,cudaStreamDefault); - ptr.reset(); REQUIRE(ptr != nullptr); } } From 6928c3953e7e1269cb66d1e4a4004b36873d1041 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 8 Dec 2019 12:31:42 +0100 Subject: [PATCH 21/22] almost single source! 
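
The test analyzer now shares a single templated implementation between the CPU
and the GPU builds: the __global__ kernel and the templated launcher live in one
header, and each backend translation unit only provides an explicit instantiation
for its Traits type, while the kernel itself works on the common SoA view and
needs no template at all. A minimal sketch of the pattern (the Traits type names
below are illustrative placeholders, not the real class names):

    // RecHitSoATest.h : kernel + definition of the templated launcher (shown here only as a signature)
    template <typename Traits>
    void analyzeImpl(TrackingRecHit2DHeterogeneous<Traits> const& hits, cudaStream_t stream);

    // RecHitSoATest.cu : compiled by nvcc, instantiates the GPU flavour
    template void analyzeImpl<GPUTraits>(TrackingRecHit2DHeterogeneous<GPUTraits> const&, cudaStream_t);

    // RecHitSoATest.cc : compiled by the host compiler, cudaCompat maps the kernel onto the CPU
    template void analyzeImpl<CPUTraits>(TrackingRecHit2DHeterogeneous<CPUTraits> const&, cudaStream_t);
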
--- .../SiPixelRecHits/test/BuildFile.xml | 9 ++ .../SiPixelRecHits/test/RecHitSoATest.cc | 87 +++++++++++++++++++ .../SiPixelRecHits/test/RecHitSoATest.cu | 4 + .../SiPixelRecHits/test/RecHitSoATest.h | 55 ++++++++++++ 4 files changed, 155 insertions(+) create mode 100644 RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc create mode 100644 RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu create mode 100644 RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h diff --git a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml index d8f6b2a35880a..2c2544390f401 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml @@ -21,3 +21,12 @@ + + + + + + + + + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc new file mode 100644 index 0000000000000..9970e9c9f83fb --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc @@ -0,0 +1,87 @@ +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & ghits, cudaStream_t stream); + +/* +TrackingRecHit2DCUDA and TrackingRecHit2DCPU are NOT the same type (for tracks and vertices are the same type) +they are templated with the (GPU/CPU)Traits so in principle the whole Analyzer can be (partially) templated as well +they return the same view type though (so the real analyzer algo in the header file above does not need to be templated) +*/ + +class RecHitSoATest : public edm::global::EDAnalyzer<> { +public: + + explicit RecHitSoATest(const edm::ParameterSet& iConfig); + ~RecHitSoATest() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; + const bool m_onGPU; + edm::EDGetTokenT> tGpuHits; + edm::EDGetTokenT tCpuHits; +}; + +RecHitSoATest::RecHitSoATest(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { + if (m_onGPU) { + tGpuHits = + consumes>(iConfig.getParameter("heterogeneousPixelRecHitSrc")); + } else { + tCpuHits = + consumes(iConfig.getParameter("heterogeneousPixelRecHitSrc")); + } +} + + +/* + most of (all?) the code below is boiler-plate. we hope in future to fully wrap it so that a single (templated?) 
source will work for both gpu and cpu +*/ +void RecHitSoATest::analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const { + if (m_onGPU) { + auto const & gh = iEvent.get(tGpuHits); + CUDAScopedContextProduce ctx{gh}; + + analyzeImpl(ctx.get(gh),ctx.stream()); + + } else { + analyzeImpl(iEvent.get(tCpuHits),cudaStreamDefault); + } +} + +void RecHitSoATest::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("onGPU", true); + desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + descriptions.add("RecHitSoATest", desc); +} + + +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & ghits, cudaStream_t stream); + + +DEFINE_FWK_MODULE(RecHitSoATest); diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu new file mode 100644 index 0000000000000..0553db894f398 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu @@ -0,0 +1,4 @@ +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream); + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h new file mode 100644 index 0000000000000..5fd779e29a21c --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h @@ -0,0 +1,55 @@ +#include + +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" + +__global__ +void analyzeHits(TrackingRecHit2DSOAView const* hhp, uint32_t nhits, double * ws) { + + + auto const& hh = *hhp; + auto first = blockIdx.x * blockDim.x + threadIdx.x; + + if (0 == first) + printf("in analyzeClusterTP\n"); + + __shared__ int nh; + __shared__ double sch; + + nh=0; sch=0; + __syncthreads(); + + for (int i = first, ni=nhits; i < ni; i += gridDim.x * blockDim.x) { + if (hh.charge(i) <=0) printf("a hit with zero charge?\n"); + atomicAdd(&nh,1); + atomicAdd(&sch,(double)(hh.charge(i))); + } + + __syncthreads(); + + if(0==threadIdx.x) { + atomicAdd(&ws[0],nh); + atomicAdd(&ws[1],sch); + atomicAdd(&ws[9],1); + if (gridDim.x==ws[9]) { + printf("in event : found %f hits, average charge is %f\n",ws[0], ws[1]/ws[0]); + } + } + +} + +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream) { + auto nhits = gHits.nHits(); + + if (0 == nhits) return; + + auto ws_d = Traits:: template make_unique(10,stream); + cudautils::memsetAsync(ws_d, 0, 10, stream); + int threadsPerBlock = 256; + int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; + cudautils::launch(analyzeHits,{threadsPerBlock,blocks,0,stream} , gHits.view(), nhits, ws_d.get()); + +} From d685dfea9e8c0e137a24e1bc9252d7e2e4aa7459 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 8 Dec 2019 12:55:53 +0100 Subject: [PATCH 22/22] no way, works only with three files --- .../SiPixelRecHits/test/BuildFile.xml | 2 +- .../SiPixelRecHits/test/RecHitSoATest.cc | 9 --------- .../SiPixelRecHits/test/RecHitSoATest.h | 20 +++++++++++++++++-- .../SiPixelRecHits/test/RecHitSoATestCPU.cc | 4 ++++ .../{RecHitSoATest.cu => RecHitSoATestGPU.cu} | 0 5 files changed, 23 insertions(+), 12 deletions(-) create mode 100644 
RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc rename RecoLocalTracker/SiPixelRecHits/test/{RecHitSoATest.cu => RecHitSoATestGPU.cu} (100%) diff --git a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml index 2c2544390f401..bae77ee5d8fcf 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml @@ -23,7 +23,7 @@ - + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc index 9970e9c9f83fb..3b2cb0a921e19 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc @@ -56,9 +56,6 @@ RecHitSoATest::RecHitSoATest(const edm::ParameterSet& iConfig) : m_onGPU(iConfig } -/* - most of (all?) the code below is boiler-plate. we hope in future to fully wrap it so that a single (templated?) source will work for both gpu and cpu -*/ void RecHitSoATest::analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const { if (m_onGPU) { auto const & gh = iEvent.get(tGpuHits); @@ -78,10 +75,4 @@ void RecHitSoATest::fillDescriptions(edm::ConfigurationDescriptions& description descriptions.add("RecHitSoATest", desc); } - -#include "RecHitSoATest.h" -template -void analyzeImpl(TrackingRecHit2DHeterogeneous const & ghits, cudaStream_t stream); - - DEFINE_FWK_MODULE(RecHitSoATest); diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h index 5fd779e29a21c..1032501abe3fc 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h @@ -8,12 +8,21 @@ __global__ void analyzeHits(TrackingRecHit2DSOAView const* hhp, uint32_t nhits, double * ws) { + assert(hhp); + assert(ws); auto const& hh = *hhp; auto first = blockIdx.x * blockDim.x + threadIdx.x; +#ifdef CUDA_KERNELS_ON_CPU + std::cout << "kernel ON CPU!" << std::endl; +#else + if (0 == first) printf("kernel ON GPU!\n"); +#endif + + if (0 == first) - printf("in analyzeClusterTP\n"); + printf("in analyzeHits SoA\n"); __shared__ int nh; __shared__ double sch; @@ -42,6 +51,14 @@ void analyzeHits(TrackingRecHit2DSOAView const* hhp, uint32_t nhits, double * ws template void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream) { +#ifdef CUDA_KERNELS_ON_CPU + std::cout << "launching ON CPU!" 
<< std::endl; + assert(!Traits::runOnDevice); +#else + printf("launching ON GPU!\n"); + assert(Traits::runOnDevice); +#endif + auto nhits = gHits.nHits(); if (0 == nhits) return; @@ -51,5 +68,4 @@ void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream int threadsPerBlock = 256; int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; cudautils::launch(analyzeHits,{threadsPerBlock,blocks,0,stream} , gHits.view(), nhits, ws_d.get()); - } diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc new file mode 100644 index 0000000000000..aa9ab7168b6d3 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc @@ -0,0 +1,4 @@ +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream); + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu similarity index 100% rename from RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu rename to RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu