From f761369a6d9195748dab2c68202456a41edd1578 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 12:43:13 +0100 Subject: [PATCH 01/22] more cudacompat --- .../CUDAUtilities/interface/cudaCompat.h | 6 +++- .../CUDAUtilities/interface/launch.h | 8 +++++ .../CUDAUtilities/test/BuildFile.xml | 7 +++++ .../CUDAUtilities/test/Launch_t.cpp | 31 +++++++++++++++++++ .../CUDAUtilities/test/Launch_t.cu | 10 ++++++ 5 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/Launch_t.cu diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 32c7d7b481a50..602e876a12e15 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -6,6 +6,10 @@ */ #ifndef __CUDACC__ + #define CUDA_KERNELS_ON_CPU +#endif + +#ifdef CUDA_KERNELS_ON_CPU #include #include @@ -98,6 +102,6 @@ namespace cudaCompat { using namespace cudaCompat; #endif -#endif +#endif // CUDA_KERNELS_ON_CPU #endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/launch.h b/HeterogeneousCore/CUDAUtilities/interface/launch.h index 24282d690005e..a624f0262c34d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/launch.h +++ b/HeterogeneousCore/CUDAUtilities/interface/launch.h @@ -96,8 +96,12 @@ namespace cudautils { // wrappers for cudaLaunchKernel void launch(void (*kernel)(), LaunchParameters config) { +#ifdef CUDA_KERNELS_ON_CPU + kernel(); +#else cudaCheck(cudaLaunchKernel( (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); +#endif } template @@ -107,6 +111,9 @@ namespace cudautils { std::enable_if_t >::value> #endif launch(F* kernel, LaunchParameters config, Args&&... 
args) { +#ifdef CUDA_KERNELS_ON_CPU + kernel(args...); +#else using function_type = detail::kernel_traits; typename function_type::argument_type_tuple args_copy(args...); @@ -116,6 +123,7 @@ namespace cudautils { detail::pointer_setter()(pointers, args_copy); cudaCheck(cudaLaunchKernel( (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); +#endif } // wrappers for cudaLaunchCooperativeKernel diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index a700c0865f0f2..d9edcba07c182 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -7,6 +7,13 @@ + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp new file mode 100644 index 0000000000000..11a6876ff3665 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp @@ -0,0 +1,31 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" + + + +#include +#include + +#undef __global__ +#define __global__ inline __attribute__((always_inline)) + + +__global__ +void hello(float k) { + + printf("hello %f\n",k); + +} + + + + +int main() { + + requireCUDADevices(); + + cudautils::launch(hello,{1, 1},3.14); + cudaDeviceSynchronize(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu new file mode 100644 index 0000000000000..7983512739b60 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu @@ -0,0 +1,10 @@ +#include + +__global__ +void hello(float k) { + + printf("hello %f\n",k); + +} + + From 466b1aec8cb02ae849d4192e1e3dd9d04647bb0a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 13:56:11 +0100 Subject: [PATCH 02/22] more cudacompat --- .../CUDAUtilities/interface/cpu_unique_ptr.h | 85 +++++++++++++++++++ .../CUDAUtilities/test/cpu_unique_ptr_t.cpp | 31 +++++++ 2 files changed, 116 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h create mode 100644 HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp diff --git a/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h new file mode 100644 index 0000000000000..f4e97f9e47523 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h @@ -0,0 +1,85 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h + +#include +#include + +#include + +namespace cudautils { + namespace cpu { + namespace impl { + // Additional layer of types to distinguish from device:: and host::unique_ptr + class CPUDeleter { + public: + CPUDeleter() = default; + + void operator()(void *ptr) { + ::free(ptr); + } + }; + } // namespace impl + + template + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_cpu_unique_selector { + using non_array = cudautils::cpu::unique_ptr; + }; + template + struct make_cpu_unique_selector { + using unbounded_array = cudautils::cpu::unique_ptr; + }; + template + struct make_cpu_unique_selector { + struct bounded_array {}; + }; + } // namespace impl + } // namespace cpu + + template + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique() { + 
static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the cpu memory is not supported"); + void *mem = ::malloc(sizeof(T)); + return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique(size_t n) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the cpu memory is not supported"); + void *mem = ::malloc(n * sizeof(element_type)); + return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::bounded_array make_cpu_unique(Args &&...) = delete; + + // No check for the trivial constructor, make it clear in the interface + template + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique_uninitialized() { + void *mem = ::malloc(sizeof(T)); + return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique_uninitialized(size_t n) { + using element_type = typename std::remove_extent::type; + void *mem = ::malloc(n * sizeof(element_type)); + return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::bounded_array make_cpu_unique_uninitialized(Args &&...) = + delete; +} // namespace cudautils + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp new file mode 100644 index 0000000000000..e7f3077e999a0 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -0,0 +1,31 @@ +#include "catch.hpp" + +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" + +TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { + + SECTION("Single element") { + auto ptr = cudautils::make_cpu_unique(); + REQUIRE(ptr != nullptr); + } + + SECTION("Reset") { + auto ptr = cudautils::make_cpu_unique(); + REQUIRE(ptr != nullptr); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); + } + + SECTION("Multiple elements") { + auto ptr = cudautils::make_host_unique(10); + REQUIRE(ptr != nullptr); + } + + SECTION("Allocating too much") { + constexpr size_t maxSize = 1 << 30; // 8**10 + auto ptr = cudautils::make_cpu_unique(maxSize+1); + ptr.reset(); + REQUIRE(ptr != nullptr); + } +} From 69b6daac5b895f76db1c52f54c28096a4706b03f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 14:45:58 +0100 Subject: [PATCH 03/22] improve test --- .../PixelVertexFinding/test/VertexFinder_t.h | 125 +++++++++++++++--- 1 file changed, 108 insertions(+), 17 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5261069a6b283..039135efa852f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -56,7 +56,7 @@ struct Event { struct ClusterGenerator { explicit ClusterGenerator(float nvert, float ntrack) - : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) 
{} + : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(0.001,1.) {} void operator()(Event& ev) { int nclus = clusGen(reng); @@ -68,19 +68,28 @@ struct ClusterGenerator { ev.ztrack.clear(); ev.eztrack.clear(); - ev.ivert.clear(); + ev.pttrack.clear(); + ev.ivert.clear(); + float ptMax=0; float pt5=0; for (int iv = 0; iv < nclus; ++iv) { auto nt = trackGen(reng); - ev.itrack[nclus] = nt; + if (iv == 5) nt *= 3; + ev.itrack[iv] = nt; + float ptSum=0; for (int it = 0; it < nt; ++it) { auto err = errgen(reng); // reality is not flat.... ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng)); ev.eztrack.push_back(err * err); ev.ivert.push_back(iv); - ev.pttrack.push_back((iv == 5 ? 1.f : 0.5f) + ptGen(reng)); + ev.pttrack.push_back(std::pow(ptGen(reng),iv==5 ?-1.0f:-0.5f)); ev.pttrack.back() *= ev.pttrack.back(); + ptSum += ev.pttrack.back(); } + ptMax = std::max(ptMax,ptSum); + if (iv == 5) pt5 = ptSum; } + std::cout << "PV, ptMax " << std::sqrt(pt5) << ' ' << std::sqrt(ptMax) << std::endl; + // add noise auto nt = 2 * trackGen(reng); for (int it = 0; it < nt; ++it) { @@ -88,7 +97,7 @@ struct ClusterGenerator { ev.ztrack.push_back(rgen(reng)); ev.eztrack.push_back(err * err); ev.ivert.push_back(9999); - ev.pttrack.push_back(0.5f + ptGen(reng)); + ev.pttrack.push_back(std::pow(ptGen(reng),-0.5f)); ev.pttrack.back() *= ev.pttrack.back(); } } @@ -99,7 +108,7 @@ struct ClusterGenerator { std::poisson_distribution clusGen; std::poisson_distribution trackGen; std::normal_distribution gauss; - std::exponential_distribution ptGen; + std::uniform_real_distribution ptGen; // becomes a power low }; // a macro SORRY @@ -112,6 +121,12 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) { printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); } +__global__ void linit(ZVertices * pdata, WorkSpace * pws, int nt) { + auto & __restrict__ data = *pdata; + auto & __restrict__ ws = *pws; + for (int i=0; i par{{eps, 0.01f, 9.0f}}; - for (int nav = 30; nav < 80; nav += 20) { + for (int nav = 30; nav < 100; nav += 20) { ClusterGenerator gen(nav, 10); - for (int i = 8; i < 20; ++i) { - auto kk = i / 4; // M param + for (int iii = 8; iii < 20; ++iii) { + auto kk = iii / 4; // M param gen(ev); + std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; + int nt = ev.ztrack.size(); + int nvori = ev.zvert.size(); + int ntori = nt; + assert(ntori>>(onGPU_d.get(), ws_d.get()); + linit<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get(),ntori); #else onGPU_d->init(); ws_d->init(); + for (int16_t i=0; iitrk[i]=i; onGPU_d->idv[i] = -1;} // FIXME do the same on GPU.... 
#endif - std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; - auto nt = ev.ztrack.size(); #ifdef __CUDACC__ cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + #else ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); @@ -156,15 +179,15 @@ int main() { ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif - std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (iii % 4) << std::endl; - if ((i % 4) == 0) + if ((iii % 4) == 0) par = {{eps, 0.02f, 12.0f}}; - if ((i % 4) == 1) + if ((iii % 4) == 1) par = {{eps, 0.02f, 9.0f}}; - if ((i % 4) == 2) + if ((iii % 4) == 2) par = {{eps, 0.01f, 9.0f}}; - if ((i % 4) == 3) + if ((iii % 4) == 3) par = {{0.7f * eps, 0.01f, 9.0f}}; uint32_t nv = 0; @@ -200,6 +223,7 @@ int main() { continue; } + int16_t * idv = nullptr; float* zv = nullptr; float* wv = nullptr; float* ptv2 = nullptr; @@ -210,18 +234,21 @@ int main() { float chi2[2 * nv]; // make space for splitting... #ifdef __CUDACC__ + int16_t hidv[16000]; float hzv[2 * nv]; float hwv[2 * nv]; float hptv2[2 * nv]; int32_t hnn[2 * nv]; uint16_t hind[2 * nv]; + idv = hidv; zv = hzv; wv = hwv; ptv2 = hptv2; nn = hnn; ind = hind; #else + idv = onGPU_d->idv; zv = onGPU_d->zv; wv = onGPU_d->wv; ptv2 = onGPU_d->ptv2; @@ -236,6 +263,64 @@ int main() { memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif + auto verifyMatch = [&]() { + + // matching-merging metrics + constexpr int MAXMA = 32; + struct Match { Match() {for (auto&e:vid)e=-1; for (auto&e:nt)e=0;} std::array vid; std::array nt; }; + + auto nnn=0; + Match matches[nv]; for (auto kv = 0U; kv < nv; ++kv) { matches[kv] = Match();} + auto iPV = ind[nv - 1]; + for (int it=0; it9990) continue; + assert(iv9990) continue; + ++nnn; + for (int i=0; i=0 && mx0.75f) ++nok; + if (frac[kv]<0.5f) ++nmess; + auto ldz = std::abs(zv[kv] - ev.zvert[itv]); + dz = std::max(dz,ldz); + int nm5=0; int nm7=0; + int ntt=0; + for (int i=0; i0.5f) ++nm5; + if (f>0.75f) ++nm7; + } + if (nm5>1) ++merged50; + if (nm7>1) ++merged75; + if (kv == iPV ) std::cout << "PV " << itv << ' ' << std::sqrt(ptv2[kv]) << ' ' << float(ntt)/float(ev.itrack[itv]) << '/' << frac[kv] << '/' << nm5 << '/' << nm7 << ' ' << dz << std::endl; + } + // for (auto f: frac) std::cout << f << ' '; + // std::cout << std::endl; + std::cout << "ori/tot/matched/merged5//merged7/random/dz " + << nvori << '/' << nv << '/' << nok << '/' << merged50 << '/' << merged75 << '/' << nmess + << '/' << dz << std::endl; + }; // verifyMatch + + for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) chi2[j] /= float(nn[j]); @@ -302,7 +387,9 @@ int main() { cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(idv, LOC_ONGPU(idv), nt * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif + for (auto j = 0U; j < 
nv; ++j) if (nn[j] > 0) chi2[j] /= float(nn[j]); @@ -333,7 +420,7 @@ int main() { } dd[kv] = md; } - if (i == 6) { + if (iii == 6) { for (auto d : dd) std::cout << d << ' '; std::cout << std::endl; @@ -345,6 +432,10 @@ int main() { rms = std::sqrt(rms) / (nv - 1); std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl; + + verifyMatch(); + + } // loop on events } // lopp on ave vert From f398b454e94ee3cb3f9b792315d8fb52a392c6b1 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 15:46:02 +0100 Subject: [PATCH 04/22] compiles --- .../Common/interface/HeterogeneousSoA.h | 32 +++++++++++++------ .../CUDAUtilities/interface/launch.h | 4 +-- .../plugins/CAHitNtupletGeneratorKernels.cc | 8 ++--- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- .../src/gpuVertexFinderImpl.h | 30 ++++++++--------- .../PixelVertexFinding/test/VertexFinder_t.h | 25 ++++++--------- 6 files changed, 50 insertions(+), 51 deletions(-) diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index 907b7647a3452..e0b6d330ce27a 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -5,6 +5,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" @@ -21,15 +22,15 @@ class HeterogeneousSoA { explicit HeterogeneousSoA(cudautils::device::unique_ptr &&p) : dm_ptr(std::move(p)) {} explicit HeterogeneousSoA(cudautils::host::unique_ptr &&p) : hm_ptr(std::move(p)) {} - explicit HeterogeneousSoA(std::unique_ptr &&p) : std_ptr(std::move(p)) {} + explicit HeterogeneousSoA(cudautils::cpu::unique_ptr &&p) : cm_ptr(std::move(p)) {} - auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } + auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); } auto const &operator*() const { return *get(); } auto const *operator-> () const { return get(); } - auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } + auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); } auto &operator*() { return *get(); } @@ -47,7 +48,7 @@ class HeterogeneousSoA { // a union wan't do it, a variant will not be more efficienct cudautils::device::unique_ptr dm_ptr; //! cudautils::host::unique_ptr hm_ptr; //! - std::unique_ptr std_ptr; //! + cudautils::cpu::unique_ptr cm_ptr; //! 
}; namespace cudaCompat { @@ -109,31 +110,42 @@ namespace cudaCompat { struct CPUTraits { template - using unique_ptr = std::unique_ptr; + using unique_ptr = cudautils::cpu::unique_ptr;; + + + template + static auto make_unique() { + return cudautils::make_cpu_unique(); + } + + template + static auto make_unique(size_t size) { + return cudautils::make_cpu_unique(size); + } template static auto make_unique(cudaStream_t) { - return std::make_unique(); + return cudautils::make_cpu_unique(); } template static auto make_unique(size_t size, cudaStream_t) { - return std::make_unique(size); + return cudautils::make_cpu_unique(size); } template static auto make_host_unique(cudaStream_t) { - return std::make_unique(); + return cudautils::make_cpu_unique(); } template static auto make_device_unique(cudaStream_t) { - return std::make_unique(); + return cudautils::make_cpu_unique(); } template static auto make_device_unique(size_t size, cudaStream_t) { - return std::make_unique(size); + return cudautils::make_cpu_unique(size); } }; diff --git a/HeterogeneousCore/CUDAUtilities/interface/launch.h b/HeterogeneousCore/CUDAUtilities/interface/launch.h index a624f0262c34d..c8c95b0744300 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/launch.h +++ b/HeterogeneousCore/CUDAUtilities/interface/launch.h @@ -94,7 +94,7 @@ namespace cudautils { } // namespace detail // wrappers for cudaLaunchKernel - + inline void launch(void (*kernel)(), LaunchParameters config) { #ifdef CUDA_KERNELS_ON_CPU kernel(); @@ -127,7 +127,7 @@ namespace cudautils { } // wrappers for cudaLaunchCooperativeKernel - + inline void launch_cooperative(void (*kernel)(), LaunchParameters config) { cudaCheck(cudaLaunchCooperativeKernel( (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 75066458dc170..b4cd1f8aa1abc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -19,10 +19,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... 
- // overkill to use template here (std::make_unique would suffice) - // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); - device_isOuterHitOfCell_.reset( - (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); + device_isOuterHitOfCell_ = Traits:: template make_unique(std::max(1U,nhits),stream); assert(device_isOuterHitOfCell_.get()); gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, @@ -31,8 +28,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_theCellTracks_, device_theCellTracksContainer_.get()); - // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); - device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); + device_theCells_ = Traits:: template make_unique(m_params.maxNumberOfDoublets_,stream); if (0 == nhits) return; // protect against empty events diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 2e875caba7130..9801d17de8118 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -187,7 +187,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH } PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); + PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique()); auto* soa = tracks.get(); assert(soa); diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index d6e63227ccf85..6c72937ae590c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -1,12 +1,14 @@ -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h" -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h" -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksIterative.h" +#include "gpuClusterTracksByDensity.h" +#include "gpuClusterTracksDBSCAN.h" +#include "gpuClusterTracksIterative.h" #include "gpuFitVertices.h" #include "gpuSortByPt2.h" #include "gpuSplitVertices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + namespace gpuVertexFinder { @@ -84,38 +86,32 @@ namespace gpuVertexFinder { } #endif -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { // std::cout << "producing Vertices on GPU" << std::endl; ZVertexHeterogeneous vertices(cudautils::make_device_unique(stream)); #else ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { + cudaStream_t stream = 0; // std::cout << "producing Vertices on CPU" << std::endl; - ZVertexHeterogeneous vertices(std::make_unique()); + ZVertexHeterogeneous vertices(cudautils::make_cpu_unique()); #endif assert(tksoa); auto* soa = vertices.get(); assert(soa); -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU auto ws_d = cudautils::make_device_unique(stream); #else - auto ws_d = std::make_unique(); + auto ws_d = cudautils::make_cpu_unique(); #endif -#ifdef 
__CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); auto blockSize = 128; auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); - cudaCheck(cudaGetLastError()); -#else - cudaCompat::resetGrid(); - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin); -#endif + cudautils::launch(init,{1,1,0,stream},soa, ws_d.get()); + cudautils::launch(loadTracks, {numberOfBlocks, blockSize, 0, stream}, tksoa, soa, ws_d.get(), ptMin); -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 039135efa852f..1bf4e223ab52f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -22,7 +22,6 @@ #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h" #ifdef ONE_KERNEL -#ifdef __CUDACC__ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws, int minT, // min number of neighbours to be "seed" @@ -41,7 +40,6 @@ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, sortByPt2(pdata, pws); } #endif -#endif using namespace gpuVertexFinder; @@ -128,7 +126,7 @@ __global__ void linit(ZVertices * pdata, WorkSpace * pws, int nt) { } int main() { -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU requireCUDADevices(); auto onGPU_d = cudautils::make_device_unique(1, nullptr); @@ -157,7 +155,7 @@ int main() { assert(ntori>>(onGPU_d.get(), ws_d.get()); linit<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get(),ntori); #else @@ -166,7 +164,7 @@ int main() { for (int16_t i=0; iitrk[i]=i; onGPU_d->idv[i] = -1;} // FIXME do the same on GPU.... #endif -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); @@ -191,7 +189,7 @@ int main() { par = {{0.7f * eps, 0.01f, 9.0f}}; uint32_t nv = 0; -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); @@ -233,7 +231,7 @@ int main() { // keep chi2 separated... float chi2[2 * nv]; // make space for splitting... 
-#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU int16_t hidv[16000]; float hzv[2 * nv]; float hwv[2 * nv]; @@ -256,7 +254,7 @@ int main() { ind = onGPU_d->sortInd; #endif -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else @@ -329,7 +327,7 @@ int main() { std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); @@ -348,7 +346,7 @@ int main() { std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU // one vertex per block!!! cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); @@ -361,12 +359,9 @@ int main() { #endif std::cout << "after split " << nv << std::endl; -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); - cudaCheck(cudaGetLastError()); - cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); - cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); @@ -380,7 +375,7 @@ int main() { continue; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); From f7ec457013d94a5d30b074051175d02e36cd835f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 4 Dec 2019 16:37:57 +0100 Subject: [PATCH 05/22] compiles --- .../Common/interface/HeterogeneousSoA.h | 24 ++++---- .../CUDAUtilities/interface/cpu_unique_ptr.h | 9 +-- .../CUDAUtilities/test/cpu_unique_ptr_t.cpp | 8 +-- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- .../src/PixelVertexProducerCUDA.cc | 3 +- .../PixelVertexFinding/src/gpuVertexFinder.h | 2 +- .../src/gpuVertexFinderImpl.h | 59 ++++++------------- 7 files changed, 42 insertions(+), 65 deletions(-) diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index e0b6d330ce27a..f4b70aa7188fd 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -115,37 +115,37 @@ namespace cudaCompat { template static auto make_unique() { - return cudautils::make_cpu_unique(); + return cudautils::make_cpu_unique(0); } template static auto make_unique(size_t size) { - return cudautils::make_cpu_unique(size); + return cudautils::make_cpu_unique(size,0); } template - static auto make_unique(cudaStream_t) { - return cudautils::make_cpu_unique(); + static auto make_unique(cudaStream_t stream) { + return cudautils::make_cpu_unique(stream); } template - static auto make_unique(size_t size, cudaStream_t) { - return 
cudautils::make_cpu_unique(size); + static auto make_unique(size_t size, cudaStream_t stream) { + return cudautils::make_cpu_unique(size, stream); } template - static auto make_host_unique(cudaStream_t) { - return cudautils::make_cpu_unique(); + static auto make_host_unique(cudaStream_t stream) { + return cudautils::make_cpu_unique(stream); } template - static auto make_device_unique(cudaStream_t) { - return cudautils::make_cpu_unique(); + static auto make_device_unique(cudaStream_t stream) { + return cudautils::make_cpu_unique(stream); } template - static auto make_device_unique(size_t size, cudaStream_t) { - return cudautils::make_cpu_unique(size); + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cudautils::make_cpu_unique(size, stream); } }; diff --git a/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h index f4e97f9e47523..dab7d15f40f92 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h @@ -5,6 +5,7 @@ #include #include +#include namespace cudautils { namespace cpu { @@ -40,7 +41,7 @@ namespace cudautils { } // namespace cpu template - typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique() { + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique(cudaStream_t) { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the cpu memory is not supported"); void *mem = ::malloc(sizeof(T)); @@ -49,7 +50,7 @@ namespace cudautils { } template - typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique(size_t n) { + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique(size_t n, cudaStream_t) { using element_type = typename std::remove_extent::type; static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the cpu memory is not supported"); @@ -63,14 +64,14 @@ namespace cudautils { // No check for the trivial constructor, make it clear in the interface template - typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique_uninitialized() { + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique_uninitialized(cudaStream_t) { void *mem = ::malloc(sizeof(T)); return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), cpu::impl::CPUDeleter()}; } template - typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique_uninitialized(size_t n) { + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique_uninitialized(size_t n, cudaStream_t) { using element_type = typename std::remove_extent::type; void *mem = ::malloc(n * sizeof(element_type)); return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp index e7f3077e999a0..35577dd1e508c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -5,12 +5,12 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Single element") { - auto ptr = cudautils::make_cpu_unique(); + auto ptr = cudautils::make_cpu_unique(0); REQUIRE(ptr != nullptr); } SECTION("Reset") { - auto ptr = cudautils::make_cpu_unique(); + auto ptr = cudautils::make_cpu_unique(0); REQUIRE(ptr != nullptr); 
ptr.reset(); @@ -18,13 +18,13 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { } SECTION("Multiple elements") { - auto ptr = cudautils::make_host_unique(10); + auto ptr = cudautils::make_host_unique(10,0); REQUIRE(ptr != nullptr); } SECTION("Allocating too much") { constexpr size_t maxSize = 1 << 30; // 8**10 - auto ptr = cudautils::make_cpu_unique(maxSize+1); + auto ptr = cudautils::make_cpu_unique(maxSize+1,0); ptr.reset(); REQUIRE(ptr != nullptr); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 9801d17de8118..4d8ecd96815c5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -187,7 +187,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH } PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique()); + PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique(0)); auto* soa = tracks.get(); assert(soa); diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index e959abcfef949..acf7a07808add 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -117,7 +117,8 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; */ - iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin)); + cudaStream_t stream=0; + iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin)); } } diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 6cd86c93a6737..351a7a47bdbf1 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -64,7 +64,7 @@ namespace gpuVertexFinder { ~Producer() = default; ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const; + ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index 6c72937ae590c..ba3644e09d8bb 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -85,16 +85,17 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #endif - #ifndef CUDA_KERNELS_ON_CPU ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { +#else + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { +#endif +#ifndef CUDA_KERNELS_ON_CPU // std::cout << "producing Vertices on GPU" << std::endl; ZVertexHeterogeneous vertices(cudautils::make_device_unique(stream)); #else - ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { - cudaStream_t stream = 0; // std::cout << "producing Vertices on CPU" << 
std::endl; - ZVertexHeterogeneous vertices(cudautils::make_cpu_unique()); + ZVertexHeterogeneous vertices(cudautils::make_cpu_unique(stream)); #endif assert(tksoa); auto* soa = vertices.get(); @@ -103,7 +104,7 @@ namespace gpuVertexFinder { #ifndef CUDA_KERNELS_ON_CPU auto ws_d = cudautils::make_device_unique(stream); #else - auto ws_d = cudautils::make_cpu_unique(); + auto ws_d = cudautils::make_cpu_unique(stream); #endif auto blockSize = 128; @@ -111,56 +112,30 @@ namespace gpuVertexFinder { cudautils::launch(init,{1,1,0,stream},soa, ws_d.get()); cudautils::launch(loadTracks, {numberOfBlocks, blockSize, 0, stream}, tksoa, soa, ws_d.get(), ptMin); -#ifndef CUDA_KERNELS_ON_CPU if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(vertexFinderOneKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - cudaCheck(cudaGetLastError()); + cudautils::launch(vertexFinderKernel1,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); // one block per vertex... - splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); + cudautils::launch(splitVerticesKernel,{1024, 128, 0, stream},soa, ws_d.get(), 9.f); + cudautils::launch(vertexFinderKernel2,{1, 1024 - 256, 0, stream},soa, ws_d.get()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksByDensityKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksDBSCAN,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksIterative,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.); - cudaCheck(cudaGetLastError()); + cudautils::launch(fitVerticesKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), 50.); // one block per vertex... 
- splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.); - cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); - } - cudaCheck(cudaGetLastError()); -#else // __CUDACC__ - if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(splitVerticesKernel,{1024, 128, 0, stream},soa, ws_d.get(), 9.f); + cudautils::launch(fitVerticesKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), 5000.); + cudautils::launch(sortByPt2Kernel,{1, 1024 - 256, 0, stream},soa, ws_d.get()); } - // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; - fitVertices(soa, ws_d.get(), 50.); - // one block per vertex! - blockIdx.x = 0; - gridDim.x = 1; - splitVertices(soa, ws_d.get(), 9.f); - resetGrid(); - fitVertices(soa, ws_d.get(), 5000.); - sortByPt2(soa, ws_d.get()); -#endif return vertices; } From 332cb4559b36d2c67f9d65f7989d03f58ff6de49 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 5 Dec 2019 10:05:26 +0100 Subject: [PATCH 06/22] compiles --- .../Common/interface/HeterogeneousSoA.h | 6 ++++ .../src/PixelVertexProducerCUDA.cc | 4 +-- ...gpuVertexFinder.cpp => gpuVertexFinder.cc} | 0 .../PixelVertexFinding/src/gpuVertexFinder.h | 6 ++-- .../src/gpuVertexFinderImpl.h | 30 ++++++++----------- 5 files changed, 25 insertions(+), 21 deletions(-) rename RecoPixelVertexing/PixelVertexFinding/src/{gpuVertexFinder.cpp => gpuVertexFinder.cc} (100%) diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index f4b70aa7188fd..8edc45f8671b2 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -54,6 +54,8 @@ class HeterogeneousSoA { namespace cudaCompat { struct GPUTraits { + static constexpr const char * name = "GPU"; + template using unique_ptr = cudautils::device::unique_ptr; @@ -84,6 +86,8 @@ namespace cudaCompat { }; struct HostTraits { + static constexpr const char * name = "HOST"; + template using unique_ptr = cudautils::host::unique_ptr; @@ -109,6 +113,8 @@ namespace cudaCompat { }; struct CPUTraits { + static constexpr const char * name = "CPU"; + template using unique_ptr = cudautils::cpu::unique_ptr;; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index acf7a07808add..f49b0d7b6dced 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -96,7 +96,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent assert(tracks); - ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin)); + ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make(ctx.stream(), tracks, m_ptMin,m_OnGPU)); } else { auto const* tracks = iEvent.get(tokenCPUTrack_).get(); @@ -118,7 +118,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent */ cudaStream_t stream=0; - iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin)); + iEvent.emplace(tokenCPUVertex_, 
m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); } } diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc similarity index 100% rename from RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp rename to RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 351a7a47bdbf1..9f21fa4b650a7 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -63,8 +63,10 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; - ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + template + ZVertexHeterogeneous makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + + ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index ba3644e09d8bb..98103e89ab012 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -85,27 +85,16 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #endif -#ifndef CUDA_KERNELS_ON_CPU - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { -#else - ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { -#endif -#ifndef CUDA_KERNELS_ON_CPU - // std::cout << "producing Vertices on GPU" << std::endl; - ZVertexHeterogeneous vertices(cudautils::make_device_unique(stream)); -#else - // std::cout << "producing Vertices on CPU" << std::endl; - ZVertexHeterogeneous vertices(cudautils::make_cpu_unique(stream)); -#endif + + template + ZVertexHeterogeneous Producer::makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { + std::cout << "producing Vertices on " << Traits::name << std::endl; + ZVertexHeterogeneous vertices(Traits::template make_unique(stream)); assert(tksoa); auto* soa = vertices.get(); assert(soa); -#ifndef CUDA_KERNELS_ON_CPU - auto ws_d = cudautils::make_device_unique(stream); -#else - auto ws_d = cudautils::make_cpu_unique(stream); -#endif + auto ws_d = Traits::template make_unique(stream); auto blockSize = 128; auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; @@ -140,6 +129,13 @@ namespace gpuVertexFinder { return vertices; } + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + return onGPU ? 
+ makeImpl(stream,tksoa,ptMin) : + makeImpl(stream,tksoa,ptMin); + } + + } // namespace gpuVertexFinder #undef FROM From 069615973f50b64bc42d11c574e1436f28b43f8a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 5 Dec 2019 10:33:47 +0100 Subject: [PATCH 07/22] now runs properly --- .../PixelVertexFinding/src/PixelVertexProducerCUDA.cc | 4 ++-- .../PixelVertexFinding/src/gpuVertexFinder.cc | 8 ++++++++ .../PixelVertexFinding/src/gpuVertexFinder.cu | 7 +++++++ .../PixelVertexFinding/src/gpuVertexFinder.h | 1 + .../PixelVertexFinding/src/gpuVertexFinderImpl.h | 7 ------- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index f49b0d7b6dced..5d81407d00f5b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -96,7 +96,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent assert(tracks); - ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make(ctx.stream(), tracks, m_ptMin,m_OnGPU)); + ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make(ctx.stream(), tracks, m_ptMin,m_OnGPU)); } else { auto const* tracks = iEvent.get(tokenCPUTrack_).get(); @@ -118,7 +118,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent */ cudaStream_t stream=0; - iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); + iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); } } diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc index 084763385bdb4..e8c6a812d86d2 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc @@ -1 +1,9 @@ +#define CUDA_KERNELS_ON_CPU // not needed yet #include "gpuVertexFinderImpl.h" +namespace gpuVertexFinder { +template<> + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + assert(!onGPU); + return makeImpl(stream,tksoa,ptMin); + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu index 084763385bdb4..96739a42c3b0a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu @@ -1 +1,8 @@ #include "gpuVertexFinderImpl.h" +namespace gpuVertexFinder { +template<> + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + assert(onGPU); + return makeImpl(stream,tksoa,ptMin); + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 9f21fa4b650a7..4dc4572da6e7e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -66,6 +66,7 @@ namespace gpuVertexFinder { template ZVertexHeterogeneous makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + template ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const; private: diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h 
b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index 98103e89ab012..480cfe3a1b0e0 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -129,13 +129,6 @@ namespace gpuVertexFinder { return vertices; } - ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { - return onGPU ? - makeImpl(stream,tksoa,ptMin) : - makeImpl(stream,tksoa,ptMin); - } - - } // namespace gpuVertexFinder #undef FROM From 34f370434bbf2938d08b35b2f0641c3803968b85 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 5 Dec 2019 10:35:14 +0100 Subject: [PATCH 08/22] remove cout --- RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index 480cfe3a1b0e0..6fd6e985f25b9 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -88,7 +88,7 @@ namespace gpuVertexFinder { template ZVertexHeterogeneous Producer::makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { - std::cout << "producing Vertices on " << Traits::name << std::endl; + // std::cout << "producing Vertices on " << Traits::name << std::endl; ZVertexHeterogeneous vertices(Traits::template make_unique(stream)); assert(tksoa); auto* soa = vertices.get(); From c87c860bfb5381437f5979559f5cc79b32e39c01 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 10:10:42 +0100 Subject: [PATCH 09/22] a more complete test --- .../Common/interface/HeterogeneousSoA.h | 4 +- .../CUDAUtilities/test/BuildFile.xml | 5 ++ .../CUDAUtilities/test/Launch_t.cpp | 31 +++++-------- .../CUDAUtilities/test/Launch_t.cu | 22 +++++---- .../CUDAUtilities/test/Launch_t.h | 46 +++++++++++++++++++ .../CUDAUtilities/test/cpu_unique_ptr_t.cpp | 8 ++-- .../src/PixelVertexProducerCUDA.cc | 2 +- 7 files changed, 82 insertions(+), 36 deletions(-) create mode 100644 HeterogeneousCore/CUDAUtilities/test/Launch_t.h diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index 8edc45f8671b2..08bdb42b28a98 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -121,12 +121,12 @@ namespace cudaCompat { template static auto make_unique() { - return cudautils::make_cpu_unique(0); + return cudautils::make_cpu_unique(cudaStreamDefault); } template static auto make_unique(size_t size) { - return cudautils::make_cpu_unique(size,0); + return cudautils::make_cpu_unique(size,cudaStreamDefault); } template diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index d9edcba07c182..893d1ab1745eb 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -14,6 +14,11 @@ + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp index 11a6876ff3665..77c824fc833c4 100644 --- a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp @@ -1,31 +1,24 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "Launch_t.h" - -#include -#include - -#undef __global__ -#define __global__ inline __attribute__((always_inline)) - - -__global__ -void hello(float k) { - - printf("hello %f\n",k); - -} - - +#ifdef LaunchInCU + void wrapperInCU(); +#endif int main() { requireCUDADevices(); - cudautils::launch(hello,{1, 1},3.14); - cudaDeviceSynchronize(); + printf("in Main\n"); + printEnv(); + + wrapper(); + +#ifdef LaunchInCU + wrapperInCU(); +#endif return 0; } diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu index 7983512739b60..90b24fb6fec4f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu @@ -1,10 +1,12 @@ -#include - -__global__ -void hello(float k) { - - printf("hello %f\n",k); - -} - - +#ifdef CUDA_KERNELS_ON_CPU +#undef CUDA_KERNELS_ON_CPU +#endif + +#include "Launch_t.h" + +#ifdef LaunchInCU + void wrapperInCU() { + printf("in cu wrapper\n"); + wrapper(); + } +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.h b/HeterogeneousCore/CUDAUtilities/test/Launch_t.h new file mode 100644 index 0000000000000..3237f6383f47d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.h @@ -0,0 +1,46 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include +#include + +// these are in cudaCompat: here we do just a minimal test of "launch" +#ifndef __CUDACC__ +#undef __global__ +#define __global__ inline __attribute__((always_inline)) +#endif + +inline +constexpr void printEnv() { + printf("ENV\n"); +#ifdef CUDA_KERNELS_ON_CPU + printf("CUDA_KERNELS_ON_CPU defined\n"); +#endif +#ifdef __CUDACC__ + printf("__CUDACC__ defined\n"); +#endif +#ifdef __CUDA_RUNTIME_H__ + printf("__CUDA_RUNTIME_H__ defined\n"); +#endif +#ifdef __CUDA_ARCH__ + printf("__CUDA_ARCH__ defined\n"); +#endif + printf("---\n"); +} + +__global__ +void hello(float k) { + + printf("hello from kernel %f\n",k); + printEnv(); +} + + +inline void wrapper() { + + printf("in Wrapper\n"); + printEnv(); + + cudautils::launch(hello,{1, 1},3.14); + cudaDeviceSynchronize(); + +} + diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp index 35577dd1e508c..828b7912ed6c4 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -5,12 +5,12 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Single element") { - auto ptr = cudautils::make_cpu_unique(0); + auto ptr = cudautils::make_cpu_unique(cudaStreamDefault); REQUIRE(ptr != nullptr); } SECTION("Reset") { - auto ptr = cudautils::make_cpu_unique(0); + auto ptr = cudautils::make_cpu_unique(cudaStreamDefault); REQUIRE(ptr != nullptr); ptr.reset(); @@ -18,13 +18,13 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { } SECTION("Multiple elements") { - auto ptr = cudautils::make_host_unique(10,0); + auto ptr = cudautils::make_cpu_unique(10,cudaStreamDefault); REQUIRE(ptr != nullptr); } SECTION("Allocating too much") { constexpr size_t maxSize = 1 << 30; // 8**10 - auto ptr = cudautils::make_cpu_unique(maxSize+1,0); + auto ptr = cudautils::make_cpu_unique(maxSize+1,cudaStreamDefault); ptr.reset(); REQUIRE(ptr != nullptr); } diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index 5d81407d00f5b..347476cf1b6a3 
100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -117,7 +117,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; */ - cudaStream_t stream=0; + cudaStream_t stream=cudaStreamDefault; iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); } } From b26b7f4204974a6f444b679d3c10aff6d3242bb7 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 10:19:05 +0100 Subject: [PATCH 10/22] make sure cuda kernels are seen as inline functions when not compiled as such --- .../CUDAUtilities/interface/cudaCompat.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 602e876a12e15..892cb7cf5271a 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -90,18 +90,20 @@ namespace cudaCompat { #define __forceinline__ #endif -// make sure function are inlined to avoid multiple definition #ifndef __CUDA_ARCH__ +using namespace cudaCompat; +#endif + +#endif // CUDA_KERNELS_ON_CPU + + +// make sure function are inlined to avoid multiple definition +#ifndef __CUDACC__ #undef __global__ #define __global__ inline __attribute__((always_inline)) #undef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) #endif -#ifndef __CUDA_ARCH__ -using namespace cudaCompat; -#endif - -#endif // CUDA_KERNELS_ON_CPU #endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h From e89d8864cea88b0a213b57d7f5327806683133d3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 11:32:21 +0100 Subject: [PATCH 11/22] move to single source --- .../Common/interface/HeterogeneousSoA.h | 4 +- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 4 +- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 58 ------------------- .../plugins/CAHitNtupletGeneratorOnGPU.h | 51 +++++++++++++++- 4 files changed, 53 insertions(+), 64 deletions(-) diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index 08bdb42b28a98..0c3e8bd37cb6a 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -55,6 +55,7 @@ namespace cudaCompat { struct GPUTraits { static constexpr const char * name = "GPU"; + static constexpr bool runOnDevice = true; template using unique_ptr = cudautils::device::unique_ptr; @@ -87,6 +88,7 @@ namespace cudaCompat { struct HostTraits { static constexpr const char * name = "HOST"; + static constexpr bool runOnDevice = false; template using unique_ptr = cudautils::host::unique_ptr; @@ -114,11 +116,11 @@ namespace cudaCompat { struct CPUTraits { static constexpr const char * name = "CPU"; + static constexpr bool runOnDevice = false; template using unique_ptr = cudautils::cpu::unique_ptr;; - template static auto make_unique() { return cudautils::make_cpu_unique(cudaStreamDefault); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 11b644d466768..a87b45f47cbe5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -74,10 +74,10 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const CUDAScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuples(hits, bf, ctx.stream())); } else { auto const& hits = iEvent.get(tokenHitCPU_); - iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf,cudaStreamDefault)); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 4d8ecd96815c5..e6525bd16a49b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -159,61 +159,3 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& "cuts\" based on the fit results (pT, Tip, Zip)."); } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, - float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cudautils::make_device_unique(stream)); - - auto* soa = tracks.get(); - - CAHitNtupletGeneratorKernelsGPU kernels(m_params); - kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - - kernels.allocateOnGPU(stream); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); - - kernels.buildDoublets(hits_d, stream); - kernels.launchKernels(hits_d, soa, stream); - kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" - if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } - kernels.classifyTuples(hits_d, soa, stream); - - return tracks; -} - -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique(0)); - - auto* soa = tracks.get(); - assert(soa); - - CAHitNtupletGeneratorKernelsCPU kernels(m_params); - kernels.counters_ = m_counters; - kernels.allocateOnGPU(nullptr); - - kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); - kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" - - if (0 == hits_d.nHits()) - return tracks; - - // now fit - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); - - if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } - - kernels.classifyTuples(hits_d, soa, nullptr); - - return tracks; -} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index de2e1913dd18b..82ccb6b835d68 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -47,9 +47,8 @@ class CAHitNtupletGeneratorOnGPU { static void fillDescriptions(edm::ParameterSetDescription& desc); static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; - - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + template + PixelTrackHeterogeneous makeTuples(Hits2D const& hits_d, float bfield, cudaStream_t stream) const; private: void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; @@ -63,4 +62,50 @@ class CAHitNtupletGeneratorOnGPU { Counters* m_counters = nullptr; }; +template +inline +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hits_d, + float bfield, + cudaStream_t stream) const { + PixelTrackHeterogeneous tracks(Traits:: template make_unique(stream)); + + using NTupletGeneratorKernels = CAHitNtupletGeneratorKernels; + + auto* soa = tracks.get(); + + NTupletGeneratorKernels kernels(m_params); + kernels.counters_ = m_counters; + kernels.allocateOnGPU(stream); + + kernels.buildDoublets(hits_d, stream); + kernels.launchKernels(hits_d, soa, stream); + kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + if constexpr (!Traits::runOnDevice) + if (0 == hits_d.nHits()) + return tracks; + + // now fit + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + if (m_params.useRiemannFit_) { + if constexpr (Traits::runOnDevice) { + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + }else { + fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + } + } else { + if constexpr (Traits::runOnDevice) { + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } else { + fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + } + } + kernels.classifyTuples(hits_d, soa, stream); + + return tracks; +} + + + #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h From a7d242e9ed163f74ca9bc9533fb8127809f672b2 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 12:22:56 +0100 Subject: [PATCH 12/22] explicit instantiation --- .../plugins/BrokenLineFitOnGPU.cc | 71 ++-------------- .../plugins/BrokenLineFitOnGPU.cu | 84 +------------------ .../plugins/BrokenLineFitOnGPU.h | 81 ++++++++++++++++++ .../plugins/CAHitNtupletGeneratorKernels.h | 2 +- .../CAHitNtupletGeneratorKernelsAlloc.h | 4 +- .../plugins/CAHitNtupletGeneratorOnGPU.h | 10 +-- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 4 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 9 +- 8 files changed, 102 insertions(+), 163 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index cc5865d97fd95..6c90ab1dd8f6f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,68 +1,7 @@ #include "BrokenLineFitOnGPU.h" -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { - assert(tuples_d); - - // Fit internals - 
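The space in Traits:: template make_unique above is not accidental: inside a function template, make_unique is a dependent member template, so the template keyword is needed for the call to parse as a template argument list. A reduced, self-contained example of the rule, with hypothetical names:

    #include <memory>

    struct SomeTraits {  // stand-in for the GPU/Host/CPU traits
      template <typename T>
      static std::unique_ptr<T[]> make_unique(int n) {
        return std::unique_ptr<T[]>(new T[n]);
      }
    };

    template <typename Traits>
    void user() {
      // without 'template', the '<' would be parsed as a less-than operator
      auto p = Traits::template make_unique<float>(10);
      p[0] = 1.f;
    }

    // e.g. user<SomeTraits>();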
auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // fit triplets - kernelBLFastFit<3>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - - kernelBLFit<3>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - - // fit quads - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); - - if (fit5as4_) { - // fit penta (only first 4) - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - } else { - // fit penta (all 5) - kernelBLFastFit<5>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - - kernelBLFit<5>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - } - - } // loop on concurrent fits -} +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 660cf75e1f460..2eba3e9af5965 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,85 +1,7 @@ #include "BrokenLineFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, - cudaStream_t stream) { - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // fit triplets - kernelBLFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<3><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - 
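The scheme of this patch is to keep one templated definition and let each translation unit instantiate only the flavour its toolchain can build: the .cu file, compiled by nvcc, emits the GPU instantiation, while the .cc file, seen only by the host compiler, emits the CPU one. The exact template arguments are not shown here, so the sketch below uses an assumed Traits tag:

    // sketch.h : single definition, visible to both toolchains
    #include <cuda_runtime.h>
    template <typename Traits>
    void launchSketch(cudaStream_t stream);  // body would dispatch through cudautils::launch

    // sketch.cc : host compiler only, instantiate the CPU flavour
    //   template void launchSketch<cudaCompat::CPUTraits>(cudaStream_t);

    // sketch.cu : nvcc only, instantiate the GPU flavour
    //   template void launchSketch<cudaCompat::GPUTraits>(cudaStream_t);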
cudaCheck(cudaGetLastError()); - - // fit quads - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); - cudaCheck(cudaGetLastError()); - - if (fit5as4_) { - // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); - } else { - // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<5><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); - } - - } // loop on concurrent fits -} + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 82a5bee443f88..aa7c1e680929e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -185,3 +185,84 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t #endif } } + + +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + int blockSize = 64; + int numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + cudautils::launch(kernelBLFastFit<3>,{numberOfBlocks, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelBLFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + + // fit quads + cudautils::launch(kernelBLFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelBLFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + + if (fit5as4_) { + // fit penta (only first 4) + 
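Throughout these files the conversion follows one fixed shape: the brace-initialised second argument of cudautils::launch carries the same four values as the triple chevrons it replaces. A self-contained sketch with a hypothetical kernel (when the code is built for the CPU, the wrapper is expected to reduce to a direct call):

    #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

    __global__ void scaleKernel(float* x, int n, float a) {  // hypothetical kernel
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        x[i] *= a;
    }

    void scale(float* x, int n, float a, cudaStream_t stream) {
      int blockSize = 128;
      int blocks = (n + blockSize - 1) / blockSize;
      // raw form:  scaleKernel<<<blocks, blockSize, 0, stream>>>(x, n, a);
      // wrapped:   {gridDim, blockDim, sharedMemBytes, stream} mirrors the chevrons
      cudautils::launch(scaleKernel, {blocks, blockSize, 0, stream}, x, n, a);
    }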
cudautils::launch(kernelBLFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + cudautils::launch(kernelBLFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } else { + // fit penta (all 5) + cudautils::launch(kernelBLFastFit<5>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + cudautils::launch(kernelBLFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } + + } // loop on concurrent fits +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 0140958aa9ef2..8213cc9c2af03 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -171,7 +171,7 @@ class CAHitNtupletGeneratorKernels { void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); - void allocateOnGPU(cudaStream_t stream); + void allocate(cudaStream_t stream); void cleanup(cudaStream_t cudaStream); static void printCounters(Counters const* counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index b91911c66924e..e688052f1faf0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -4,9 +4,9 @@ template <> #ifdef __CUDACC__ -void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::allocate(cudaStream_t stream) { #else -void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::allocate(cudaStream_t stream) { #endif ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 82ccb6b835d68..dac6faa5188b9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -75,7 +75,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hit NTupletGeneratorKernels kernels(m_params); kernels.counters_ = m_counters; - kernels.allocateOnGPU(stream); + kernels.allocate(stream); kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); @@ -87,7 +87,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hit // now fit HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocate(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { if constexpr (Traits::runOnDevice) { 
fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); @@ -95,11 +95,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hit fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); } } else { - if constexpr (Traits::runOnDevice) { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } kernels.classifyTuples(hits_d, soa, stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index becbd0a1a8540..050a5c83431ce 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,7 +1,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, +void HelixFitOnGPU::allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *helix_fit_results) { tuples_d = tuples; @@ -13,4 +13,4 @@ void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, assert(outputSoa_d); } -void HelixFitOnGPU::deallocateOnGPU() {} +void HelixFitOnGPU::deallocate() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 05b399e870f58..11ec0a663aed3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -41,17 +41,18 @@ class HelixFitOnGPU { using TupleMultiplicity = CAConstants::TupleMultiplicity; explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} - ~HelixFitOnGPU() { deallocateOnGPU(); } + ~HelixFitOnGPU() { deallocate(); } void setBField(double bField) { bField_ = bField; } void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + + template void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); - void deallocateOnGPU(); + void allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void deallocate(); private: static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); From 3a530f19a860dd567efd3103af9541e04302bf45 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 13:51:53 +0100 Subject: [PATCH 13/22] all ported but one --- .../plugins/CAHitNtupletGeneratorKernels.cu | 81 ++++++++----------- 1 file changed, 33 insertions(+), 48 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index aaf882633f17d..bf1461b2119c5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,13 +1,15 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + + template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; - auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + int numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; - kernel_fillHitDetIndices<<>>( + cudautils::launch(kernel_fillHitDetIndices,{numberOfBlocks, blockSize, 0, cudaStream}, &tracks_d->hitIndices, hv, &tracks_d->detIndices); - cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -36,7 +38,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto nthTot = 64; auto stride = 4; auto blockSize = nthTot / stride; - auto numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + int numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; auto rescale = numberOfBlocks / 65536; blockSize *= (rescale + 1); numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; @@ -45,7 +47,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - kernel_connect<<>>( + cudautils::launch(kernel_connect,{blks, thrs, 0, cudaStream}, device_hitTuple_apc_, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel hh.view(), @@ -59,23 +61,21 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * m_params.CAThetaCutForward_, m_params.dcaCutInnerTriplet_, m_params.dcaCutOuterTriplet_); - cudaCheck(cudaGetLastError()); if (nhits > 1 && m_params.earlyFishbone_) { auto nthTot = 128; auto stride = 16; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + int blockSize = nthTot / stride; + int numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - cudaCheck(cudaGetLastError()); } blockSize = 64; numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), + cudautils::launch(kernel_find_ntuplets,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_, device_theCellTracks_, @@ -83,35 +83,32 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_hitTuple_apc_, quality_d, m_params.minHitsPerNtuplet_); - cudaCheck(cudaGetLastError()); if (m_params.doStats_) - kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); - cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_mark_used,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - blockSize = 128; numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; +// cudautils::launch(cudautils::finalizeBulk,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); 
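The rescaling kept in launchKernels above deserves a remark: these launches place the block count in gridDim.y, which CUDA caps at 65535, so the block size is inflated until the count fits. The same arithmetic, factored into a standalone helper for illustration (the helper itself is not CMSSW code):

    #include <cassert>
    #include <utility>

    inline std::pair<int, int> fitGridY(int workItems, int blockSize) {
      int numberOfBlocks = (workItems + blockSize - 1) / blockSize;  // ceil division
      int rescale = numberOfBlocks / 65536;
      blockSize *= (rescale + 1);                                    // grow the blocks instead
      numberOfBlocks = (workItems + blockSize - 1) / blockSize;
      assert(numberOfBlocks < 65536);
      return {numberOfBlocks, blockSize};
    }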
cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_earlyDuplicateRemover<<>>( + cudautils::launch(kernel_earlyDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, device_theCells_.get(), device_nCells_, tuples_d, quality_d); - cudaCheck(cudaGetLastError()); blockSize = 128; numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( + cudautils::launch(kernel_countMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, tuples_d, quality_d, device_tupleMultiplicity_.get()); cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - kernel_fillMultiplicity<<>>( + cudautils::launch(kernel_fillMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudaCheck(cudaGetLastError()); if (nhits > 1 && m_params.lateFishbone_) { auto nthTot = 128; @@ -120,14 +117,13 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - fishbone<<>>( + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - cudaCheck(cudaGetLastError()); } if (m_params.doStats_) { numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, + cudautils::launch(kernel_checkOverflows,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, device_tupleMultiplicity_.get(), device_hitTuple_apc_, device_theCells_.get(), @@ -138,7 +134,6 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * nhits, m_params.maxNumberOfDoublets_, counters_); - cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -166,13 +161,12 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr int threadsPerBlock = 128; // at least one block! 
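Note that the explicit cudaCheck(cudaGetLastError()) after every launch disappears in this conversion, presumably because the wrapper performs its own status check. With a raw chevron launch the check has to stay next to the call, as in this generic sketch (hypothetical kernel, not CMSSW code):

    #include <cstdio>

    __global__ void pingKernel(int* out) { *out = 1; }

    void launchAndCheck(int* d_out, cudaStream_t stream) {
      pingKernel<<<1, 1, 0, stream>>>(d_out);
      cudaError_t err = cudaGetLastError();  // reports bad launch configurations
      if (err != cudaSuccess)
        printf("launch failed: %s\n", cudaGetErrorString(err));
    }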
int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; - gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), + cudautils::launch(gpuPixelDoublets::initDoublets,{blocks, threadsPerBlock, 0, stream},device_isOuterHitOfCell_.get(), nhits, device_theCellNeighbors_, device_theCellNeighborsContainer_.get(), device_theCellTracks_, device_theCellTracksContainer_.get()); - cudaCheck(cudaGetLastError()); } device_theCells_ = cudautils::make_device_unique(m_params.maxNumberOfDoublets_, stream); @@ -199,7 +193,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; dim3 blks(1, blocks, 1); dim3 thrs(stride, threadsPerBlock, 1); - gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), + cudautils::launch(gpuPixelDoublets::getDoubletsFromHisto,{blks, thrs, 0, stream},device_theCells_.get(), device_nCells_, device_theCellNeighbors_, device_theCellTracks_, @@ -211,7 +205,6 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr m_params.doZ0Cut_, m_params.doPtCut_, m_params.maxNumberOfDoublets_); - cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -225,54 +218,46 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = (Quality *)(&tracks_d->m_quality); - auto blockSize = 64; + int blockSize = 64; // classify tracks based on kinematics - auto numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_classifyTracks<<>>(tuples_d, tracks_d, m_params.cuts_, quality_d); - cudaCheck(cudaGetLastError()); + int numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_classifyTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, tracks_d, m_params.cuts_, quality_d); if (m_params.lateFishbone_) { // apply fishbone cleaning to good tracks numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_fishboneCleaner<<>>( + cudautils::launch(kernel_fishboneCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, device_theCells_.get(), device_nCells_, quality_d); - cudaCheck(cudaGetLastError()); } // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_fastDuplicateRemover<<>>( + cudautils::launch(kernel_fastDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - cudaCheck(cudaGetLastError()); if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { // fill hit->track "map" numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_countHitInTracks<<>>( + cudautils::launch(kernel_countHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream}, tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_fillHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, device_hitToTuple_.get()); } if (m_params.minHitsPerNtuplet_ < 4) { // remove duplicates (tracks that share a hit) numberOfBlocks = (HitToTuple::capacity() + 
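Several of these kernels use a two-dimensional configuration, dim3 blks(1, blocks, 1) against dim3 thrs(stride, threadsPerBlock, 1): the y components enumerate the work items while the small x extent gives each item a few cooperating lanes. A hypothetical skeleton of how such a launch is typically consumed; the indexing in the real kernels may differ:

    __global__ void stridedKernel(int nItems) {
      int item = blockIdx.y * blockDim.y + threadIdx.y;  // outer work item (y)
      int lane = threadIdx.x;                            // inner lane, 0 .. blockDim.x-1
      if (item >= nItems)
        return;
      for (int j = lane; j < 64; j += blockDim.x) {
        // the blockDim.x lanes share this item's inner work
      }
    }

    // launched as:  dim3 blks(1, (nItems + T - 1) / T, 1);  dim3 thrs(S, T, 1);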
blockSize - 1) / blockSize; - kernel_tripletCleaner<<>>( + cudautils::launch(kernel_tripletCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); } if (m_params.doStats_) { // counters (add flag???) numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); - cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_doStatsForHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},device_hitToTuple_.get(), counters_); numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); - cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_doStatsForTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, counters_); } #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -282,12 +267,12 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA #ifdef DUMP_GPU_TK_TUPLES static std::atomic iev(0); ++iev; - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( + cudautils::launch(kernel_print_found_ntuplets,{1, 32, 0, cudaStream}, hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); #endif } template <> void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { - kernel_printCounters<<<1, 1>>>(counters); + cudautils::launch(kernel_printCounters,{1, 1},counters); } From 324667ee67e349e6fd7729c7bdbddd930635017a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 14:31:26 +0100 Subject: [PATCH 14/22] compiles --- .../plugins/CAHitNtupletGeneratorKernels.cc | 173 +---------- .../plugins/CAHitNtupletGeneratorKernels.cu | 281 +---------------- .../CAHitNtupletGeneratorKernelsImpl.h | 284 ++++++++++++++++++ 3 files changed, 304 insertions(+), 434 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index b4cd1f8aa1abc..03a8b927289b8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -1,170 +1,17 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template <> -void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { - kernel_printCounters(counters); -} -template <> -void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) { - kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); -} +template +void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream); -template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); +template +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); -#ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; -#endif +template +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream); - // in principle we can use "nhits" to heuristically dimension the workspace... 
- device_isOuterHitOfCell_ = Traits:: template make_unique(std::max(1U,nhits),stream); - assert(device_isOuterHitOfCell_.get()); - gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); +template +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - device_theCells_ = Traits:: template make_unique(m_params.maxNumberOfDoublets_,stream); - if (0 == nhits) - return; // protect against empty events - - // FIXME avoid magic numbers - auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - assert(tuples_d && quality_d); - - // zero tuples - cudautils::launchZero(tuples_d, cudaStream); - - auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); - - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; - - // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive - // - - kernel_connect(device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - - if (nhits > 1 && m_params.earlyFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - } - - kernel_find_ntuplets(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_, - tuples_d, - device_hitTuple_apc_, - quality_d, - m_params.minHitsPerNtuplet_); - if (m_params.doStats_) - kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); - - cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); - - // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); - - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - - if (nhits > 1 && m_params.lateFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - } - - if (m_params.doStats_) { - kernel_checkOverflows(tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - 
device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - } -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, m_params.cuts_, quality_d); - - if (m_params.lateFishbone_) { - // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); - } - - // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - - // fill hit->track "map" - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - - // remove duplicates (tracks that share a hit) - kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - - if (m_params.doStats_) { - // counters (add flag???) - kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); - } - -#ifdef DUMP_GPU_TK_TUPLES - static std::atomic iev(0); - ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); -#endif -} +template +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index bf1461b2119c5..8939acff2ebc2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,278 +1,17 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +template +void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream); -template <> -void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto blockSize = 128; - int numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; +template +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - cudautils::launch(kernel_fillHitDetIndices,{numberOfBlocks, blockSize, 0, cudaStream}, - &tracks_d->hitIndices, hv, &tracks_d->detIndices); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} +template +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream); -template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - // these are pointer on GPU! 
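The change in these two files is easy to misread: the old code carried full specializations, template <> followed by a body repeated for each backend, while the new code keeps one definition in CAHitNtupletGeneratorKernelsImpl.h and merely asks each translation unit to instantiate it. In reduced form, with assumed names:

    // old: full specialization, the body is written out here per backend
    //   template <>
    //   void Kernels<CPUTraits>::buildDoublets(Hits const& hh, cudaStream_t s) { /* CPU body */ }

    // new: explicit instantiation, the single body from the Impl header is reused;
    // this line only forces this translation unit to emit that instantiation
    //   template
    //   void Kernels<CPUTraits>::buildDoublets(Hits const& hh, cudaStream_t s);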
- auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); +template +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - // zero tuples - cudautils::launchZero(tuples_d, cudaStream); - - auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); - - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; - - // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive - // - - auto nthTot = 64; - auto stride = 4; - auto blockSize = nthTot / stride; - int numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - auto rescale = numberOfBlocks / 65536; - blockSize *= (rescale + 1); - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - assert(numberOfBlocks < 65536); - assert(blockSize > 0 && 0 == blockSize % 16); - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - - cudautils::launch(kernel_connect,{blks, thrs, 0, cudaStream}, - device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - - if (nhits > 1 && m_params.earlyFishbone_) { - auto nthTot = 128; - auto stride = 16; - int blockSize = nthTot / stride; - int numberOfBlocks = (nhits + blockSize - 1) / blockSize; - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - } - - blockSize = 64; - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_find_ntuplets,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_, - tuples_d, - device_hitTuple_apc_, - quality_d, - m_params.minHitsPerNtuplet_); - - if (m_params.doStats_) - cudautils::launch(kernel_mark_used,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - blockSize = 128; - numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; -// cudautils::launch(cudautils::finalizeBulk,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); - cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); - - - // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_earlyDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, - device_theCells_.get(), device_nCells_, tuples_d, quality_d); - - blockSize = 128; - numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_countMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - cudautils::launch(kernel_fillMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, - 
tuples_d, quality_d, device_tupleMultiplicity_.get()); - - if (nhits > 1 && m_params.lateFishbone_) { - auto nthTot = 128; - auto stride = 16; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - } - - if (m_params.doStats_) { - numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; - cudautils::launch(kernel_checkOverflows,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - } -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); - -#ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; -#endif - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cudautils::make_device_unique(std::max(1U, nhits), stream); - assert(device_isOuterHitOfCell_.get()); - { - int threadsPerBlock = 128; - // at least one block! - int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; - cudautils::launch(gpuPixelDoublets::initDoublets,{blocks, threadsPerBlock, 0, stream},device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); - } - - device_theCells_ = cudautils::make_device_unique(m_params.maxNumberOfDoublets_, stream); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - if (0 == nhits) - return; // protect against empty events - - // FIXME avoid magic numbers - auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - int stride = 4; - int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; - int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; - dim3 blks(1, blocks, 1); - dim3 thrs(stride, threadsPerBlock, 1); - cudautils::launch(gpuPixelDoublets::getDoubletsFromHisto,{blks, thrs, 0, stream},device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - // these are pointer on GPU! 
- auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - int blockSize = 64; - - // classify tracks based on kinematics - int numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_classifyTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, tracks_d, m_params.cuts_, quality_d); - - if (m_params.lateFishbone_) { - // apply fishbone cleaning to good tracks - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_fishboneCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, - device_theCells_.get(), device_nCells_, quality_d); - } - - // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_fastDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, - device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - - if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { - // fill hit->track "map" - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_countHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream}, - tuples_d, quality_d, device_hitToTuple_.get()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); - cudaCheck(cudaGetLastError()); - cudautils::launch(kernel_fillHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, device_hitToTuple_.get()); - } - if (m_params.minHitsPerNtuplet_ < 4) { - // remove duplicates (tracks that share a hit) - numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - cudautils::launch(kernel_tripletCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - } - - if (m_params.doStats_) { - // counters (add flag???) 
- numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - cudautils::launch(kernel_doStatsForHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},device_hitToTuple_.get(), counters_); - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - cudautils::launch(kernel_doStatsForTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, counters_); - } -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - -#ifdef DUMP_GPU_TK_TUPLES - static std::atomic iev(0); - ++iev; - cudautils::launch(kernel_print_found_ntuplets,{1, 32, 0, cudaStream}, - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { - cudautils::launch(kernel_printCounters,{1, 1},counters); -} +template +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 6888e6725bc39..02e5de7f112ae 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -601,3 +601,287 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun c.nEmptyCells / double(c.nCells), c.nZeroTrackCells / double(c.nCells)); } + + + +// launchers + +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + + +template +void CAHitNtupletGeneratorKernels::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto blockSize = 128; + int numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + + cudautils::launch(kernel_fillHitDetIndices,{numberOfBlocks, blockSize, 0, cudaStream}, + &tracks_d->hitIndices, hv, &tracks_d->detIndices); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! 
+ auto *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + // zero tuples + cudautils::launchZero(tuples_d, cudaStream); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + auto nthTot = 64; + auto stride = 4; + auto blockSize = nthTot / stride; + int numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + auto rescale = numberOfBlocks / 65536; + blockSize *= (rescale + 1); + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + assert(numberOfBlocks < 65536); + assert(blockSize > 0 && 0 == blockSize % 16); + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + + cudautils::launch(kernel_connect,{blks, thrs, 0, cudaStream}, + device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + + if (nhits > 1 && m_params.earlyFishbone_) { + auto nthTot = 128; + auto stride = 16; + int blockSize = nthTot / stride; + int numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + } + + blockSize = 64; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_find_ntuplets,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_, + tuples_d, + device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); + + if (m_params.doStats_) + cudautils::launch(kernel_mark_used,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + blockSize = 128; + numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; +// cudautils::launch(cudautils::finalizeBulk,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); +#ifdef __CUDACC__ + cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); +#else + cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); +#endif + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_earlyDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, tuples_d, quality_d); + + blockSize = 128; + numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_countMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cudautils::launch(kernel_fillMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, 
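finalizeBulk is the one kernel left on a raw chevron launch; the commented-out wrapper call above suggests a direct port was tried and set aside. A reconstruction of the guarded launch, with the chevron arguments taken from that commented-out line: under nvcc it is a real device launch, otherwise the function is simply called on the host.

    #ifdef __CUDACC__
      cudautils::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_, tuples_d);
    #else
      cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d);
    #endif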
device_tupleMultiplicity_.get()); + + if (nhits > 1 && m_params.lateFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + } + + if (m_params.doStats_) { + numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + cudautils::launch(kernel_checkOverflows,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + device_isOuterHitOfCell_.get(), + nhits, + m_params.maxNumberOfDoublets_, + counters_); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + device_isOuterHitOfCell_ = TTraits:: template make_unique(std::max(1U, nhits), stream); + assert(device_isOuterHitOfCell_.get()); + { + int threadsPerBlock = 128; + // at least one block! + int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; + cudautils::launch(gpuPixelDoublets::initDoublets,{blocks, threadsPerBlock, 0, stream},device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_, + device_theCellNeighborsContainer_.get(), + device_theCellTracks_, + device_theCellTracksContainer_.get()); + } + + device_theCells_ = TTraits:: template make_unique(m_params.maxNumberOfDoublets_, stream); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (m_params.minHitsPerNtuplet_ > 3) { + nActualPairs = 13; + } + + assert(nActualPairs <= gpuPixelDoublets::nPairs); + int stride = 4; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; + int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; + dim3 blks(1, blocks, 1); + dim3 thrs(stride, threadsPerBlock, 1); + cudautils::launch(gpuPixelDoublets::getDoubletsFromHisto,{blks, thrs, 0, stream},device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZ0Cut_, + m_params.doPtCut_, + m_params.maxNumberOfDoublets_); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! 
+ auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + int blockSize = 64; + + // classify tracks based on kinematics + int numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_classifyTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, tracks_d, m_params.cuts_, quality_d); + + if (m_params.lateFishbone_) { + // apply fishbone cleaning to good tracks + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_fishboneCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, quality_d); + } + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_fastDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + + if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { + // fill hit->track "map" + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_countHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_hitToTuple_.get()); + cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_fillHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, device_hitToTuple_.get()); + } + if (m_params.minHitsPerNtuplet_ < 4) { + // remove duplicates (tracks that share a hit) + numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + cudautils::launch(kernel_tripletCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + } + + if (m_params.doStats_) { + // counters (add flag???) 
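+    // fill the monitoring counters from the hit->track map and from the tuple/quality arrays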
+ numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + cudautils::launch(kernel_doStatsForHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},device_hitToTuple_.get(), counters_); + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_doStatsForTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, counters_); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + cudautils::launch(kernel_print_found_ntuplets,{1, 32, 0, cudaStream}, + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); +#endif +} + +template +void CAHitNtupletGeneratorKernels::printCounters(Counters const *counters) { + cudautils::launch(kernel_printCounters,{1, 1,0, cudaStreamDefault},counters); +} From 0740845c25a07ee0818e278179622692542c630a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 15:51:25 +0100 Subject: [PATCH 15/22] fully ported to launch and traits --- .../plugins/CAHitNtupletGeneratorOnGPU.h | 6 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 4 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cc | 113 +-------------- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 130 +---------------- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 131 ++++++++++++++++++ 5 files changed, 142 insertions(+), 242 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index dac6faa5188b9..6e92c1b217cfb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -89,11 +89,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hit HelixFitOnGPU fitter(bfield, m_params.fit5as4_); fitter.allocate(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - if constexpr (Traits::runOnDevice) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - }else { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } else { fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 11ec0a663aed3..cf877dbd11363 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -44,13 +44,13 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocate(); } void setBField(double bField) { bField_ = bField; } + + template void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); template void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); void deallocate(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc 
b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index 3476362864a79..f77f9a4fb9a41 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,110 +1,7 @@ #include "RiemannFitOnGPU.h" -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { - assert(tuples_d); - - // Fit internals - auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - auto circle_fit_resultsGPU_holder = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // triplets - kernelFastFit<3>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<3>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<3>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - // quads - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<4>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<4>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - if (fit5as4_) { - // penta - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<4>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<4>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - } else { - // penta all 5 - kernelFastFit<5>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<5>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<5>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - } - } -} +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index cb5d32b47aea3..011b7842f9681 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ 
-1,131 +1,7 @@ #include "RiemannFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, - cudaStream_t stream) { - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); - auto circle_fit_resultsGPU_holder = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // triplets - kernelFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - // quads - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - if (fit5as4_) { - // penta - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - } else { - // penta all 5 - kernelFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - 
fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - } - } -} + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index b46de519034d5..c6345ba6eab3b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -187,3 +187,134 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ #endif } } +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + int blockSize = 64; + int numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto circle_fit_resultsGPU_holder = + Traits:: template make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); + Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets + cudautils::launch(kernelFastFit<3>,{numberOfBlocks, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + 3, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + // quads + cudautils::launch(kernelFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 4, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // penta + cudautils::launch(kernelFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, 
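+                        // nHits is 5 here, but with fit5as4_ the <4> kernels are used: 5-hit tuples are fitted as quadruplets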
+ bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } else { + // penta all 5 + cudautils::launch(kernelFastFit<5>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } + } +} From 9391df8a78717754e24aa2cd064b55f431dd0717 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 18:08:40 +0100 Subject: [PATCH 16/22] fix the decltype --- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 02e5de7f112ae..19df8f5e249a2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -699,12 +699,7 @@ void CAHitNtupletGeneratorKernels::launchKernels(HitsOnCPU const &hh, T #endif blockSize = 128; numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; -// cudautils::launch(cudautils::finalizeBulk,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); -#ifdef __CUDACC__ - cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); -#else - cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); -#endif + cudautils::launch(cudautils::finalizeBulk::type>,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; From 70d36561147f4f746a79ef4d16486beb264cf3b8 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 18:18:56 +0100 Subject: [PATCH 17/22] revert to nullptr --- HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 6fbaced1858dd..5f29ad3b81ea8 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -72,7 +72,7 @@ namespace cudautils { inline void launchFinalize(Histo *__restrict__ h, uint8_t *__restrict__ ws #ifndef __CUDACC__ - = cudaStreamDefault + = nullptr #endif , cudaStream_t stream From 51f5342fb55994784b85ba89253ab6ac3dff850c Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 Dec 2019 19:15:45 +0100 Subject: [PATCH 
18/22] migrate to malloc --- CUDADataFormats/Common/interface/HostProduct.h | 7 ++++--- .../SiPixelCluster/interface/SiPixelClustersCUDA.h | 1 - CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc | 1 - .../SiPixelRecHits/plugins/SiPixelRecHitConverter.cc | 2 +- .../SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc | 2 +- .../SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc | 2 +- 6 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h index 17ad98ba403a4..0a706b6c57927 100644 --- a/CUDADataFormats/Common/interface/HostProduct.h +++ b/CUDADataFormats/Common/interface/HostProduct.h @@ -2,6 +2,7 @@ #define CUDADataFormatsCommonHostProduct_H #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" // a heterogeneous unique pointer... template @@ -13,9 +14,9 @@ class HostProduct { HostProduct& operator=(HostProduct&&) = default; explicit HostProduct(cudautils::host::unique_ptr&& p) : hm_ptr(std::move(p)) {} - explicit HostProduct(std::unique_ptr&& p) : std_ptr(std::move(p)) {} + explicit HostProduct(cudautils::cpu::unique_ptr&& p) : cm_ptr(std::move(p)) {} - auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); } + auto const* get() const { return hm_ptr ? hm_ptr.get() : cm_ptr.get(); } auto const& operator*() const { return *get(); } @@ -23,7 +24,7 @@ class HostProduct { private: cudautils::host::unique_ptr hm_ptr; //! - std::unique_ptr std_ptr; //! + cudautils::cpu::unique_ptr cm_ptr; //! }; #endif diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index d3650e164d44e..afb7c5aa37833 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -2,7 +2,6 @@ #define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc index e6f223bfec4e3..60d2693fa45b0 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc @@ -1,7 +1,6 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" template <> diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc index 945bbb28a3262..608c65eff83d1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc @@ -105,7 +105,7 @@ namespace cms { const edmNew::DetSetVector& input = *inputhandle; // yes a unique ptr of a unique ptr so edm is happy and the pointer stay still... 
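+  // use cudautils::make_cpu_unique (plain malloc/free) so the buffer matches the cpu::unique_ptr branch of HostProduct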
- auto hmsp = std::make_unique(gpuClustering::MaxNumModules + 1); + auto hmsp = cudautils::make_cpu_unique(gpuClustering::MaxNumModules + 1,cudaStreamDefault); auto hitsModuleStart = hmsp.get(); auto hms = std::make_unique(std::move(hmsp)); // hmsp is gone iEvent.put(std::move(hms)); // hms is gone! hitsModuleStart still alive and kicking... diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc index a4f19ac276a7a..7e840e1b266e1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc @@ -81,7 +81,7 @@ void SiPixelRecHitFromSOA::acquire(edm::Event const& iEvent, void SiPixelRecHitFromSOA::produce(edm::Event& iEvent, edm::EventSetup const& es) { // yes a unique ptr of a unique ptr so edm is happy auto sizeOfHitModuleStart = gpuClustering::MaxNumModules + 1; - auto hmsp = std::make_unique(sizeOfHitModuleStart); + auto hmsp = cudautils::make_cpu_unique(sizeOfHitModuleStart,cudaStreamDefault); std::copy(m_hitsModuleStart.get(), m_hitsModuleStart.get() + sizeOfHitModuleStart, hmsp.get()); auto hms = std::make_unique(std::move(hmsp)); // hmsp is gone iEvent.put(std::move(hms)); // hms is gone! diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index 7900cf8b2289a..d286583cbc822 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -103,7 +103,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv auto const& input = *hclusters; // yes a unique ptr of a unique ptr so edm is happy and the pointer stay still... - auto hmsp = std::make_unique(gpuClustering::MaxNumModules + 1); + auto hmsp = cudautils::make_cpu_unique(gpuClustering::MaxNumModules + 1,cudaStreamDefault); auto hitsModuleStart = hmsp.get(); auto hms = std::make_unique(std::move(hmsp)); // hmsp is gone iEvent.put(tokenModuleStart_, std::move(hms)); // hms is gone! hitsModuleStart still alive and kicking... From 04465cf8329d0e6f4e2f87638b1497f87602e00c Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 8 Dec 2019 11:15:57 +0100 Subject: [PATCH 19/22] add memset for cpu as well (just for symmetry) --- .../CUDAUtilities/interface/memsetAsync.h | 13 +++++++++++++ .../CUDAUtilities/test/cpu_unique_ptr_t.cpp | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index 203aac78a165c..819db79d62497 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -3,8 +3,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" #include +#include namespace cudautils { template @@ -15,6 +17,11 @@ namespace cudautils { cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); } + template + inline void memsetAsync(cudautils::cpu::unique_ptr& ptr, T value, cudaStream_t) { + ::memset(ptr.get(), value, sizeof(T)); + } + /** * The type of `value` is `int` because of `cudaMemsetAsync()` takes * it as an `int`. 
Note that `cudaMemsetAsync()` sets the value of @@ -25,6 +32,12 @@ namespace cudautils { inline void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cudaStream_t stream) { cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream)); } + template + inline void memsetAsync(cudautils::cpu::unique_ptr& ptr, int value, size_t nelements, cudaStream_t) { + ::memset(ptr.get(), value, nelements * sizeof(T)); + } + + } // namespace cudautils #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp index 828b7912ed6c4..4a293c6760b25 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -1,12 +1,16 @@ #include "catch.hpp" #include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Single element") { auto ptr = cudautils::make_cpu_unique(cudaStreamDefault); REQUIRE(ptr != nullptr); + *ptr = 1; + cudautils::memsetAsync(ptr,0,cudaStreamDefault); + REQUIRE(0==*ptr); } SECTION("Reset") { @@ -20,6 +24,10 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Multiple elements") { auto ptr = cudautils::make_cpu_unique(10,cudaStreamDefault); REQUIRE(ptr != nullptr); + for (int i=0; i<10; ++i) ptr[i]=1; + cudautils::memsetAsync(ptr,0,10,cudaStreamDefault); + int s=0; for (int i=0; i<10; ++i) s+=ptr[i]; + REQUIRE(0==s); } SECTION("Allocating too much") { From 36b834324edd0eb850c05c5d49f2b2c4e89855f3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 8 Dec 2019 11:19:46 +0100 Subject: [PATCH 20/22] oops forgot to compile test --- HeterogeneousCore/CUDAUtilities/test/BuildFile.xml | 2 +- HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 893d1ab1745eb..88a12c789ed8f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -77,7 +77,7 @@ - + diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp index 4a293c6760b25..e945464c26b74 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp @@ -33,7 +33,6 @@ TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") { SECTION("Allocating too much") { constexpr size_t maxSize = 1 << 30; // 8**10 auto ptr = cudautils::make_cpu_unique(maxSize+1,cudaStreamDefault); - ptr.reset(); REQUIRE(ptr != nullptr); } } From 6928c3953e7e1269cb66d1e4a4004b36873d1041 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 8 Dec 2019 12:31:42 +0100 Subject: [PATCH 21/22] almost single source! 
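
The test analyzer now shares a single templated implementation between the CPU
and the GPU builds: the __global__ kernel and the templated launcher live in one
header, and each backend translation unit only provides an explicit instantiation
for its Traits type, while the kernel itself works on the common SoA view and
needs no template at all. A minimal sketch of the pattern (the Traits type names
below are illustrative placeholders, not the real class names):

    // RecHitSoATest.h : kernel + definition of the templated launcher (shown here only as a signature)
    template <typename Traits>
    void analyzeImpl(TrackingRecHit2DHeterogeneous<Traits> const& hits, cudaStream_t stream);

    // RecHitSoATest.cu : compiled by nvcc, instantiates the GPU flavour
    template void analyzeImpl<GPUTraits>(TrackingRecHit2DHeterogeneous<GPUTraits> const&, cudaStream_t);

    // RecHitSoATest.cc : compiled by the host compiler, cudaCompat maps the kernel onto the CPU
    template void analyzeImpl<CPUTraits>(TrackingRecHit2DHeterogeneous<CPUTraits> const&, cudaStream_t);
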
--- .../SiPixelRecHits/test/BuildFile.xml | 9 ++ .../SiPixelRecHits/test/RecHitSoATest.cc | 87 +++++++++++++++++++ .../SiPixelRecHits/test/RecHitSoATest.cu | 4 + .../SiPixelRecHits/test/RecHitSoATest.h | 55 ++++++++++++ 4 files changed, 155 insertions(+) create mode 100644 RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc create mode 100644 RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu create mode 100644 RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h diff --git a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml index d8f6b2a35880a..2c2544390f401 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml @@ -21,3 +21,12 @@ + + + + + + + + + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc new file mode 100644 index 0000000000000..9970e9c9f83fb --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc @@ -0,0 +1,87 @@ +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & ghits, cudaStream_t stream); + +/* +TrackingRecHit2DCUDA and TrackingRecHit2DCPU are NOT the same type (for tracks and vertices are the same type) +they are templated with the (GPU/CPU)Traits so in principle the whole Analyzer can be (partially) templated as well +they return the same view type though (so the real analyzer algo in the header file above does not need to be templated) +*/ + +class RecHitSoATest : public edm::global::EDAnalyzer<> { +public: + + explicit RecHitSoATest(const edm::ParameterSet& iConfig); + ~RecHitSoATest() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; + const bool m_onGPU; + edm::EDGetTokenT> tGpuHits; + edm::EDGetTokenT tCpuHits; +}; + +RecHitSoATest::RecHitSoATest(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { + if (m_onGPU) { + tGpuHits = + consumes>(iConfig.getParameter("heterogeneousPixelRecHitSrc")); + } else { + tCpuHits = + consumes(iConfig.getParameter("heterogeneousPixelRecHitSrc")); + } +} + + +/* + most of (all?) the code below is boiler-plate. we hope in future to fully wrap it so that a single (templated?) 
source will work for both gpu and cpu +*/ +void RecHitSoATest::analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const { + if (m_onGPU) { + auto const & gh = iEvent.get(tGpuHits); + CUDAScopedContextProduce ctx{gh}; + + analyzeImpl(ctx.get(gh),ctx.stream()); + + } else { + analyzeImpl(iEvent.get(tCpuHits),cudaStreamDefault); + } +} + +void RecHitSoATest::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("onGPU", true); + desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + descriptions.add("RecHitSoATest", desc); +} + + +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & ghits, cudaStream_t stream); + + +DEFINE_FWK_MODULE(RecHitSoATest); diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu new file mode 100644 index 0000000000000..0553db894f398 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu @@ -0,0 +1,4 @@ +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream); + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h new file mode 100644 index 0000000000000..5fd779e29a21c --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h @@ -0,0 +1,55 @@ +#include + +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" + +__global__ +void analyzeHits(TrackingRecHit2DSOAView const* hhp, uint32_t nhits, double * ws) { + + + auto const& hh = *hhp; + auto first = blockIdx.x * blockDim.x + threadIdx.x; + + if (0 == first) + printf("in analyzeClusterTP\n"); + + __shared__ int nh; + __shared__ double sch; + + nh=0; sch=0; + __syncthreads(); + + for (int i = first, ni=nhits; i < ni; i += gridDim.x * blockDim.x) { + if (hh.charge(i) <=0) printf("a hit with zero charge?\n"); + atomicAdd(&nh,1); + atomicAdd(&sch,(double)(hh.charge(i))); + } + + __syncthreads(); + + if(0==threadIdx.x) { + atomicAdd(&ws[0],nh); + atomicAdd(&ws[1],sch); + atomicAdd(&ws[9],1); + if (gridDim.x==ws[9]) { + printf("in event : found %f hits, average charge is %f\n",ws[0], ws[1]/ws[0]); + } + } + +} + +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream) { + auto nhits = gHits.nHits(); + + if (0 == nhits) return; + + auto ws_d = Traits:: template make_unique(10,stream); + cudautils::memsetAsync(ws_d, 0, 10, stream); + int threadsPerBlock = 256; + int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; + cudautils::launch(analyzeHits,{threadsPerBlock,blocks,0,stream} , gHits.view(), nhits, ws_d.get()); + +} From d685dfea9e8c0e137a24e1bc9252d7e2e4aa7459 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 8 Dec 2019 12:55:53 +0100 Subject: [PATCH 22/22] no way, works only with three files --- .../SiPixelRecHits/test/BuildFile.xml | 2 +- .../SiPixelRecHits/test/RecHitSoATest.cc | 9 --------- .../SiPixelRecHits/test/RecHitSoATest.h | 20 +++++++++++++++++-- .../SiPixelRecHits/test/RecHitSoATestCPU.cc | 4 ++++ .../{RecHitSoATest.cu => RecHitSoATestGPU.cu} | 0 5 files changed, 23 insertions(+), 12 deletions(-) create mode 100644 
RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc rename RecoLocalTracker/SiPixelRecHits/test/{RecHitSoATest.cu => RecHitSoATestGPU.cu} (100%) diff --git a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml index 2c2544390f401..bae77ee5d8fcf 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml @@ -23,7 +23,7 @@ - + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc index 9970e9c9f83fb..3b2cb0a921e19 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc @@ -56,9 +56,6 @@ RecHitSoATest::RecHitSoATest(const edm::ParameterSet& iConfig) : m_onGPU(iConfig } -/* - most of (all?) the code below is boiler-plate. we hope in future to fully wrap it so that a single (templated?) source will work for both gpu and cpu -*/ void RecHitSoATest::analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const { if (m_onGPU) { auto const & gh = iEvent.get(tGpuHits); @@ -78,10 +75,4 @@ void RecHitSoATest::fillDescriptions(edm::ConfigurationDescriptions& description descriptions.add("RecHitSoATest", desc); } - -#include "RecHitSoATest.h" -template -void analyzeImpl(TrackingRecHit2DHeterogeneous const & ghits, cudaStream_t stream); - - DEFINE_FWK_MODULE(RecHitSoATest); diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h index 5fd779e29a21c..1032501abe3fc 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h @@ -8,12 +8,21 @@ __global__ void analyzeHits(TrackingRecHit2DSOAView const* hhp, uint32_t nhits, double * ws) { + assert(hhp); + assert(ws); auto const& hh = *hhp; auto first = blockIdx.x * blockDim.x + threadIdx.x; +#ifdef CUDA_KERNELS_ON_CPU + std::cout << "kernel ON CPU!" << std::endl; +#else + if (0 == first) printf("kernel ON GPU!\n"); +#endif + + if (0 == first) - printf("in analyzeClusterTP\n"); + printf("in analyzeHits SoA\n"); __shared__ int nh; __shared__ double sch; @@ -42,6 +51,14 @@ void analyzeHits(TrackingRecHit2DSOAView const* hhp, uint32_t nhits, double * ws template void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream) { +#ifdef CUDA_KERNELS_ON_CPU + std::cout << "launching ON CPU!" 
<< std::endl; + assert(!Traits::runOnDevice); +#else + printf("launching ON GPU!\n"); + assert(Traits::runOnDevice); +#endif + auto nhits = gHits.nHits(); if (0 == nhits) return; @@ -51,5 +68,4 @@ void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream int threadsPerBlock = 256; int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; cudautils::launch(analyzeHits,{threadsPerBlock,blocks,0,stream} , gHits.view(), nhits, ws_d.get()); - } diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc new file mode 100644 index 0000000000000..aa9ab7168b6d3 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc @@ -0,0 +1,4 @@ +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream); + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu similarity index 100% rename from RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cu rename to RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu