Use unified memory for conditions #157

Open · wants to merge 12 commits into base CMSSW_10_4_X_Patatrack
@@ -1,7 +1,7 @@
#ifndef CalibTracker_SiPixelESProducers_SiPixelGainCalibrationForHLTGPU_H
#define CalibTracker_SiPixelESProducers_SiPixelGainCalibrationForHLTGPU_H

#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAESManaged.h"
#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h"

#include <cuda/api_wrappers.h>
@@ -19,14 +19,8 @@ class SiPixelGainCalibrationForHLTGPU {
const SiPixelGainForHLTonGPU *getGPUProductAsync(cuda::stream_t<>& cudaStream) const;

private:
const SiPixelGainCalibrationForHLT *gains_ = nullptr;
SiPixelGainForHLTonGPU *gainForHLTonHost_ = nullptr;
struct GPUData {
~GPUData();
SiPixelGainForHLTonGPU *gainForHLTonGPU = nullptr;
SiPixelGainForHLTonGPU_DecodingStructure *gainDataOnGPU = nullptr;
};
CUDAESProduct<GPUData> gpuData_;
CUDAESManaged helper_;
SiPixelGainForHLTonGPU *gainForHLT_ = nullptr;
};

#endif
@@ -7,8 +7,7 @@

#include <cuda.h>

SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT& gains, const TrackerGeometry& geom):
gains_(&gains)
SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT& gains, const TrackerGeometry& geom)
{
// bizarre logic (looking for first strip-det), don't ask
auto const & dus = geom.detUnits();
@@ -25,8 +24,7 @@ SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGa
std::cout << "sizes " << sizeof(char) << ' ' << sizeof(uint8_t) << ' ' << sizeof(SiPixelGainForHLTonGPU::DecodingStructure) << std::endl;
*/

cudaCheck(cudaMallocHost((void**) & gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU)));
//gainForHLTonHost_->v_pedestals = gainDataOnGPU_; // how to do this?
helper_.allocate(&gainForHLT_, 1);

// do not read back from the (possibly write-combined) memory buffer
auto minPed = gains.getPedLow();
@@ -36,21 +34,21 @@ SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGa
auto nBinsToUseForEncoding = 253;

// we will simplify later (not everything is needed....)
gainForHLTonHost_->minPed_ = minPed;
gainForHLTonHost_->maxPed_ = maxPed;
gainForHLTonHost_->minGain_= minGain;
gainForHLTonHost_->maxGain_= maxGain;
gainForHLT_->minPed_ = minPed;
gainForHLT_->maxPed_ = maxPed;
gainForHLT_->minGain_= minGain;
gainForHLT_->maxGain_= maxGain;

gainForHLTonHost_->numberOfRowsAveragedOver_ = 80;
gainForHLTonHost_->nBinsToUseForEncoding_ = nBinsToUseForEncoding;
gainForHLTonHost_->deadFlag_ = 255;
gainForHLTonHost_->noisyFlag_ = 254;
gainForHLT_->numberOfRowsAveragedOver_ = 80;
gainForHLT_->nBinsToUseForEncoding_ = nBinsToUseForEncoding;
gainForHLT_->deadFlag_ = 255;
gainForHLT_->noisyFlag_ = 254;

gainForHLTonHost_->pedPrecision = static_cast<float>(maxPed - minPed) / nBinsToUseForEncoding;
gainForHLTonHost_->gainPrecision = static_cast<float>(maxGain - minGain) / nBinsToUseForEncoding;
gainForHLT_->pedPrecision = static_cast<float>(maxPed - minPed) / nBinsToUseForEncoding;
gainForHLT_->gainPrecision = static_cast<float>(maxGain - minGain) / nBinsToUseForEncoding;

/*
std::cout << "precisions g " << gainForHLTonHost_->pedPrecision << ' ' << gainForHLTonHost_->gainPrecision << std::endl;
std::cout << "precisions g " << gainForHLT_->pedPrecision << ' ' << gainForHLT_->gainPrecision << std::endl;
*/

// fill the index map
@@ -68,31 +66,21 @@ SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGa
assert(0==p->iend%2);
assert(p->ibegin!=p->iend);
assert(p->ncols>0);
gainForHLTonHost_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(p->ibegin,p->iend), p->ncols);
gainForHLT_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(p->ibegin,p->iend), p->ncols);
// if (ind[i].detid!=dus[i]->geographicalId()) std::cout << ind[i].detid<<"!="<<dus[i]->geographicalId() << std::endl;
// gainForHLTonHost_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(ind[i].ibegin,ind[i].iend), ind[i].ncols);
// gainForHLT_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(ind[i].ibegin,ind[i].iend), ind[i].ncols);
}

}
helper_.allocate(&(gainForHLT_->v_pedestals), gains.data().size(), sizeof(char)); // override the element size because essentially we reinterpret_cast on the fly
std::memcpy(gainForHLT_->v_pedestals, gains.data().data(), gains.data().size()*sizeof(char));

SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() {
cudaCheck(cudaFreeHost(gainForHLTonHost_));
helper_.advise();
}

SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() {
cudaCheck(cudaFree(gainForHLTonGPU));
cudaCheck(cudaFree(gainDataOnGPU));
SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() {
}

const SiPixelGainForHLTonGPU *SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cuda::stream_t<>& cudaStream) const {
const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cuda::stream_t<>& stream) {
cudaCheck(cudaMalloc((void**) & data.gainForHLTonGPU, sizeof(SiPixelGainForHLTonGPU)));
cudaCheck(cudaMalloc((void**) & data.gainDataOnGPU, this->gains_->data().size())); // TODO: this could be changed to cuda::memory::device::unique_ptr<>
// gains.data().data() is used also for non-GPU code, we cannot allocate it on aligned and write-combined memory
cudaCheck(cudaMemcpyAsync(data.gainDataOnGPU, this->gains_->data().data(), this->gains_->data().size(), cudaMemcpyDefault, stream.id()));

cudaCheck(cudaMemcpyAsync(data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals), &(data.gainDataOnGPU), sizeof(SiPixelGainForHLTonGPU_DecodingStructure*), cudaMemcpyDefault, stream.id()));
});
return data.gainForHLTonGPU;
helper_.prefetchAsync(cudaStream);
return gainForHLT_;
}
61 changes: 61 additions & 0 deletions Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
@@ -2,6 +2,7 @@
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h

#include <cstdint>
#include <array>

namespace phase1PixelTopology {

@@ -29,6 +30,66 @@ namespace phase1PixelTopology {
};


template<class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
-> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
{
return {{ f(Indices)... }};
}

template<int N, class Function>
constexpr auto map_to_array(Function f)
-> std::array<typename std::result_of<Function(std::size_t)>::type, N>
{
return map_to_array_helper(f, std::make_index_sequence<N>{});
}


constexpr uint32_t findMaxModuleStride() {
bool go = true;
int n=2;
while (go) {
for (uint8_t i=1; i<11; ++i) {
if (layerStart[i]%n !=0) {go=false; break;}
}
if(!go) break;
n*=2;
}
return n/2;
}

constexpr uint32_t maxModuleStride = findMaxModuleStride();


constexpr uint8_t findLayer(uint32_t detId) {
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}

constexpr uint8_t findLayerFromCompact(uint32_t detId) {
detId*=maxModuleStride;
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}


constexpr uint32_t layerIndexSize = numberOfModules/maxModuleStride;
constexpr std::array<uint8_t,layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);

constexpr bool validateLayerIndex() {
bool res=true;
for (auto i=0U; i<numberOfModules; ++i) {
auto j = i/maxModuleStride;
res &=(layer[j]<10);
res &=(i>=layerStart[layer[j]]);
res &=(i<layerStart[layer[j]+1]);
}
return res;
}

static_assert(validateLayerIndex(),"layer from detIndex algo is buggy");


// this is for the ROC n<512 (upgrade 1024)
constexpr inline
uint16_t divu52(uint16_t n) {
@@ -141,5 +141,13 @@ int main() {
assert(std::get<1>(ori)==bp);
}

using namespace phase1PixelTopology;
for (auto i=0U; i<numberOfModules; ++i) {
assert(layer[i]<10);
assert(i>=layerStart[layer[i]]);
assert(i<layerStart[layer[i]+1]);
}


return 0;
}
57 changes: 57 additions & 0 deletions HeterogeneousCore/CUDACore/interface/CUDAESManaged.h
@@ -0,0 +1,57 @@
#ifndef HeterogeneousCore_CUDACore_interface_CUDAESManaged_h
#define HeterogeneousCore_CUDACore_interface_CUDAESManaged_h

#include <atomic>
#include <vector>

#include <cuda_runtime.h>
#include <cuda/api_wrappers.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

/**
* Class to help with memory allocations for ESProducts. Each CUDA
* ESProduct wrapper should
* - include an instance of this class as a member
* - use allocate() to allocate the memory buffers
* - call advise() after filling the buffers on the CPU
* - call prefetchAsync() before returning the CUDA ESProduct
*
* It owns the allocated memory and frees it in its destructor.
*/
class CUDAESManaged {
public:
CUDAESManaged();
~CUDAESManaged();

template <typename T>
T *allocate(size_t elements, size_t elementSize=sizeof(T)) {
T *ptr = nullptr;
auto size = elementSize*elements;
cudaCheck(cudaMallocManaged(&ptr, size));
buffers_.emplace_back(ptr, size);
return ptr;
}

template <typename T>
void allocate(T **ptr, size_t elements, size_t elementSize=sizeof(T)) {
*ptr = allocate<T>(elements, elementSize);
}

// Record a buffer allocated elsewhere to be used in advise/prefetch
/*
void record(void *ptr, size_t size) {
buffers_.emplace_back(ptr, size);
}
*/

void advise() const;

void prefetchAsync(cuda::stream_t<>& stream) const;

private:
std::vector<std::pair<void *, size_t> > buffers_;
mutable std::vector<std::atomic<bool>> prefetched_;
};

#endif
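
For illustration, here is a minimal usage sketch of the CUDAESManaged helper following the sequence described in the class comment (allocate, fill on the CPU, advise, prefetch). The ExampleConditionsGPU wrapper, its float payload, and its method names are made up for this sketch and are not part of the PR; only the CUDAESManaged calls come from the header above.

#include <algorithm>
#include <vector>

#include <cuda/api_wrappers.h>

#include "HeterogeneousCore/CUDACore/interface/CUDAESManaged.h"

// Hypothetical ES-product wrapper; illustrates the intended call sequence only.
class ExampleConditionsGPU {
public:
  explicit ExampleConditionsGPU(const std::vector<float>& values) {
    // allocate a unified-memory buffer through the helper (the helper keeps ownership)
    helper_.allocate(&data_, values.size());
    // fill the buffer on the CPU
    std::copy(values.begin(), values.end(), data_);
    // mark the buffers as read-mostly now that they are fully populated
    helper_.advise();
  }

  const float* getGPUProductAsync(cuda::stream_t<>& cudaStream) const {
    // make sure the buffers are resident on the device of this stream
    helper_.prefetchAsync(cudaStream);
    return data_;
  }

private:
  CUDAESManaged helper_;
  float* data_ = nullptr;
};

Since the helper frees the allocations in its own destructor, the wrapper needs no explicit cleanup of its own.
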
93 changes: 0 additions & 93 deletions HeterogeneousCore/CUDACore/interface/CUDAESProduct.h

This file was deleted.

35 changes: 35 additions & 0 deletions HeterogeneousCore/CUDACore/src/CUDAESManaged.cc
@@ -0,0 +1,35 @@
#include "HeterogeneousCore/CUDACore/interface/CUDAESManaged.h"
#include "HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h"

CUDAESManaged::CUDAESManaged(): prefetched_(numberOfCUDADevices()) {
for(auto& pref: prefetched_) {
pref.store(false);
}
}

CUDAESManaged::~CUDAESManaged() {
for(auto& ptrSize: buffers_) {
cudaFree(ptrSize.first);
}
}

void CUDAESManaged::advise() const {
for(const auto& ptrSize: buffers_) {
cudaCheck(cudaMemAdvise(ptrSize.first, ptrSize.second, cudaMemAdviseSetReadMostly, 0)); // device is ignored for this advise
}
}

void CUDAESManaged::prefetchAsync(cuda::stream_t<>& stream) const {
// The boolean atomic is an optimization attempt; it doesn't really
// matter if more than one thread/edm stream issues the prefetches,
// as long as most of the prefetches are avoided.
auto& pref = prefetched_[stream.device_id()];
if(pref.load())
return;
Review comment (from the author):

Looking through the code again I noticed that here we/I have a single flag to control behaviour for all devices. I think this should be changed to either

  1. the atomic<bool> flag should be changed to per-device flags, or
  2. issue the prefetch for all devices below

In principle the latter (2) could make more sense, as we expect all devices to run all code (and thus need all GPU conditions) anyway. But an async prefetch also needs a CUDA stream, so it needs a bit more thought: basically CUDAESManaged would hold additional CUDA streams (one per device) and use CUDA events to "synchronize" with the argument stream (otherwise there would be no gain from the asynchrony). The atomic<bool> flag would then effectively be replaced with a cudaEventQuery() on the per-device event. I would naively expect atomic<bool> to be faster than cudaEventQuery(), which would favor option 1. (A rough sketch of option 2 follows after this file.)


for(const auto& ptrSize: buffers_) {
cudaMemPrefetchAsync(ptrSize.first, ptrSize.second, stream.device_id(), stream.id());
}

pref.store(true);
}
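
As context for the review comment above, here is a rough sketch of option (2): issuing the prefetches for all devices from helper-owned per-device streams, and replacing the atomic<bool> with a per-device CUDA event that the caller's stream can wait on. This is a speculative illustration using the raw CUDA runtime API rather than the cuda-api-wrappers types used in this PR; the class name, its methods, and the omission of error checking and resource cleanup are assumptions of the sketch, not part of the proposed change.

#include <utility>
#include <vector>

#include <cuda_runtime.h>

// Speculative helper for option (2): prefetch managed buffers on every device
// once, and synchronize later consumers via per-device events.
class PrefetchAllDevices {
public:
  explicit PrefetchAllDevices(int nDevices): streams_(nDevices), events_(nDevices) {
    for(int d = 0; d < nDevices; ++d) {
      cudaSetDevice(d);
      cudaStreamCreateWithFlags(&streams_[d], cudaStreamNonBlocking);
      cudaEventCreateWithFlags(&events_[d], cudaEventDisableTiming);
    }
  }

  // issue the prefetches on every device and record a per-device event
  void prefetchAllAsync(const std::vector<std::pair<void*, size_t>>& buffers) {
    int current = 0;
    cudaGetDevice(&current);
    for(int d = 0; d < static_cast<int>(streams_.size()); ++d) {
      cudaSetDevice(d);
      for(const auto& ptrSize: buffers) {
        cudaMemPrefetchAsync(ptrSize.first, ptrSize.second, d, streams_[d]);
      }
      cudaEventRecord(events_[d], streams_[d]);
    }
    cudaSetDevice(current);
  }

  // make the caller's stream wait until the prefetch on its device has completed;
  // cudaEventQuery() plays the role of the atomic<bool> fast path
  void synchronizeWith(cudaStream_t stream, int device) const {
    if(cudaEventQuery(events_[device]) != cudaSuccess) {
      cudaStreamWaitEvent(stream, events_[device], 0);
    }
  }

private:
  std::vector<cudaStream_t> streams_;
  std::vector<cudaEvent_t> events_;
};

Whether this is worth the extra streams and events depends on whether cudaEventQuery() is measurably slower than the atomic<bool> check, which is exactly the open question raised in the comment.
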