cms-patatrack · ericcano · Oct 12, 2021 · Oct 12, 2021 · Oct 12, 2021 · Oct 12, 2021
diff --git a/src/cudadev/CUDACore/device_unique_ptr.h b/src/cudadev/CUDACore/device_unique_ptr.h
@@ -49,8 +49,8 @@ namespace cms {
 
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::non_array make_device_unique(cudaStream_t stream) {
-      static_assert(std::is_trivially_constructible<T>::value,
-                    "Allocating with non-trivial constructor on the device memory is not supported");
+      static_assert(std::is_trivially_copyable<T>::value,
+                    "Allocating with non-trivial copy on the device memory is not supported");
       void *mem = allocate_device(sizeof(T), stream);
       return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
                                                                               device::impl::DeviceDeleter{stream}};
@@ -60,37 +60,15 @@ namespace cms {
     typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n,
                                                                                               cudaStream_t stream) {
       using element_type = typename std::remove_extent<T>::type;
-      static_assert(std::is_trivially_constructible<element_type>::value,
-                    "Allocating with non-trivial constructor on the device memory is not supported");
+      static_assert(std::is_trivially_copyable<element_type>::value,
+                    "Allocating with non-trivial copy on the device memory is not supported");
       void *mem = allocate_device(n * sizeof(element_type), stream);
       return typename device::impl::make_device_unique_selector<T>::unbounded_array{
           reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{stream}};
     }
 
     template <typename T, typename... Args>
     typename device::impl::make_device_unique_selector<T>::bounded_array make_device_unique(Args &&...) = delete;
-
-    // No check for the trivial constructor, make it clear in the interface
-    template <typename T>
-    typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized(
-        cudaStream_t stream) {
-      void *mem = allocate_device(sizeof(T), stream);
-      return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
-                                                                              device::impl::DeviceDeleter{stream}};
-    }
-
-    template <typename T>
-    typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(
-        size_t n, cudaStream_t stream) {
-      using element_type = typename std::remove_extent<T>::type;
-      void *mem = allocate_device(n * sizeof(element_type), stream);
-      return typename device::impl::make_device_unique_selector<T>::unbounded_array{
-          reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{stream}};
-    }
-
-    template <typename T, typename... Args>
-    typename device::impl::make_device_unique_selector<T>::bounded_array make_device_unique_uninitialized(Args &&...) =
-        delete;
   }  // namespace cuda
 }  // namespace cms
 

diff --git a/src/cudadev/CUDACore/host_unique_ptr.h b/src/cudadev/CUDACore/host_unique_ptr.h
@@ -39,17 +39,17 @@ namespace cms {
     // Allocate pinned host memory
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::non_array make_host_unique(cudaStream_t stream) {
-      static_assert(std::is_trivially_constructible<T>::value,
-                    "Allocating with non-trivial constructor on the pinned host memory is not supported");
+      static_assert(std::is_trivially_copyable<T>::value,  // no constructor runs below: the raw allocation is only cast to T*
+                    "Allocating with non-trivial copy on the pinned host memory is not supported");
       void *mem = allocate_host(sizeof(T), stream);
       return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
     }
 
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n, cudaStream_t stream) {
       using element_type = typename std::remove_extent<T>::type;
-      static_assert(std::is_trivially_constructible<element_type>::value,
-                    "Allocating with non-trivial constructor on the pinned host memory is not supported");
+      static_assert(std::is_trivially_copyable<element_type>::value,  // elements are never constructed, only cast from raw memory
+                    "Allocating with non-trivial copy on the pinned host memory is not supported");
       void *mem = allocate_host(n * sizeof(element_type), stream);
       return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
     }

diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc
@@ -3,17 +3,9 @@
 #include "CUDACore/host_unique_ptr.h"
 #include "CUDADataFormats/SiPixelClustersCUDA.h"
 
-SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream)
-    : moduleStart_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules + 1, stream)),
-      clusInModule_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules, stream)),
-      moduleId_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules, stream)),
-      clusModuleStart_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules + 1, stream)) {
-  auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
-  view->moduleStart_ = moduleStart_d.get();
-  view->clusInModule_ = clusInModule_d.get();
-  view->moduleId_ = moduleId_d.get();
-  view->clusModuleStart_ = clusModuleStart_d.get();
+SiPixelClustersCUDA::SiPixelClustersCUDA() : data_d(), deviceLayout_(data_d.get(), 0), deviceView_(deviceLayout_) {}  // empty object: nothing allocated, zero-element layout
 
-  view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
-  cms::cuda::copyAsync(view_d, view, stream);
-}
+SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream)  // single device allocation for all columns
+    : data_d(cms::cuda::make_device_unique<std::byte[]>(DeviceLayout::computeDataSize(maxModules), stream)),
+      deviceLayout_(data_d.get(), maxModules),  // data_d is declared before deviceLayout_, so it is already initialized here
+      deviceView_(deviceLayout_) {}             // view built from the layout; it backs the moduleStart()/... accessors
diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h
@@ -4,12 +4,49 @@
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/host_unique_ptr.h"
 #include "CUDACore/cudaCompat.h"
+#include "DataFormats/SoALayout.h"
+#include "DataFormats/SoAView.h"
 
 #include <cuda_runtime.h>
 
 class SiPixelClustersCUDA {
 public:
-  SiPixelClustersCUDA() = default;
+  GENERATE_SOA_LAYOUT(DeviceLayoutTemplate,
+                      SOA_COLUMN(uint32_t, moduleStart),   // index of the first pixel of each module
+                      SOA_COLUMN(uint32_t, clusInModule),  // number of clusters found in each module
+                      SOA_COLUMN(uint32_t, moduleId),      // module id of each module
+
+                      // originally from rechits
+                      SOA_COLUMN(uint32_t, clusModuleStart))  // index of the first cluster of each module
+
+  // We use all defaults for the template parameters.
+  using DeviceLayout = DeviceLayoutTemplate<>;
+
+  GENERATE_SOA_VIEW(
+      DeviceViewTemplate,
+      SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(DeviceLayout, deviceLayout)),
+      SOA_VIEW_VALUE_LIST(SOA_VIEW_VALUE(deviceLayout, moduleStart),   // index of the first pixel of each module
+                          SOA_VIEW_VALUE(deviceLayout, clusInModule),  // number of clusters found in each module
+                          SOA_VIEW_VALUE(deviceLayout, moduleId),      // module id of each module
+
+                          // originally from rechits
+                          SOA_VIEW_VALUE(deviceLayout, clusModuleStart)))  // index of the first cluster of each module
+
+  using DeviceView = DeviceViewTemplate<>;
+
+  GENERATE_SOA_CONST_VIEW(
+      DeviceConstViewTemplate,
+      SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(DeviceView, deviceView)),
+      SOA_VIEW_VALUE_LIST(SOA_VIEW_VALUE(deviceView, moduleStart),   // index of the first pixel of each module
+                          SOA_VIEW_VALUE(deviceView, clusInModule),  // number of clusters found in each module
+                          SOA_VIEW_VALUE(deviceView, moduleId),      // module id of each module
+
+                          // originally from rechits
+                          SOA_VIEW_VALUE(deviceView, clusModuleStart)))  // index of the first cluster of each module
+
+  using DeviceConstView = DeviceConstViewTemplate<>;
+
+  explicit SiPixelClustersCUDA();
   explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream);
   ~SiPixelClustersCUDA() = default;
 
@@ -22,40 +59,22 @@ class SiPixelClustersCUDA {
 
   uint32_t nClusters() const { return nClusters_h; }
 
-  uint32_t *moduleStart() { return moduleStart_d.get(); }
-  uint32_t *clusInModule() { return clusInModule_d.get(); }
-  uint32_t *moduleId() { return moduleId_d.get(); }
-  uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }
-
-  uint32_t const *moduleStart() const { return moduleStart_d.get(); }
-  uint32_t const *clusInModule() const { return clusInModule_d.get(); }
-  uint32_t const *moduleId() const { return moduleId_d.get(); }
-  uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }
+  uint32_t *moduleStart() { return deviceView_.moduleStart(); }
+  uint32_t *clusInModule() { return deviceView_.clusInModule(); }
+  uint32_t *moduleId() { return deviceView_.moduleId(); }
+  uint32_t *clusModuleStart() { return deviceView_.clusModuleStart(); }
 
-  class DeviceConstView {
-  public:
-    __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_ + i); }
-    __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_ + i); }
-    __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_ + i); }
-    __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_ + i); }
+  uint32_t const *moduleStart() const { return deviceView_.moduleStart(); }
+  uint32_t const *clusInModule() const { return deviceView_.clusInModule(); }
+  uint32_t const *moduleId() const { return deviceView_.moduleId(); }
+  uint32_t const *clusModuleStart() const { return deviceView_.clusModuleStart(); }
 
-    uint32_t const *moduleStart_;
-    uint32_t const *clusInModule_;
-    uint32_t const *moduleId_;
-    uint32_t const *clusModuleStart_;
-  };
-
-  DeviceConstView *view() const { return view_d.get(); }
+  DeviceConstView view() const { return DeviceConstView(deviceView_); }
 
 private:
-  cms::cuda::device::unique_ptr<uint32_t[]> moduleStart_d;   // index of the first pixel of each module
-  cms::cuda::device::unique_ptr<uint32_t[]> clusInModule_d;  // number of clusters found in each module
-  cms::cuda::device::unique_ptr<uint32_t[]> moduleId_d;      // module id of each module
-
-  // originally from rechits
-  cms::cuda::device::unique_ptr<uint32_t[]> clusModuleStart_d;  // index of the first cluster of each module
-
-  cms::cuda::device::unique_ptr<DeviceConstView> view_d;  // "me" pointer
+  cms::cuda::device::unique_ptr<std::byte[]> data_d;  // Single SoA storage
+  DeviceLayout deviceLayout_;
+  DeviceView deviceView_;
 
   uint32_t nClusters_h = 0;
 };

diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
@@ -5,44 +5,59 @@
 #include "CUDACore/host_unique_ptr.h"
 
 SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream)
-    : xx_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
-      yy_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
-      adc_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
-      moduleInd_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
-      clus_d(cms::cuda::make_device_unique<int32_t[]>(maxFedWords, stream)),
-      view_d(cms::cuda::make_device_unique<DeviceConstView>(stream)),
-      pdigi_d(cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream)),
-      rawIdArr_d(cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream)) {
-  auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
-  view->xx_ = xx_d.get();
-  view->yy_ = yy_d.get();
-  view->adc_ = adc_d.get();
-  view->moduleInd_ = moduleInd_d.get();
-  view->clus_ = clus_d.get();
-
-  cms::cuda::copyAsync(view_d, view, stream);
-}
+    : data_d(cms::cuda::make_device_unique<std::byte[]>(  // one device buffer sized for both layouts
+          DeviceOnlyLayout::computeDataSize(maxFedWords) + HostDeviceLayout::computeDataSize(maxFedWords), stream)),
+      deviceOnlyLayout_d(data_d.get(), maxFedWords),  // placed at the start of the buffer
+      hostDeviceLayout_d(deviceOnlyLayout_d.soaMetadata().nextByte(), maxFedWords),  // presumably the first byte past deviceOnlyLayout_d - confirm in SoALayout.h
+      deviceFullView_(deviceOnlyLayout_d, hostDeviceLayout_d),  // view built over both layouts
+      devicePixelConstView_(deviceFullView_) {}  // NOTE(review): each initializer uses the previous member; relies on this declaration order in the header (not visible here)
 
-cms::cuda::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint16_t[]>(nDigis(), stream);
-  cms::cuda::copyAsync(ret, adc_d, nDigis(), stream);
-  return ret;
-}
+SiPixelDigisCUDA::SiPixelDigisCUDA()  // empty object: nothing is allocated, every member is value-initialized
+    : data_d(), deviceOnlyLayout_d(), hostDeviceLayout_d(), deviceFullView_(), devicePixelConstView_() {}
 
-cms::cuda::host::unique_ptr<int32_t[]> SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<int32_t[]>(nDigis(), stream);
-  cms::cuda::copyAsync(ret, clus_d, nDigis(), stream);
-  return ret;
+SiPixelDigisCUDA::HostStore::HostStore() : data_h(), hostLayout_(nullptr, 0), hostView_(hostLayout_) {}  // empty store: no buffer, zero-element layout
+
+SiPixelDigisCUDA::HostStore::HostStore(size_t maxFedWords, cudaStream_t stream)
+    : data_h(cms::cuda::make_host_unique<std::byte[]>(SiPixelDigisCUDA::HostDeviceLayout::computeDataSize(maxFedWords),
+                                                      stream)),  // host buffer sized for exactly one HostDeviceLayout
+      hostLayout_(data_h.get(), maxFedWords),  // layout placed over the buffer just allocated
+      hostView_(hostLayout_) {}                // view built from that layout
+
+void SiPixelDigisCUDA::HostStore::reset() {  // return the store to an empty state and free its buffer
+  hostLayout_ = HostDeviceLayout();         // replace the layout with a default-constructed one
+  hostView_ = HostDeviceView(hostLayout_);  // rebuild the view from the new layout so it no longer refers to data_h
+  data_h.reset();                           // release the host buffer last, once layout and view are detached from it
 }
 
-cms::cuda::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint32_t[]>(nDigis(), stream);
-  cms::cuda::copyAsync(ret, pdigi_d, nDigis(), stream);
+cms::cuda::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const {
+  auto ret = cms::cuda::make_host_unique<uint16_t[]>(nDigis(), stream);  // host buffer for nDigis() adc values
+  // TODO: downgraded from cms::cuda::copyAsync to a raw cudaMemcpyAsync, as the source is a sub-range of a larger device block, not a full block.
+  cudaCheck(cudaMemcpyAsync(
+      ret.get(), deviceFullView_.adc(), nDigis() * sizeof(decltype(ret[0])), cudaMemcpyDeviceToHost, stream));  // asynchronous on stream: ret is not filled until the copy completes
   return ret;
 }
 
-cms::cuda::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint32_t[]>(nDigis(), stream);
-  cms::cuda::copyAsync(ret, rawIdArr_d, nDigis(), stream);
+SiPixelDigisCUDA::HostStore SiPixelDigisCUDA::dataToHostAsync(cudaStream_t stream) const {
+  // Returns a host store sized for nDigis() only, filled by compacting the larger device columns (copies are asynchronous on stream).
+  // Due to the compaction with the 2D copy, we need to know the precise geometry, and hence operate on the store (as opposed
+  // to the view, which is unaware of the column pitches).
+  HostStore ret(nDigis(), stream);
+  auto rhlsm = ret.hostLayout_.soaMetadata();
+  auto hdlsm_d = hostDeviceLayout_d.soaMetadata();
+  cudaCheck(cudaMemcpyAsync(rhlsm.addressOf_adc(),
+                            hdlsm_d.addressOf_adc(),
+                            nDigis_h * sizeof(*rhlsm.addressOf_adc()),
+                            cudaMemcpyDeviceToHost,
+                            stream));
+  // Copy the other 3 columns (clus, pdigi, rawIdArr) in one 2D copy, one row per column: they share the same geometry, clus first.
+  // cudaMemcpy2DAsync takes the row width in bytes BEFORE the number of rows; the two pitches realign the data in shorter arrays.
+  cudaCheck(cudaMemcpy2DAsync(rhlsm.addressOf_clus(),
+                              rhlsm.clusPitch(),
+                              hdlsm_d.addressOf_clus(),
+                              hdlsm_d.clusPitch(),
+                              nDigis() * sizeof(decltype(*ret.hostView_.clus())) /* width: bytes per row */,
+                              3 /* height: rows */,
+                              cudaMemcpyDeviceToHost,
+                              stream));
   return ret;
-}
+}