From ab8d2c6d71d3f021b6cfd5904ab2d9679f8aea11 Mon Sep 17 00:00:00 2001
From: Eric Cano <eric.cano@cern.ch>
Date: Mon, 6 Sep 2021 17:13:15 +0200
Subject: [PATCH] Reduced size of host side structure for
 SiPixelDigisCUDA::HostData

The amount of data transferred is now just the right size, but the copy takes 2 transfers instead of 1.
---
 src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
index b0d138914..b507dd7a3 100644
--- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
+++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
@@ -36,10 +36,15 @@ cms::cuda::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cudaStr
 }
 
 SiPixelDigisCUDA::HostData SiPixelDigisCUDA::dataToHostAsync(cudaStream_t stream) const {
-  // TODO: we copy here on a bigger memory chunk than necessary. We could optimize to allocating a HostData of nDigis(),
-  // at the cost of 2 copies (one cudaMemcpy(2D) for the uint16_t adc column, plus one cudaMemcpy2D for the uint32_t columns
-  // clus, pdigis and rawIdArr. Less memory allocated and transferred, but one more memory transfer (and cuda call).
-  HostData ret(deviceFullView_.soaMetadata().size(), stream);
-  cudaCheck(cudaMemcpyAsync(ret.data_h.get(), deviceFullView_.adc(), ret.hostView_.soaMetadata().byteSize(), cudaMemcpyDeviceToHost, stream));
+  // Allocate host storage for nDigis() elements per column only; the device SoA is larger, so the copies below compact it.
+  HostData ret(nDigis(), stream);
+  cudaCheck(cudaMemcpyAsync(ret.hostView_.adc(), deviceFullView_.adc(), nDigis_h * sizeof(decltype(*deviceFullView_.adc())),
+          cudaMemcpyDeviceToHost, stream));
+  // Copy clus, pdigis and rawIdArr (same geometry) as one 3-row 2D copy starting at clus; the pitches realign the rows into
+  // the shorter host arrays. Note that cudaMemcpy2DAsync takes the width (bytes per row) before the height (number of rows).
+  cudaCheck(cudaMemcpy2DAsync(ret.hostView_.clus(), ret.hostView_.soaMetadata().clusPitch(),
+          deviceFullView_.clus(), deviceFullView_.soaMetadata().clusPitch(),
+          nDigis() * sizeof(decltype (*ret.hostView_.clus())) /* width: bytes per row */,
+          3 /* height: rows */, cudaMemcpyDeviceToHost, stream));
   return ret;
 }