From ab8d2c6d71d3f021b6cfd5904ab2d9679f8aea11 Mon Sep 17 00:00:00 2001
From: Eric Cano
Date: Mon, 6 Sep 2021 17:13:15 +0200
Subject: [PATCH] Reduced size of host side structure for SiPixelDigisCUDA::HostData

The transfer is now just the right size but takes 2 transfers instead of 1.
---
 src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
index b0d138914..b507dd7a3 100644
--- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
+++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
@@ -36,10 +36,15 @@ cms::cuda::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cudaStr
 }
 
 SiPixelDigisCUDA::HostData SiPixelDigisCUDA::dataToHostAsync(cudaStream_t stream) const {
-  // TODO: we copy here on a bigger memory chunk than necessary. We could optimize to allocating a HostData of nDigis(),
-  // at the cost of 2 copies (one cudaMemcpy(2D) for the uint16_t adc column, plus one cudaMemcpy2D for the uint32_t columns
-  // clus, pdigis and rawIdArr. Less memory allocated and transferred, but one more memory transfer (and cuda call).
-  HostData ret(deviceFullView_.soaMetadata().size(), stream);
-  cudaCheck(cudaMemcpyAsync(ret.data_h.get(), deviceFullView_.adc(), ret.hostView_.soaMetadata().byteSize(), cudaMemcpyDeviceToHost, stream));
+  // Allocate the needed space only (nDigis() elements) and build the compact data in place in host memory (from the larger device memory).
+  HostData ret(nDigis(), stream);
+  cudaCheck(cudaMemcpyAsync(ret.hostView_.adc(), deviceFullView_.adc(), nDigis() * sizeof(decltype(*deviceFullView_.adc())),
+                            cudaMemcpyDeviceToHost, stream));
+  // Copy the other columns, realigning the data in shorter arrays. clus is the first but all 3 columns (clus, pdigis, rawIdArr) have
+  // the same geometry, so they go as 3 rows of one 2D copy. cudaMemcpy2DAsync expects the row width in bytes first, then the row count.
+  cudaCheck(cudaMemcpy2DAsync(ret.hostView_.clus(), ret.hostView_.soaMetadata().clusPitch(),
+                              deviceFullView_.clus(), deviceFullView_.soaMetadata().clusPitch(),
+                              nDigis() * sizeof(decltype(*ret.hostView_.clus())) /* width: bytes per row */,
+                              3 /* height: rows */, cudaMemcpyDeviceToHost, stream));
   return ret;
 }