Skip to content

Commit

Permalink
Replace use of API wrapper stream and event with plain CUDA, part 1 (#…
Browse files Browse the repository at this point in the history
…389)

Replace cuda::stream_t<> with cudaStream_t in client code
Replace cuda::event_t with cudaEvent_t in the client code
Clean up BuildFiles
  • Loading branch information
cmsbuild authored and fwyzard committed Dec 25, 2020
1 parent 55a3fce commit c42c9f7
Show file tree
Hide file tree
Showing 23 changed files with 94 additions and 92 deletions.
2 changes: 1 addition & 1 deletion CUDADataFormats/SiPixelCluster/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<use name="DataFormats/Common"/>
<use name="CUDADataFormats/Common"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="cuda-api-wrappers"/>
<use name="cuda"/>
<use name="rootcore"/>

<export>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

#include <cuda/api_wrappers.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

#include <cuda_runtime.h>

class SiPixelClustersCUDA {
public:
SiPixelClustersCUDA() = default;
explicit SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<> &stream);
explicit SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream);
~SiPixelClustersCUDA() = default;

SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete;
Expand Down
2 changes: 1 addition & 1 deletion CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream) {
SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream) {
moduleStart_d = cudautils::make_device_unique<uint32_t[]>(maxClusters + 1, stream);
clusInModule_d = cudautils::make_device_unique<uint32_t[]>(maxClusters, stream);
moduleId_d = cudautils::make_device_unique<uint32_t[]>(maxClusters, stream);
Expand Down
2 changes: 1 addition & 1 deletion CUDADataFormats/SiPixelDigi/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<use name="DataFormats/Common"/>
<use name="DataFormats/SiPixelRawData"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="cuda-api-wrappers"/>
<use name="cuda"/>
<use name="rootcore"/>

<export>
Expand Down
8 changes: 4 additions & 4 deletions CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"

#include <cuda/api_wrappers.h>
#include <cuda_runtime.h>

class SiPixelDigiErrorsCUDA {
public:
SiPixelDigiErrorsCUDA() = default;
explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream);
explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream);
~SiPixelDigiErrorsCUDA() = default;

SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete;
Expand All @@ -27,9 +27,9 @@ class SiPixelDigiErrorsCUDA {

using HostDataError =
std::pair<GPU::SimpleVector<PixelErrorCompact>, cudautils::host::unique_ptr<PixelErrorCompact[]>>;
HostDataError dataErrorToHostAsync(cuda::stream_t<>& stream) const;
HostDataError dataErrorToHostAsync(cudaStream_t stream) const;

void copyErrorToHostAsync(cuda::stream_t<>& stream);
void copyErrorToHostAsync(cudaStream_t stream);

private:
cudautils::device::unique_ptr<PixelErrorCompact[]> data_d;
Expand Down
14 changes: 7 additions & 7 deletions CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

#include <cuda/api_wrappers.h>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

#include <cuda_runtime.h>

class SiPixelDigisCUDA {
public:
SiPixelDigisCUDA() = default;
explicit SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<> &stream);
explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream);
~SiPixelDigisCUDA() = default;

SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete;
Expand Down Expand Up @@ -50,10 +50,10 @@ class SiPixelDigisCUDA {
uint32_t const *c_pdigi() const { return pdigi_d.get(); }
uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); }

cudautils::host::unique_ptr<uint16_t[]> adcToHostAsync(cuda::stream_t<> &stream) const;
cudautils::host::unique_ptr<int32_t[]> clusToHostAsync(cuda::stream_t<> &stream) const;
cudautils::host::unique_ptr<uint32_t[]> pdigiToHostAsync(cuda::stream_t<> &stream) const;
cudautils::host::unique_ptr<uint32_t[]> rawIdArrToHostAsync(cuda::stream_t<> &stream) const;
cudautils::host::unique_ptr<uint16_t[]> adcToHostAsync(cudaStream_t stream) const;
cudautils::host::unique_ptr<int32_t[]> clusToHostAsync(cudaStream_t stream) const;
cudautils::host::unique_ptr<uint32_t[]> pdigiToHostAsync(cudaStream_t stream) const;
cudautils::host::unique_ptr<uint32_t[]> rawIdArrToHostAsync(cudaStream_t stream) const;

class DeviceConstView {
public:
Expand Down
6 changes: 3 additions & 3 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

#include <cassert>

SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream)
SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream)
: formatterErrors_h(std::move(errors)) {
error_d = cudautils::make_device_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
data_d = cudautils::make_device_unique<PixelErrorCompact[]>(maxFedWords, stream);
Expand All @@ -22,11 +22,11 @@ SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterE
cudautils::copyAsync(error_d, error_h, stream);
}

void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cuda::stream_t<>& stream) {
void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) {
cudautils::copyAsync(error_h, error_d, stream);
}

SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cuda::stream_t<>& stream) const {
SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const {
// On one hand size() could be sufficient. On the other hand, if
// someone copies the SimpleVector<>, (s)he might expect the data
// buffer to actually have space for capacity() elements.
Expand Down
10 changes: 5 additions & 5 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream) {
SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) {
xx_d = cudautils::make_device_unique<uint16_t[]>(maxFedWords, stream);
yy_d = cudautils::make_device_unique<uint16_t[]>(maxFedWords, stream);
adc_d = cudautils::make_device_unique<uint16_t[]>(maxFedWords, stream);
Expand All @@ -25,25 +25,25 @@ SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream)
cudautils::copyAsync(view_d, view, stream);
}

cudautils::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cuda::stream_t<>& stream) const {
cudautils::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const {
auto ret = cudautils::make_host_unique<uint16_t[]>(nDigis(), stream);
cudautils::copyAsync(ret, adc_d, nDigis(), stream);
return ret;
}

cudautils::host::unique_ptr<int32_t[]> SiPixelDigisCUDA::clusToHostAsync(cuda::stream_t<>& stream) const {
cudautils::host::unique_ptr<int32_t[]> SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const {
auto ret = cudautils::make_host_unique<int32_t[]>(nDigis(), stream);
cudautils::copyAsync(ret, clus_d, nDigis(), stream);
return ret;
}

cudautils::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::pdigiToHostAsync(cuda::stream_t<>& stream) const {
cudautils::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const {
auto ret = cudautils::make_host_unique<uint32_t[]>(nDigis(), stream);
cudautils::copyAsync(ret, pdigi_d, nDigis(), stream);
return ret;
}

cudautils::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::rawIdArrToHostAsync(cuda::stream_t<>& stream) const {
cudautils::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const {
auto ret = cudautils::make_host_unique<uint32_t[]>(nDigis(), stream);
cudautils::copyAsync(ret, rawIdArr_d, nDigis(), stream);
return ret;
Expand Down
2 changes: 1 addition & 1 deletion CUDADataFormats/TrackingRecHit/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<use name="cuda-api-wrappers"/>
<use name="cuda"/>
<use name="rootcore"/>
<use name="CUDADataFormats/Common"/>
<use name="DataFormats/Common"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class TrackingRecHit2DHeterogeneous {
explicit TrackingRecHit2DHeterogeneous(uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cuda::stream_t<>& stream);
cudaStream_t stream);

~TrackingRecHit2DHeterogeneous() = default;

Expand All @@ -37,9 +37,9 @@ class TrackingRecHit2DHeterogeneous {
auto iphi() { return m_iphi; }

// only the local coord and detector index
cudautils::host::unique_ptr<float[]> localCoordToHostAsync(cuda::stream_t<>& stream) const;
cudautils::host::unique_ptr<uint16_t[]> detIndexToHostAsync(cuda::stream_t<>& stream) const;
cudautils::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cuda::stream_t<>& stream) const;
cudautils::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
cudautils::host::unique_ptr<uint16_t[]> detIndexToHostAsync(cudaStream_t stream) const;
cudautils::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;

private:
static constexpr uint32_t n16 = 4;
Expand Down Expand Up @@ -71,7 +71,7 @@ template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cuda::stream_t<>& stream)
cudaStream_t stream)
: m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);

Expand Down
16 changes: 11 additions & 5 deletions CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

namespace testTrackingRecHit2D {

Expand All @@ -11,13 +12,18 @@ namespace testTrackingRecHit2D {
int main() {
exitSansCUDADevices();

auto current_device = cuda::device::current::get();
auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);
cudaStream_t stream;
cudaCheck(cudaStreamCreate(&stream));

auto nHits = 200;
TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream);
// inner scope to deallocate memory before destroying the stream
{
auto nHits = 200;
TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream);

testTrackingRecHit2D::runKernels(tkhit.view());
testTrackingRecHit2D::runKernels(tkhit.view());
}

cudaCheck(cudaStreamDestroy(stream));

return 0;
}
2 changes: 1 addition & 1 deletion CalibTracker/SiPixelESProducers/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
<use name="MagneticField/VolumeBasedEngine"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="boost"/>
<use name="cuda-api-wrappers"/>
<use name="cuda"/>
<export>
<lib name="1"/>
</export>
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class SiPixelGainCalibrationForHLTGPU {
explicit SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT &gains, const TrackerGeometry &geom);
~SiPixelGainCalibrationForHLTGPU();

const SiPixelGainForHLTonGPU *getGPUProductAsync(cuda::stream_t<> &cudaStream) const;
const SiPixelGainForHLTonGPU *getGPUProductAsync(cudaStream_t cudaStream) const;
const SiPixelGainForHLTonGPU *getCPUProduct() const { return gainForHLTonHost_; }
const SiPixelGainCalibrationForHLT *getOriginalProduct() { return gains_; }

Expand Down
2 changes: 1 addition & 1 deletion CalibTracker/SiPixelESProducers/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<use name="Geometry/TrackerGeometryBuilder"/>
<use name="CalibTracker/SiPixelESProducers"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="cuda-api-wrappers"/>
<use name="cuda"/>
<library file="*.cc" name="CalibTrackerSiPixelESProducersPlugins">
<flags EDM_PLUGIN="1"/>
</library>
Original file line number Diff line number Diff line change
Expand Up @@ -84,23 +84,23 @@ SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() {
cudaCheck(cudaFree(gainDataOnGPU));
}

const SiPixelGainForHLTonGPU* SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cuda::stream_t<>& cudaStream) const {
const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cuda::stream_t<>& stream) {
const SiPixelGainForHLTonGPU* SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cudaStream_t cudaStream) const {
const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) {
cudaCheck(cudaMalloc((void**)&data.gainForHLTonGPU, sizeof(SiPixelGainForHLTonGPU)));
cudaCheck(
cudaMalloc((void**)&data.gainDataOnGPU,
this->gains_->data().size())); // TODO: this could be changed to cuda::memory::device::unique_ptr<>
// gains.data().data() is used also for non-GPU code, we cannot allocate it on aligned and write-combined memory
cudaCheck(cudaMemcpyAsync(
data.gainDataOnGPU, this->gains_->data().data(), this->gains_->data().size(), cudaMemcpyDefault, stream.id()));
data.gainDataOnGPU, this->gains_->data().data(), this->gains_->data().size(), cudaMemcpyDefault, stream));

cudaCheck(cudaMemcpyAsync(
data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream.id()));
data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream));
cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals),
&(data.gainDataOnGPU),
sizeof(SiPixelGainForHLTonGPU_DecodingStructure*),
cudaMemcpyDefault,
stream.id()));
stream));
});
return data.gainForHLTonGPU;
}
2 changes: 1 addition & 1 deletion EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<use name="CUDADataFormats/SiPixelDigi"/>
<use name="EventFilter/SiPixelRawToDigi"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="cuda-api-wrappers"/>
<use name="cuda"/>
<library file="*.cc" name="EventFilterSiPixelRawToDigiPlugins">
<flags EDM_PLUGIN="1"/>
</library>
Loading

0 comments on commit c42c9f7

Please sign in to comment.