Skip to content

Commit

Permalink
Implement changes from the CUDA framework review (#429)
Browse files Browse the repository at this point in the history
Rename the cudautils namespace to cms::cuda or cms::cudatest, and drop the CUDA prefix from the symbols defined there.

Always record and query the CUDA event, to minimize need for error checking in CUDAScopedContextProduce destructor.

Add comments to highlight the pieces in CachingDeviceAllocator that have been changed wrt. cub.

Various other updates and clean up:
  - enable CUDA for compute capability 3.5.
  - clean up CUDAService, CUDA tests and plugins.
  - add CUDA existence protections to BuildFiles.
  - mark thread-safe static variables with CMS_THREAD_SAFE.
  • Loading branch information
makortel authored and fwyzard committed Jan 15, 2021
1 parent 57c85b4 commit 6dfc6f9
Show file tree
Hide file tree
Showing 13 changed files with 47 additions and 46 deletions.
6 changes: 3 additions & 3 deletions CUDADataFormats/Track/src/classes.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef CUDADataFormats__src_classes_h
#define CUDADataFormats__src_classes_h
#ifndef CUDADataFormats_Track_src_classes_h
#define CUDADataFormats_Track__src_classes_h

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/Common/interface/HostProduct.h"
#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
#include "CUDADataFormats/Common/interface/ArrayShadow.h"
Expand Down
4 changes: 2 additions & 2 deletions CUDADataFormats/Track/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<lcgdict>
<class name="CUDAProduct<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
<class name="edm::Wrapper<CUDAProduct<HeterogeneousSoA<pixelTrack::TrackSoA>>>" persistent="false"/>
<class name="cms::cuda::Product<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
<class name="edm::Wrapper<cms::cuda::Product<HeterogeneousSoA<pixelTrack::TrackSoA>>>" persistent="false"/>
<class name="HeterogeneousSoA<pixelTrack::TrackSoA>" persistent="false"/>
<class name="edm::Wrapper<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
<class name="ArrayShadow<std::array<unsigned int,2001>>" persistent="false"/>
Expand Down
4 changes: 2 additions & 2 deletions CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,13 @@ __global__ void testTSSoA(TS* pts, int n) {
}

#ifdef __CUDACC__
#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#endif

int main() {
#ifdef __CUDACC__
requireCUDADevices();
cms::cudatest::requireDevices();
#endif

TS ts;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include <cuda_runtime.h>

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/Common/interface/HostProduct.h"
#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
#include "DataFormats/Common/interface/Handle.h"
Expand All @@ -15,7 +15,7 @@
#include "FWCore/PluginManager/interface/ModuleDef.h"
#include "FWCore/Utilities/interface/EDGetToken.h"
#include "FWCore/Utilities/interface/InputTag.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"

class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
public:
Expand All @@ -30,14 +30,14 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork>
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;

edm::EDGetTokenT<CUDAProduct<PixelTrackHeterogeneous>> tokenCUDA_;
edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenCUDA_;
edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;

cudautils::host::unique_ptr<pixelTrack::TrackSoA> m_soa;
cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> m_soa;
};

PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig)
: tokenCUDA_(consumes<CUDAProduct<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
: tokenCUDA_(consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
tokenSOA_(produces<PixelTrackHeterogeneous>()) {}

void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
Expand All @@ -50,8 +50,8 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des
void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
edm::EventSetup const& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
CUDAProduct<PixelTrackHeterogeneous> const& inputDataWrapped = iEvent.get(tokenCUDA_);
CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
cms::cuda::Product<PixelTrackHeterogeneous> const& inputDataWrapped = iEvent.get(tokenCUDA_);
cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
auto const& inputData = ctx.get(inputDataWrapped);

m_soa = inputData.toHostAsync(ctx.stream());
Expand Down
4 changes: 2 additions & 2 deletions RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <Eigen/Eigenvalues>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"

#ifdef USE_BL
#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h"
Expand Down Expand Up @@ -329,7 +329,7 @@ void testFit() {
}

int main(int argc, char* argv[]) {
requireCUDADevices();
cms::cudatest::requireDevices();

testFit<4>();
testFit<3>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <Eigen/Eigenvalues>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#include "test_common.h"

using namespace Eigen;
Expand Down Expand Up @@ -215,7 +215,7 @@ void testEigenvalues() {
}

int main(int argc, char *argv[]) {
requireCUDADevices();
cms::cudatest::requireDevices();

testEigenvalues();
testInverse3x3();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;

// Fit internals
auto hitsGPU_ = cudautils::make_device_unique<double[]>(
auto hitsGPU_ = cms::cuda::make_device_unique<double[]>(
maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream);
auto hits_geGPU_ = cudautils::make_device_unique<float[]>(
auto hits_geGPU_ = cms::cuda::make_device_unique<float[]>(
maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream);
auto fast_fit_resultsGPU_ = cudautils::make_device_unique<double[]>(
auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique<double[]>(
maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream);

for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
Expand Down
17 changes: 9 additions & 8 deletions RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include <cuda_runtime.h>

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/Common/interface/Product.h"
#include "DataFormats/Common/interface/Handle.h"
#include "FWCore/Framework/interface/ESHandle.h"
#include "FWCore/Framework/interface/Event.h"
Expand All @@ -15,7 +15,7 @@
#include "FWCore/PluginManager/interface/ModuleDef.h"
#include "FWCore/Utilities/interface/EDGetToken.h"
#include "FWCore/Utilities/interface/RunningAverage.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"

#include "CAHitNtupletGeneratorOnGPU.h"
Expand All @@ -34,8 +34,8 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> {

bool m_OnGPU;

edm::EDGetTokenT<CUDAProduct<TrackingRecHit2DGPU>> tokenHitGPU_;
edm::EDPutTokenT<CUDAProduct<PixelTrackHeterogeneous>> tokenTrackGPU_;
edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHitGPU_;
edm::EDPutTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenTrackGPU_;
edm::EDGetTokenT<TrackingRecHit2DCPU> tokenHitCPU_;
edm::EDPutTokenT<PixelTrackHeterogeneous> tokenTrackCPU_;

Expand All @@ -45,8 +45,9 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> {
CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig)
: m_OnGPU(iConfig.getParameter<bool>("onGPU")), gpuAlgo_(iConfig, consumesCollector()) {
if (m_OnGPU) {
tokenHitGPU_ = consumes<CUDAProduct<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
tokenTrackGPU_ = produces<CUDAProduct<PixelTrackHeterogeneous>>();
tokenHitGPU_ =
consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
tokenTrackGPU_ = produces<cms::cuda::Product<PixelTrackHeterogeneous>>();
} else {
tokenHitCPU_ = consumes<TrackingRecHit2DCPU>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"));
tokenTrackCPU_ = produces<PixelTrackHeterogeneous>();
Expand All @@ -68,10 +69,10 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const
auto bf = 1. / PixelRecoUtilities::fieldInInvGev(es);

if (m_OnGPU) {
edm::Handle<CUDAProduct<TrackingRecHit2DCUDA>> hHits;
edm::Handle<cms::cuda::Product<TrackingRecHit2DCUDA>> hHits;
iEvent.getByToken(tokenHitGPU_, hHits);

CUDAScopedContextProduce ctx{*hHits};
cms::cuda::ScopedContextProduce ctx{*hHits};
auto const& hits = ctx.get(*hHits);

ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
assert(tuples_d && quality_d);

// zero tuples
cudautils::launchZero(tuples_d, cudaStream);
cms::cuda::launchZero(tuples_d, cudaStream);

auto nhits = hh.nHits();
assert(nhits <= pixelGPUConstants::maxNumberOfHits);
Expand Down Expand Up @@ -108,13 +108,13 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
if (m_params.doStats_)
kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_);

cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d);
cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d);

// remove duplicates (tracks that share a doublet)
kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d);

kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream);
cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream);
kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());

if (nhits > 1 && m_params.lateFishbone_) {
Expand Down Expand Up @@ -154,7 +154,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA

// fill hit->track "map"
kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream);
cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream);
kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());

// remove duplicates (tracks that share a hit)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
auto *quality_d = (Quality *)(&tracks_d->m_quality);

// zero tuples
cudautils::launchZero(tuples_d, cudaStream);
cms::cuda::launchZero(tuples_d, cudaStream);

auto nhits = hh.nHits();
assert(nhits <= pixelGPUConstants::maxNumberOfHits);
Expand Down Expand Up @@ -96,7 +96,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *

blockSize = 128;
numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize;
cudautils::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_, tuples_d);
cms::cuda::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_, tuples_d);

// remove duplicates (tracks that share a doublet)
numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize;
Expand All @@ -108,7 +108,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize;
kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
tuples_d, quality_d, device_tupleMultiplicity_.get());
cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream);
cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream);
kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
tuples_d, quality_d, device_tupleMultiplicity_.get());
cudaCheck(cudaGetLastError());
Expand Down Expand Up @@ -160,7 +160,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
#endif

// in principle we can use "nhits" to heuristically dimension the workspace...
device_isOuterHitOfCell_ = cudautils::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
assert(device_isOuterHitOfCell_.get());
{
int threadsPerBlock = 128;
Expand All @@ -175,7 +175,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
cudaCheck(cudaGetLastError());
}

device_theCells_ = cudautils::make_device_unique<GPUCACell[]>(m_params.maxNumberOfDoublets_, stream);
device_theCells_ = cms::cuda::make_device_unique<GPUCACell[]>(m_params.maxNumberOfDoublets_, stream);

#ifdef GPU_DEBUG
cudaDeviceSynchronize();
Expand Down Expand Up @@ -252,7 +252,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
tuples_d, quality_d, device_hitToTuple_.get());
cudaCheck(cudaGetLastError());
cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream);
cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream);
cudaCheck(cudaGetLastError());
kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, device_hitToTuple_.get());
cudaCheck(cudaGetLastError());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,6 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) {
} else {
*device_nCells_ = 0;
}
cudautils::launchZero(device_tupleMultiplicity_.get(), stream);
cudautils::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm...
cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream);
cms::cuda::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm...
}
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription&
PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d,
float bfield,
cudaStream_t stream) const {
PixelTrackHeterogeneous tracks(cudautils::make_device_unique<pixelTrack::TrackSoA>(stream));
PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));

auto* soa = tracks.get();

Expand Down
8 changes: 4 additions & 4 deletions RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv,
auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;

// Fit internals
auto hitsGPU_ = cudautils::make_device_unique<double[]>(
auto hitsGPU_ = cms::cuda::make_device_unique<double[]>(
maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream);
auto hits_geGPU_ = cudautils::make_device_unique<float[]>(
auto hits_geGPU_ = cms::cuda::make_device_unique<float[]>(
maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream);
auto fast_fit_resultsGPU_ = cudautils::make_device_unique<double[]>(
auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique<double[]>(
maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream);
auto circle_fit_resultsGPU_holder =
cudautils::make_device_unique<char[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream);
cms::cuda::make_device_unique<char[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream);
Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get());

for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
Expand Down

0 comments on commit 6dfc6f9

Please sign in to comment.