Use unified memory for conditions #157

Open · wants to merge 12 commits into base CMSSW_10_4_X_Patatrack
@@ -1,7 +1,7 @@
#ifndef CalibTracker_SiPixelESProducers_SiPixelGainCalibrationForHLTGPU_H
#define CalibTracker_SiPixelESProducers_SiPixelGainCalibrationForHLTGPU_H

#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAESManaged.h"
#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h"

#include <cuda/api_wrappers.h>
@@ -19,14 +19,8 @@ class SiPixelGainCalibrationForHLTGPU {
const SiPixelGainForHLTonGPU *getGPUProductAsync(cuda::stream_t<>& cudaStream) const;

private:
const SiPixelGainCalibrationForHLT *gains_ = nullptr;
SiPixelGainForHLTonGPU *gainForHLTonHost_ = nullptr;
struct GPUData {
~GPUData();
SiPixelGainForHLTonGPU *gainForHLTonGPU = nullptr;
SiPixelGainForHLTonGPU_DecodingStructure *gainDataOnGPU = nullptr;
};
CUDAESProduct<GPUData> gpuData_;
CUDAESManaged helper_;
SiPixelGainForHLTonGPU *gainForHLT_ = nullptr;
};

#endif
@@ -7,8 +7,7 @@

#include <cuda.h>

SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT& gains, const TrackerGeometry& geom):
gains_(&gains)
SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT& gains, const TrackerGeometry& geom)
{
// bizarre logic (looking for first strip-det), don't ask
auto const & dus = geom.detUnits();
@@ -25,8 +24,7 @@ SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGa
std::cout << "sizes " << sizeof(char) << ' ' << sizeof(uint8_t) << ' ' << sizeof(SiPixelGainForHLTonGPU::DecodingStructure) << std::endl;
*/

cudaCheck(cudaMallocHost((void**) & gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU)));
//gainForHLTonHost_->v_pedestals = gainDataOnGPU_; // how to do this?
helper_.allocate(&gainForHLT_, 1);

// do not read back from the (possibly write-combined) memory buffer
auto minPed = gains.getPedLow();
@@ -36,21 +34,21 @@ SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGa
auto nBinsToUseForEncoding = 253;

// we will simplify later (not everything is needed....)
gainForHLTonHost_->minPed_ = minPed;
gainForHLTonHost_->maxPed_ = maxPed;
gainForHLTonHost_->minGain_= minGain;
gainForHLTonHost_->maxGain_= maxGain;
gainForHLT_->minPed_ = minPed;
gainForHLT_->maxPed_ = maxPed;
gainForHLT_->minGain_= minGain;
gainForHLT_->maxGain_= maxGain;

gainForHLTonHost_->numberOfRowsAveragedOver_ = 80;
gainForHLTonHost_->nBinsToUseForEncoding_ = nBinsToUseForEncoding;
gainForHLTonHost_->deadFlag_ = 255;
gainForHLTonHost_->noisyFlag_ = 254;
gainForHLT_->numberOfRowsAveragedOver_ = 80;
gainForHLT_->nBinsToUseForEncoding_ = nBinsToUseForEncoding;
gainForHLT_->deadFlag_ = 255;
gainForHLT_->noisyFlag_ = 254;

gainForHLTonHost_->pedPrecision = static_cast<float>(maxPed - minPed) / nBinsToUseForEncoding;
gainForHLTonHost_->gainPrecision = static_cast<float>(maxGain - minGain) / nBinsToUseForEncoding;
gainForHLT_->pedPrecision = static_cast<float>(maxPed - minPed) / nBinsToUseForEncoding;
gainForHLT_->gainPrecision = static_cast<float>(maxGain - minGain) / nBinsToUseForEncoding;

/*
std::cout << "precisions g " << gainForHLTonHost_->pedPrecision << ' ' << gainForHLTonHost_->gainPrecision << std::endl;
std::cout << "precisions g " << gainForHLT_->pedPrecision << ' ' << gainForHLT_->gainPrecision << std::endl;
*/

// fill the index map
@@ -68,31 +66,21 @@ SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGa
assert(0==p->iend%2);
assert(p->ibegin!=p->iend);
assert(p->ncols>0);
gainForHLTonHost_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(p->ibegin,p->iend), p->ncols);
gainForHLT_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(p->ibegin,p->iend), p->ncols);
// if (ind[i].detid!=dus[i]->geographicalId()) std::cout << ind[i].detid<<"!="<<dus[i]->geographicalId() << std::endl;
// gainForHLTonHost_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(ind[i].ibegin,ind[i].iend), ind[i].ncols);
// gainForHLT_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(ind[i].ibegin,ind[i].iend), ind[i].ncols);
}

}
helper_.allocate(&(gainForHLT_->v_pedestals), gains.data().size(), sizeof(char)); // override the element size because essentially we reinterpret_cast on the fly
std::memcpy(gainForHLT_->v_pedestals, gains.data().data(), gains.data().size()*sizeof(char));

SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() {
cudaCheck(cudaFreeHost(gainForHLTonHost_));
helper_.advise();
}

SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() {
cudaCheck(cudaFree(gainForHLTonGPU));
cudaCheck(cudaFree(gainDataOnGPU));
SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() {
}

const SiPixelGainForHLTonGPU *SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cuda::stream_t<>& cudaStream) const {
const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cuda::stream_t<>& stream) {
cudaCheck(cudaMalloc((void**) & data.gainForHLTonGPU, sizeof(SiPixelGainForHLTonGPU)));
cudaCheck(cudaMalloc((void**) & data.gainDataOnGPU, this->gains_->data().size())); // TODO: this could be changed to cuda::memory::device::unique_ptr<>
// gains.data().data() is used also for non-GPU code, we cannot allocate it on aligned and write-combined memory
cudaCheck(cudaMemcpyAsync(data.gainDataOnGPU, this->gains_->data().data(), this->gains_->data().size(), cudaMemcpyDefault, stream.id()));

cudaCheck(cudaMemcpyAsync(data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals), &(data.gainDataOnGPU), sizeof(SiPixelGainForHLTonGPU_DecodingStructure*), cudaMemcpyDefault, stream.id()));
});
return data.gainForHLTonGPU;
helper_.prefetchAsync(cudaStream);
return gainForHLT_;
}
61 changes: 61 additions & 0 deletions Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
@@ -2,6 +2,7 @@
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h

#include <cstdint>
#include <array>

namespace phase1PixelTopology {

@@ -29,6 +30,66 @@ namespace phase1PixelTopology {
};


template<class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
-> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
{
return {{ f(Indices)... }};
}

template<int N, class Function>
constexpr auto map_to_array(Function f)
-> std::array<typename std::result_of<Function(std::size_t)>::type, N>
{
return map_to_array_helper(f, std::make_index_sequence<N>{});
}


constexpr uint32_t findMaxModuleStride() {
bool go = true;
int n=2;
while (go) {
for (uint8_t i=1; i<11; ++i) {
if (layerStart[i]%n !=0) {go=false; break;}
}
if(!go) break;
n*=2;
}
return n/2;
}

constexpr uint32_t maxModuleStride = findMaxModuleStride();


constexpr uint8_t findLayer(uint32_t detId) {
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}

constexpr uint8_t findLayerFromCompact(uint32_t detId) {
detId*=maxModuleStride;
for (uint8_t i=0; i<11; ++i) if (detId<layerStart[i+1]) return i;
return 11;
}


constexpr uint32_t layerIndexSize = numberOfModules/maxModuleStride;
constexpr std::array<uint8_t,layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);

constexpr bool validateLayerIndex() {
bool res=true;
for (auto i=0U; i<numberOfModules; ++i) {
auto j = i/maxModuleStride;
res &=(layer[j]<10);
res &=(i>=layerStart[layer[j]]);
res &=(i<layerStart[layer[j]+1]);
}
return res;
}

static_assert(validateLayerIndex(),"layer from detIndex algo is buggy");


// this is for the ROC n<512 (upgrade 1024)
constexpr inline
uint16_t divu52(uint16_t n) {
@@ -141,5 +141,13 @@ int main() {
assert(std::get<1>(ori)==bp);
}

using namespace phase1PixelTopology;
for (auto i=0U; i<numberOfModules; ++i) {
assert(layer[i]<10);
assert(i>=layerStart[layer[i]]);
assert(i<layerStart[layer[i]+1]);
}


return 0;
}
57 changes: 57 additions & 0 deletions HeterogeneousCore/CUDACore/interface/CUDAESManaged.h
@@ -0,0 +1,57 @@
#ifndef HeterogeneousCore_CUDACore_interface_CUDAESManaged_h
#define HeterogeneousCore_CUDACore_interface_CUDAESManaged_h

#include <atomic>
#include <vector>

#include <cuda_runtime.h>
#include <cuda/api_wrappers.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

/**
* Class to help with memory allocations for ESProducts. Each CUDA
* ESProduct wrapper should
* - include an instance of this class as a member
* - use allocate() to allocate the memory buffers
* - call advise() after filling the buffers on the CPU
* - call prefetchAsync() before returning the CUDA ESProduct
*
* It owns the allocated memory and frees it in its destructor.
*/
class CUDAESManaged {
public:
CUDAESManaged();
~CUDAESManaged();

template <typename T>
T *allocate(size_t elements, size_t elementSize=sizeof(T)) {
T *ptr = nullptr;
auto size = elementSize*elements;
cudaCheck(cudaMallocManaged(&ptr, size));
buffers_.emplace_back(ptr, size);
return ptr;
}

template <typename T>
void allocate(T **ptr, size_t elements, size_t elementSize=sizeof(T)) {
*ptr = allocate<T>(elements, elementSize);
}

// Record a buffer allocated elsewhere to be used in advise/prefetch
/*
void record(void *ptr, size_t size) {
buffers_.emplace_back(ptr, size);
}
*/

void advise() const;

void prefetchAsync(cuda::stream_t<>& stream) const;

private:
std::vector<std::pair<void *, size_t> > buffers_;
mutable std::vector<std::atomic<bool>> prefetched_;
};

#endif
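
For illustration, here is a minimal usage sketch of the CUDAESManaged helper following the sequence described in the class comment (allocate, fill on the CPU, advise, prefetch). The ExampleConditionsGPU wrapper, its float payload, and its method names are made up for this sketch and are not part of the PR; only the CUDAESManaged calls come from the header above.

#include <algorithm>
#include <vector>

#include <cuda/api_wrappers.h>

#include "HeterogeneousCore/CUDACore/interface/CUDAESManaged.h"

// Hypothetical ES-product wrapper; illustrates the intended call sequence only.
class ExampleConditionsGPU {
public:
  explicit ExampleConditionsGPU(const std::vector<float>& values) {
    // allocate a unified-memory buffer through the helper (the helper keeps ownership)
    helper_.allocate(&data_, values.size());
    // fill the buffer on the CPU
    std::copy(values.begin(), values.end(), data_);
    // mark the buffers as read-mostly now that they are fully populated
    helper_.advise();
  }

  const float* getGPUProductAsync(cuda::stream_t<>& cudaStream) const {
    // make sure the buffers are resident on the device of this stream
    helper_.prefetchAsync(cudaStream);
    return data_;
  }

private:
  CUDAESManaged helper_;
  float* data_ = nullptr;
};

Since the helper frees the allocations in its own destructor, the wrapper needs no explicit cleanup of its own.
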
93 changes: 0 additions & 93 deletions HeterogeneousCore/CUDACore/interface/CUDAESProduct.h

This file was deleted.

35 changes: 35 additions & 0 deletions HeterogeneousCore/CUDACore/src/CUDAESManaged.cc
@@ -0,0 +1,35 @@
#include "HeterogeneousCore/CUDACore/interface/CUDAESManaged.h"
#include "HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h"

CUDAESManaged::CUDAESManaged(): prefetched_(numberOfCUDADevices()) {
for(auto& pref: prefetched_) {
pref.store(false);
}
}

CUDAESManaged::~CUDAESManaged() {
for(auto& ptrSize: buffers_) {
cudaFree(ptrSize.first);
}
}

void CUDAESManaged::advise() const {
for(const auto& ptrSize: buffers_) {
cudaCheck(cudaMemAdvise(ptrSize.first, ptrSize.second, cudaMemAdviseSetReadMostly, 0)); // device is ignored for this advise
}
}

void CUDAESManaged::prefetchAsync(cuda::stream_t<>& stream) const {
// The boolean atomic is an optimization attempt; it doesn't really
// matter if more than one thread/edm stream issues the prefetches,
// as long as most of the prefetches are avoided.
auto& pref = prefetched_[stream.device_id()];
if(pref.load())
return;
Review comment (from the author):

Looking through the code again I noticed that here we/I have a single flag to control behaviour for all devices. I think this should be changed to either

  1. the atomic<bool> flag should be changed to per-device flags, or
  2. issue the prefetch for all devices below

In principle the latter (2) could make more sense, as we expect all devices to run all code (and thus need all GPU conditions) anyway. But an async prefetch also needs a CUDA stream, so it needs a bit more thought: basically CUDAESManaged would hold additional CUDA streams (one per device) and use CUDA events to "synchronize" with the argument stream (otherwise there would be no gain from the asynchrony). The atomic<bool> flag would then effectively be replaced with a cudaEventQuery() on the per-device event. I would naively expect atomic<bool> to be faster than cudaEventQuery(), which would favor option 1. (A rough sketch of option 2 follows after this file.)


for(const auto& ptrSize: buffers_) {
cudaMemPrefetchAsync(ptrSize.first, ptrSize.second, stream.device_id(), stream.id());
}

pref.store(true);
}
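
As context for the review comment above, here is a rough sketch of option (2): issuing the prefetches for all devices from helper-owned per-device streams, and replacing the atomic<bool> with a per-device CUDA event that the caller's stream can wait on. This is a speculative illustration using the raw CUDA runtime API rather than the cuda-api-wrappers types used in this PR; the class name, its methods, and the omission of error checking and resource cleanup are assumptions of the sketch, not part of the proposed change.

#include <utility>
#include <vector>

#include <cuda_runtime.h>

// Speculative helper for option (2): prefetch managed buffers on every device
// once, and synchronize later consumers via per-device events.
class PrefetchAllDevices {
public:
  explicit PrefetchAllDevices(int nDevices): streams_(nDevices), events_(nDevices) {
    for(int d = 0; d < nDevices; ++d) {
      cudaSetDevice(d);
      cudaStreamCreateWithFlags(&streams_[d], cudaStreamNonBlocking);
      cudaEventCreateWithFlags(&events_[d], cudaEventDisableTiming);
    }
  }

  // issue the prefetches on every device and record a per-device event
  void prefetchAllAsync(const std::vector<std::pair<void*, size_t>>& buffers) {
    int current = 0;
    cudaGetDevice(&current);
    for(int d = 0; d < static_cast<int>(streams_.size()); ++d) {
      cudaSetDevice(d);
      for(const auto& ptrSize: buffers) {
        cudaMemPrefetchAsync(ptrSize.first, ptrSize.second, d, streams_[d]);
      }
      cudaEventRecord(events_[d], streams_[d]);
    }
    cudaSetDevice(current);
  }

  // make the caller's stream wait until the prefetch on its device has completed;
  // cudaEventQuery() plays the role of the atomic<bool> fast path
  void synchronizeWith(cudaStream_t stream, int device) const {
    if(cudaEventQuery(events_[device]) != cudaSuccess) {
      cudaStreamWaitEvent(stream, events_[device], 0);
    }
  }

private:
  std::vector<cudaStream_t> streams_;
  std::vector<cudaEvent_t> events_;
};

Whether this is worth the extra streams and events depends on whether cudaEventQuery() is measurably slower than the atomic<bool> check, which is exactly the open question raised in the comment.
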