[RFC] New version of CudaCompat #428

Status: Open. Wants to merge 24 commits into base: CMSSW_11_0_X_Patatrack. Changes shown from 23 commits.
50 changes: 35 additions & 15 deletions CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -5,6 +5,7 @@

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

@@ -21,15 +22,15 @@ class HeterogeneousSoA {

explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
explicit HeterogeneousSoA(cudautils::cpu::unique_ptr<T> &&p) : cm_ptr(std::move(p)) {}

auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

auto const &operator*() const { return *get(); }

auto const *operator-> () const { return get(); }

auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

auto &operator*() { return *get(); }

@@ -47,12 +48,15 @@ class HeterogeneousSoA {
// a union won't do it, a variant will not be more efficient
cudautils::device::unique_ptr<T> dm_ptr; //!
cudautils::host::unique_ptr<T> hm_ptr; //!
std::unique_ptr<T> std_ptr; //!
cudautils::cpu::unique_ptr<T> cm_ptr; //!
};

namespace cudaCompat {

struct GPUTraits {
static constexpr const char * name = "GPU";
static constexpr bool runOnDevice = true;

template <typename T>
using unique_ptr = cudautils::device::unique_ptr<T>;

@@ -83,6 +87,9 @@ namespace cudaCompat {
};

struct HostTraits {
static constexpr const char * name = "HOST";
static constexpr bool runOnDevice = false;

template <typename T>
using unique_ptr = cudautils::host::unique_ptr<T>;

@@ -108,32 +115,45 @@ namespace cudaCompat {
};

struct CPUTraits {
static constexpr const char * name = "CPU";
static constexpr bool runOnDevice = false;

template <typename T>
using unique_ptr = cudautils::cpu::unique_ptr<T>;

template <typename T>
static auto make_unique() {
return cudautils::make_cpu_unique<T>(cudaStreamDefault);
}

template <typename T>
using unique_ptr = std::unique_ptr<T>;
static auto make_unique(size_t size) {
return cudautils::make_cpu_unique<T>(size, cudaStreamDefault);
}

template <typename T>
static auto make_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
static auto make_unique(size_t size, cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(size, stream);
}

template <typename T>
static auto make_host_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_host_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_device_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
static auto make_device_unique(size_t size, cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(size, stream);
}
};
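As an illustration of how these traits are meant to be consumed (an editor's sketch, not part of the diff: the helper allocateAndTag is hypothetical, and it assumes the headers on this branch together with the two-argument make_unique(size, stream) overload shown above):

// Sketch only: exercising the Traits members added above.
#include <cstddef>
#include <cstdio>
#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"

template <typename Traits>
void allocateAndTag(size_t n, cudaStream_t stream) {
  // name and runOnDevice are the static members introduced in this PR.
  std::printf("allocating %zu floats with %s traits (runOnDevice=%d)\n",
              n, Traits::name, int(Traits::runOnDevice));
  // GPUTraits, HostTraits and CPUTraits expose the same make_unique interface,
  // so callers stay agnostic about device, pinned-host or plain CPU memory.
  auto buffer = Traits::template make_unique<float[]>(n, stream);
  if constexpr (!Traits::runOnDevice) {
    buffer[0] = 42.f;  // host-accessible memory for HostTraits / CPUTraits
  }
}

int main() {
  // CPUTraits needs no CUDA device; GPUTraits and HostTraits would need one.
  allocateAndTag<cudaCompat::CPUTraits>(1024, cudaStreamDefault);
}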

7 changes: 4 additions & 3 deletions CUDADataFormats/Common/interface/HostProduct.h
@@ -2,6 +2,7 @@
#define CUDADataFormatsCommonHostProduct_H

#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

// a heterogeneous unique pointer...
template <typename T>
@@ -13,17 +14,17 @@ class HostProduct {
HostProduct& operator=(HostProduct&&) = default;

explicit HostProduct(cudautils::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
explicit HostProduct(std::unique_ptr<T>&& p) : std_ptr(std::move(p)) {}
explicit HostProduct(cudautils::cpu::unique_ptr<T>&& p) : cm_ptr(std::move(p)) {}

auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
auto const* get() const { return hm_ptr ? hm_ptr.get() : cm_ptr.get(); }

auto const& operator*() const { return *get(); }

auto const* operator-> () const { return get(); }

private:
cudautils::host::unique_ptr<T> hm_ptr; //!
std::unique_ptr<T> std_ptr; //!
cudautils::cpu::unique_ptr<T> cm_ptr; //!
};

#endif
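A usage sketch for the updated class (editor's illustration, not in the PR; wrapBuffers is a made-up helper, and it assumes make_host_unique and make_cpu_unique take a size plus a stream, as in the headers on this branch):

// Sketch only: HostProduct can now wrap either flavour of host-side memory.
#include <cstdint>
#include "CUDADataFormats/Common/interface/HostProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

void wrapBuffers(cudaStream_t stream) {
  // Pinned host memory, e.g. filled by a copy back from the GPU.
  HostProduct<uint32_t[]> fromPinned(cudautils::make_host_unique<uint32_t[]>(10, stream));
  // Plain malloc'ed memory, e.g. when the whole chain ran on the CPU.
  HostProduct<uint32_t[]> fromCpu(cudautils::make_cpu_unique<uint32_t[]>(10, stream));
  // Both expose the same const access through get() / operator* / operator->.
}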
1 change: 0 additions & 1 deletion CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
@@ -2,7 +2,6 @@
#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

#include <cuda_runtime.h>
1 change: 0 additions & 1 deletion CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
@@ -1,7 +1,6 @@
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

template <>
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -72,7 +72,7 @@ namespace cudautils {
inline void launchFinalize(Histo *__restrict__ h,
uint8_t *__restrict__ ws
#ifndef __CUDACC__
= cudaStreamDefault
= nullptr
Review comment: I am curious if using cudaStreamDefault was giving problems?

Author: no, to me nullptr makes more sense for a pointer (even if they are all == 0).

#endif
,
cudaStream_t stream
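For the exchange above, a small illustration (editor's sketch; launchFinalizeLike is a made-up name): cudaStreamDefault is the flag macro 0x00, an integer literal, so it only works as the default of a pointer parameter because a literal zero is a null pointer constant, while nullptr states the intent directly.

// Sketch only: defaulting a pointer parameter.
#include <cstdint>

inline void launchFinalizeLike(uint8_t* ws = nullptr) {  // the new default in this PR
  // ... allocate a scratch workspace here if ws == nullptr ...
}
// inline void launchFinalizeLike(uint8_t* ws = cudaStreamDefault) {}  // old form: compiles only because the macro expands to 0x00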
86 changes: 86 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h
@@ -0,0 +1,86 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h
#define HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h

#include <memory>
#include <functional>

#include <cstdlib>
#include <cuda_runtime.h>
Review comment: from a look at the file, I think #include <cuda_runtime.h> could be removed?


namespace cudautils {
namespace cpu {
namespace impl {
// Additional layer of types to distinguish from device:: and host::unique_ptr
class CPUDeleter {
public:
CPUDeleter() = default;

void operator()(void *ptr) {
::free(ptr);
}
};
} // namespace impl

template <typename T>
using unique_ptr = std::unique_ptr<T, impl::CPUDeleter>;

namespace impl {
template <typename T>
struct make_cpu_unique_selector {
using non_array = cudautils::cpu::unique_ptr<T>;
};
template <typename T>
struct make_cpu_unique_selector<T[]> {
using unbounded_array = cudautils::cpu::unique_ptr<T[]>;
};
template <typename T, size_t N>
struct make_cpu_unique_selector<T[N]> {
struct bounded_array {};
};
} // namespace impl
} // namespace cpu

template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::non_array make_cpu_unique(cudaStream_t) {
Review comment: trying to better understand this: calling make_cpu_unique would be roughly equivalent to C++20's std::make_unique_default_init, plus it sets the deleter to just call free() instead of calling the destructors?

static_assert(std::is_trivially_constructible<T>::value,
"Allocating with non-trivial constructor on the cpu memory is not supported");
void *mem = ::malloc(sizeof(T));
return typename cpu::impl::make_cpu_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
cpu::impl::CPUDeleter()};
}

template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array make_cpu_unique(size_t n, cudaStream_t) {
using element_type = typename std::remove_extent<T>::type;
static_assert(std::is_trivially_constructible<element_type>::value,
"Allocating with non-trivial constructor on the cpu memory is not supported");
void *mem = ::malloc(n * sizeof(element_type));
return typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem),
cpu::impl::CPUDeleter()};
}

template <typename T, typename... Args>
typename cpu::impl::make_cpu_unique_selector<T>::bounded_array make_cpu_unique(Args &&...) = delete;

// No check for the trivial constructor, make it clear in the interface
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::non_array make_cpu_unique_uninitialized(cudaStream_t) {
void *mem = ::malloc(sizeof(T));
return typename cpu::impl::make_cpu_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
cpu::impl::CPUDeleter()};
}

template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array make_cpu_unique_uninitialized(size_t n, cudaStream_t) {
using element_type = typename std::remove_extent<T>::type;
void *mem = ::malloc(n * sizeof(element_type));
return typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem),
cpu::impl::CPUDeleter()};
}

template <typename T, typename... Args>
typename cpu::impl::make_cpu_unique_selector<T>::bounded_array make_cpu_unique_uninitialized(Args &&...) =
delete;
} // namespace cudautils

#endif
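To make the semantics raised in the review comment concrete (an editor's sketch, not part of the PR, based on a reading of the code above): the memory comes from plain malloc, is left uninitialized, and the deleter calls free() without invoking destructors, which is why the static_assert restricts the helpers to trivially constructible types.

// Sketch only: what make_cpu_unique provides.
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

struct Hit {  // trivially constructible, so it passes the static_assert
  float x, y, z;
};

int main() {
  // The stream argument only mirrors make_device_unique / make_host_unique;
  // it is ignored for plain CPU allocations.
  auto hits = cudautils::make_cpu_unique<Hit[]>(1024, cudaStreamDefault);
  hits[0] = {1.f, 2.f, 3.f};  // contents start out uninitialized, as with malloc

  // cudautils::make_cpu_unique<std::string>(cudaStreamDefault) would not compile:
  // std::string is not trivially constructible, and CPUDeleter would call free()
  // without running ~string().
}  // free(hits.get()) runs here, with no destructor calls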
18 changes: 12 additions & 6 deletions HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
@@ -6,6 +6,10 @@
*/

#ifndef __CUDACC__
#define CUDA_KERNELS_ON_CPU
#endif

#ifdef CUDA_KERNELS_ON_CPU

#include <algorithm>
#include <cstdint>
@@ -86,18 +90,20 @@ namespace cudaCompat {
#define __forceinline__
#endif

// make sure functions are inlined to avoid multiple definitions
#ifndef __CUDA_ARCH__
using namespace cudaCompat;
#endif

#endif // CUDA_KERNELS_ON_CPU


// make sure functions are inlined to avoid multiple definitions
#ifndef __CUDACC__
#undef __global__
#define __global__ inline __attribute__((always_inline))
#undef __forceinline__
#define __forceinline__ inline __attribute__((always_inline))
#endif

#ifndef __CUDA_ARCH__
using namespace cudaCompat;
#endif

#endif

#endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
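To show the pattern this macro targets (editor's sketch, not part of the diff; vectorScale is a made-up kernel), a grid-stride kernel written for the GPU also runs serially on the CPU through the cudaCompat stand-ins, where gridDim.x, blockDim.x and threadIdx.x collapse to a single iteration:

// Sketch only: the same source compiles with nvcc as a real __global__ kernel
// and, with CUDA_KERNELS_ON_CPU defined, as an inline host function.
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

__global__ void vectorScale(float* x, float a, int n) {
  // On the GPU this is the usual grid-stride loop; on the CPU the cudaCompat
  // stand-ins make blockDim.x == gridDim.x == 1 and threadIdx.x == 0, so the
  // loop simply walks over all n elements in a single call.
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
  for (int i = first; i < n; i += stride) {
    x[i] *= a;
  }
}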
12 changes: 10 additions & 2 deletions HeterogeneousCore/CUDAUtilities/interface/launch.h
@@ -94,10 +94,14 @@ namespace cudautils {
} // namespace detail

// wrappers for cudaLaunchKernel

inline
Review comment (conversation marked as resolved): I will add the inline because it makes sense on its own.

void launch(void (*kernel)(), LaunchParameters config) {
#ifdef CUDA_KERNELS_ON_CPU
Review comment: but I really, really do not want to add a dependency on #ifdefs etc. here.

Author: will find a less intrusive solution.

kernel();
#else
cudaCheck(cudaLaunchKernel(
(const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
#endif
}

template <typename F, typename... Args>
@@ -107,6 +111,9 @@ namespace cudautils {
std::enable_if_t<std::is_void<std::result_of_t<F && (Args && ...)> >::value>
#endif
launch(F* kernel, LaunchParameters config, Args&&... args) {
#ifdef CUDA_KERNELS_ON_CPU
kernel(args...);
#else
using function_type = detail::kernel_traits<F>;
typename function_type::argument_type_tuple args_copy(args...);

@@ -116,10 +123,11 @@
detail::pointer_setter<size>()(pointers, args_copy);
cudaCheck(cudaLaunchKernel(
(const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream));
#endif
}

// wrappers for cudaLaunchCooperativeKernel

inline
void launch_cooperative(void (*kernel)(), LaunchParameters config) {
cudaCheck(cudaLaunchCooperativeKernel(
(const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
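Usage-wise, the same call site then works in both configurations (editor's sketch, not in the PR; it reuses the hypothetical vectorScale kernel from the cudaCompat sketch above and assumes LaunchParameters can be brace-initialized as {gridDim, blockDim, sharedMem, stream}, matching the fields used in the code above):

// Sketch only: launching a kernel through cudautils::launch.
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

__global__ void vectorScale(float* x, float a, int n);  // defined elsewhere

void runScale(float* x, float a, int n, cudaStream_t stream) {
  // With CUDA_KERNELS_ON_CPU defined, the wrapper calls vectorScale(x, a, n)
  // directly; otherwise it goes through cudaLaunchKernel with this configuration.
  cudautils::launch(vectorScale, {32, 128, 0, stream}, x, a, n);
}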
13 changes: 13 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
@@ -3,8 +3,10 @@

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

#include <type_traits>
#include <cstring>

namespace cudautils {
template <typename T>
@@ -15,6 +17,11 @@ namespace cudautils {
cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream));
}

template <typename T>
inline void memsetAsync(cudautils::cpu::unique_ptr<T>& ptr, T value, cudaStream_t) {
::memset(ptr.get(), value, sizeof(T));
}

/**
* The type of `value` is `int` because `cudaMemsetAsync()` takes
* it as an `int`. Note that `cudaMemsetAsync()` sets the value of
@@ -25,6 +32,12 @@
inline void memsetAsync(cudautils::device::unique_ptr<T[]>& ptr, int value, size_t nelements, cudaStream_t stream) {
cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream));
}
template <typename T>
inline void memsetAsync(cudautils::cpu::unique_ptr<T[]>& ptr, int value, size_t nelements, cudaStream_t) {
::memset(ptr.get(), value, nelements * sizeof(T));
}


} // namespace cudautils

#endif
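A usage sketch for the overloads added here (editor's illustration; clearCounters is a hypothetical helper): the same call site clears a buffer regardless of where it lives.

// Sketch only: one call site, two memory spaces.
#include <cstddef>
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"

template <typename Ptr>
void clearCounters(Ptr& counters, size_t n, cudaStream_t stream) {
  // Resolves to cudaMemsetAsync for device::unique_ptr<T[]> and to a plain
  // ::memset for cpu::unique_ptr<T[]>; the stream is ignored in the CPU case.
  cudautils::memsetAsync(counters, 0, n, stream);
}
// e.g. clearCounters(deviceCounters, n, stream); or clearCounters(cpuCounters, n, stream);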
14 changes: 13 additions & 1 deletion HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -7,6 +7,18 @@
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="Launch_t.cpp Launch_t.cu" name="gpuLaunch_t">
</bin>

<bin file="Launch_t.cpp" name="cpuLaunch_t">
<flags CXXFLAGS="-DCUDA_KERNELS_ON_CPU"/>
</bin>

<bin file="Launch_t.cpp Launch_t.cu" name="mixedLaunch_t">
<flags CXXFLAGS="-DCUDA_KERNELS_ON_CPU -DLaunchInCU"/>
</bin>


<bin file="test_GPUSimpleVector.cu" name="test_GPUSimpleVector">
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>
@@ -65,7 +77,7 @@
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="testCatch2Main.cpp,device_unique_ptr_t.cpp,host_unique_ptr_t.cpp,host_noncached_unique_ptr_t.cpp,copyAsync_t.cpp,memsetAsync_t.cpp" name="cudaMemUtils_t">
<bin file="testCatch2Main.cpp,device_unique_ptr_t.cpp,host_unique_ptr_t.cpp,host_noncached_unique_ptr_t.cpp,cpu_unique_ptr_t.cpp,copyAsync_t.cpp,memsetAsync_t.cpp" name="cudaMemUtils_t">
<use name="catch2"/>
</bin>
