diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index 2c7536c..1cb1460 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -19,10 +19,13 @@ #pragma once #include +#include +#include #include #include +#include namespace nvbench { @@ -39,18 +42,36 @@ namespace nvbench struct cuda_stream { /** - * Constructs a cuda_stream that owns a new stream, created with - * `cudaStreamCreate`. + * Constructs a cuda_stream that owns a new stream, created with `cudaStreamCreate`. + * + * @param device The device that this stream should be associated with. If no device is provided, + * the stream will be associated with the device that is active at the call time. */ - cuda_stream() - : m_stream{[]() { + explicit cuda_stream(std::optional device) + : m_stream{[device]() { cudaStream_t s; - NVBENCH_CUDA_CALL(cudaStreamCreate(&s)); + if (device.has_value()) + { + nvbench::detail::device_scope scope_guard{device.value().get_id()}; + NVBENCH_CUDA_CALL(cudaStreamCreate(&s)); + } + else + { + NVBENCH_CUDA_CALL(cudaStreamCreate(&s)); + } return s; }(), stream_deleter{true}} {} + /** + * @brief Constructs a new cuda_stream tha is asociated with the device that is active at the call + * time. + */ + cuda_stream() + : cuda_stream(std::nullopt) + {} + /** * Constructs a `cuda_stream` from an explicit cudaStream_t. * diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 69ceb7e..2d65fe5 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -39,7 +39,13 @@ namespace nvbench::detail measure_cold_base::measure_cold_base(state &exec_state) : m_state{exec_state} - , m_launch{m_state.get_cuda_stream()} + , m_launch{nvbench::launch([this]() -> decltype(auto) { + if (!m_state.get_cuda_stream().has_value()) + { + m_state.set_cuda_stream(nvbench::cuda_stream{m_state.get_device()}); + } + return m_state.get_cuda_stream().value(); + }())} , m_run_once{exec_state.get_run_once()} , m_no_block{exec_state.get_disable_blocking_kernel()} , m_min_samples{exec_state.get_min_samples()} diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu index e583cd5..de6abf1 100644 --- a/nvbench/detail/measure_cupti.cu +++ b/nvbench/detail/measure_cupti.cu @@ -165,7 +165,13 @@ measure_cupti_base::measure_cupti_base(state &exec_state) // (formatter doesn't handle `try :` very well...) try : m_state{exec_state} - , m_launch{m_state.get_cuda_stream()} + , m_launch{[this]() -> decltype(auto) { + if (!m_state.get_cuda_stream().has_value()) + { + m_state.set_cuda_stream(nvbench::cuda_stream{m_state.get_device()}); + } + return m_state.get_cuda_stream().value(); + }()} , m_cupti{*m_state.get_device(), add_metrics(m_state)} {} // clang-format on diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu index 9497122..2f231f1 100644 --- a/nvbench/detail/measure_hot.cu +++ b/nvbench/detail/measure_hot.cu @@ -37,7 +37,13 @@ namespace nvbench::detail measure_hot_base::measure_hot_base(state &exec_state) : m_state{exec_state} - , m_launch{m_state.get_cuda_stream()} + , m_launch{nvbench::launch([this]() -> decltype(auto) { + if (!m_state.get_cuda_stream().has_value()) + { + m_state.set_cuda_stream(nvbench::cuda_stream{m_state.get_device()}); + } + return m_state.get_cuda_stream().value(); + }())} , m_min_samples{exec_state.get_min_samples()} , m_min_time{exec_state.get_min_time()} , m_skip_time{exec_state.get_skip_time()} diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 53c7413..5d72477 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -63,7 +63,10 @@ struct state state &operator=(const state &) = delete; state &operator=(state &&) = default; - [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; } + [[nodiscard]] const std::optional &get_cuda_stream() const + { + return m_cuda_stream; + } void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); } /// The CUDA device associated with with this benchmark state. May be @@ -261,7 +264,6 @@ private: std::optional device, std::size_t type_config_index); - nvbench::cuda_stream m_cuda_stream; std::reference_wrapper m_benchmark; nvbench::named_values m_axis_values; std::optional m_device; @@ -277,6 +279,8 @@ private: nvbench::float64_t m_skip_time; nvbench::float64_t m_timeout; + std::optional m_cuda_stream; + // Deadlock protection. See blocking_kernel's class doc for details. nvbench::float64_t m_blocking_kernel_timeout{30.0}; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index 3cf105c..2f4e284 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -41,6 +41,7 @@ state::state(const benchmark_base &bench) , m_max_noise{bench.get_max_noise()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} + , m_cuda_stream{std::nullopt} {} state::state(const benchmark_base &bench, @@ -58,6 +59,7 @@ state::state(const benchmark_base &bench, , m_max_noise{bench.get_max_noise()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} + , m_cuda_stream{std::nullopt} {} nvbench::int64_t state::get_int64(const std::string &axis_name) const diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index 4928ebc..01cacaa 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -3,6 +3,7 @@ set(test_srcs benchmark.cu create.cu cuda_timer.cu + cuda_stream.cu cpu_timer.cu enum_type_list.cu float64_axis.cu diff --git a/testing/cuda_stream.cu b/testing/cuda_stream.cu new file mode 100644 index 0000000..a721a6e --- /dev/null +++ b/testing/cuda_stream.cu @@ -0,0 +1,77 @@ +/* + * Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "test_asserts.cuh" + +#include + +namespace +{ +#ifdef NVBENCH_HAS_CUPTI +/** + * @brief Queries and returns the device id that the given \p cuda_stream is associated with + * + * @param cuda_stream The stream to get the device id for + * @return The device id that \p cuda_stream is associated with + */ +int get_device_of_stream(cudaStream_t cuda_stream) +{ + CUcontext ctx; + NVBENCH_DRIVER_API_CALL(cuStreamGetCtx(CUstream{cuda_stream}, &ctx)); + NVBENCH_DRIVER_API_CALL(cuCtxPushCurrent(ctx)); + CUdevice device_id{}; + NVBENCH_DRIVER_API_CALL(cuCtxGetDevice(&device_id)); + NVBENCH_DRIVER_API_CALL(cuCtxPopCurrent(&ctx)); + return static_cast(device_id); +} +#endif +} // namespace + +void test_basic() +{ +#ifdef NVBENCH_HAS_CUPTI + // Get devices + auto devices = nvbench::device_manager::get().get_devices(); + + // Iterate over devices + for (auto const &device_info : devices) + { + // Create stream on the device before it becomes the active device + nvbench::cuda_stream device_stream(device_info); + + // Verify cuda stream is associated with the correct cuda device + ASSERT(get_device_of_stream(device_stream.get_stream()) == device_info.get_id()); + + // Set the device as active device + device_info.set_active(); + + // Create the stream (implicitly) on the device that is currently active + nvbench::cuda_stream current_device_stream{}; + + // Verify the cuda stream was in fact associated with the currently active device + ASSERT(get_device_of_stream(current_device_stream.get_stream()) == device_info.get_id()); + } +#endif +} + +int main() { test_basic(); } diff --git a/testing/state.cu b/testing/state.cu index a2300d1..0064556 100644 --- a/testing/state.cu +++ b/testing/state.cu @@ -43,8 +43,7 @@ struct state_tester : public nvbench::state void set_param(std::string name, T &&value) { this->state::m_axis_values.set_value(std::move(name), - nvbench::named_values::value_type{ - std::forward(value)}); + nvbench::named_values::value_type{std::forward(value)}); } }; } // namespace nvbench::detail @@ -57,6 +56,9 @@ void test_streams() state_tester state{bench}; + // Confirm that the stream hasn't been initialized yet + ASSERT(!state.get_cuda_stream().has_value()); + // Test non-owning stream cudaStream_t default_stream = 0; state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});