Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes cudaErrorInvalidValue when running on nvbench-created cuda stream #113

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
8 changes: 7 additions & 1 deletion nvbench/detail/measure_cold.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,13 @@ namespace nvbench::detail

measure_cold_base::measure_cold_base(state &exec_state)
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_launch{nvbench::launch([this]() -> decltype(auto) {
if (!m_state.get_cuda_stream().has_value())
{
m_state.set_cuda_stream(nvbench::cuda_stream{m_state.get_device()});
}
return m_state.get_cuda_stream().value();
}())}
, m_run_once{exec_state.get_run_once()}
, m_no_block{exec_state.get_disable_blocking_kernel()}
, m_min_samples{exec_state.get_min_samples()}
Expand Down
8 changes: 7 additions & 1 deletion nvbench/detail/measure_cupti.cu
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,13 @@ measure_cupti_base::measure_cupti_base(state &exec_state)
// (formatter doesn't handle `try :` very well...)
try
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_launch{[this]() -> decltype(auto) {
if (!m_state.get_cuda_stream().has_value())
{
m_state.set_cuda_stream(nvbench::cuda_stream{m_state.get_device()});
}
return m_state.get_cuda_stream().value();
}()}
, m_cupti{*m_state.get_device(), add_metrics(m_state)}
{}
// clang-format on
Expand Down
8 changes: 7 additions & 1 deletion nvbench/detail/measure_hot.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@ namespace nvbench::detail

measure_hot_base::measure_hot_base(state &exec_state)
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_launch{nvbench::launch([this]() -> decltype(auto) {
if (!m_state.get_cuda_stream().has_value())
{
m_state.set_cuda_stream(nvbench::cuda_stream{m_state.get_device()});
}
return m_state.get_cuda_stream().value();
Comment on lines +41 to +45
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels weird to have the initialization of the optional external to state.

How about putting this logic inside state::get_cuda_stream instead and don't expose the optional externally.

Copy link
Author

@elstehle elstehle Feb 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about putting this logic inside state::get_cuda_stream instead and don't expose the optional externally.

@allisonvacanti and I have discussed that option too but agreed to prefer explicitly setting the stream over implicitly initializing it as a byproduct, if it didn't exist. Considering the user interfacing with the API, I feel that, for multi-GPU systems, it's safer to make it explicit when resources are created and what device they are associated with. Especially, when the current device may influence what device a resource is associated with.

That said, I'm fine to have it any way we decide makes more sense. @allisonvacanti what do you think?

}())}
, m_min_samples{exec_state.get_min_samples()}
, m_min_time{exec_state.get_min_time()}
, m_skip_time{exec_state.get_skip_time()}
Expand Down
7 changes: 5 additions & 2 deletions nvbench/state.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ struct state
state &operator=(const state &) = delete;
state &operator=(state &&) = default;

[[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; }
[[nodiscard]] const std::optional<nvbench::cuda_stream> &get_cuda_stream() const
{
return m_cuda_stream;
}
void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }

/// The CUDA device associated with this benchmark state. May be
Expand Down Expand Up @@ -276,7 +279,7 @@ private:
nvbench::float64_t m_skip_time;
nvbench::float64_t m_timeout;

nvbench::cuda_stream m_cuda_stream;
std::optional<nvbench::cuda_stream> m_cuda_stream;

// Deadlock protection. See blocking_kernel's class doc for details.
nvbench::float64_t m_blocking_kernel_timeout{30.0};
Expand Down
2 changes: 1 addition & 1 deletion nvbench/state.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ state::state(const benchmark_base &bench,
, m_max_noise{bench.get_max_noise()}
, m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()}
, m_cuda_stream{m_device}
, m_cuda_stream{std::nullopt}
{}

nvbench::int64_t state::get_int64(const std::string &axis_name) const
Expand Down
6 changes: 4 additions & 2 deletions testing/state.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ struct state_tester : public nvbench::state
void set_param(std::string name, T &&value)
{
this->state::m_axis_values.set_value(std::move(name),
nvbench::named_values::value_type{
std::forward<T>(value)});
nvbench::named_values::value_type{std::forward<T>(value)});
}
};
} // namespace nvbench::detail
Expand All @@ -57,6 +56,9 @@ void test_streams()

state_tester state{bench};

// Confirm that the stream hasn't been initialized yet
ASSERT(!state.get_cuda_stream().has_value());

// Test non-owning stream
cudaStream_t default_stream = 0;
state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});
Expand Down