[xla:cpu] Prefer sequential execution for small thunk sequences
PiperOrigin-RevId: 678337715
ezhulenev authored and Google-ML-Automation committed Sep 27, 2024
1 parent b7dbeb6 commit 0a31a4d
Showing 3 changed files with 20 additions and 10 deletions.
13 changes: 10 additions & 3 deletions xla/backends/cpu/runtime/thunk_executor.cc
@@ -60,6 +60,7 @@ ThunkExecutor::ThunkExecutor(ThunkSequence thunk_sequence,
sink_.push_back(i);
}
}

// Erase redundant edges between nodes.
int64_t num_erased_edges = RunTransitiveReductionAndUpdatePriorities();

@@ -69,7 +70,7 @@ ThunkExecutor::ThunkExecutor(ThunkSequence thunk_sequence,
is_sequential_ &= (absl::c_count(nodes_defs_[i].in_edges, i - 1) != 0);
}

// Maybe mark execution as sequential if all thunks use small buffers.
// Prefer sequential execution if all thunks use small buffers.
auto uses_small_buffers = [&](const std::unique_ptr<Thunk>& thunk) {
return absl::c_all_of(thunk->buffer_uses(), [&](const BufferUse& use) {
return use.slice().size() <= options.execute_sequential_buffer_threshold;
@@ -79,6 +80,10 @@ ThunkExecutor::ThunkExecutor(ThunkSequence thunk_sequence,
bool small_buffers = absl::c_all_of(thunk_sequence_, uses_small_buffers);
is_sequential_ |= small_buffers;

// Prefer sequential execution for small thunk sequences.
is_sequential_ |=
thunk_sequence_.size() <= options.execute_sequential_num_thunks_threshold;

VLOG(2) << absl::StreamFormat(
"Constructed ThunkExecutor with %d nodes: #source_nodes=%d "
"#sink_nodes=%d, #erased_edges=%d, is_sequential=%v, small_buffers=%v",
@@ -159,8 +164,10 @@ tsl::AsyncValueRef<ThunkExecutor::ExecuteEvent> ThunkExecutor::Execute(
return thunk_sequence_[0]->Execute(params);
}

// If thunk sequence dependencies form a sequential execution graph, we skip
// expensive async execution and simply run thunks one by one.
// When we choose sequential execution strategy (we rely on heuristics and
// a cost model to make the decision), we skip expensive async execution and
// simply run thunks one by one. This minimizes runtime overheads from small
// XLA programs with many cheap operations.
if (is_sequential_) {
return ExecuteSequential(params);
}
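The decision logic above can be summarized with a small standalone sketch (illustrative only, not code from this commit): sequential execution is preferred when the thunk graph is already a chain, when every thunk touches only small buffers, or when the sequence itself is short.

// Illustrative sketch of the heuristic, assuming simplified stand-in types;
// the real ThunkExecutor works on Thunk::buffer_uses() and BufferUse slices.
#include <algorithm>
#include <cstddef>
#include <vector>

struct SketchOptions {
  size_t execute_sequential_buffer_threshold = 512;
  size_t execute_sequential_num_thunks_threshold = 8;
};

// buffer_sizes_per_thunk[i] holds the sizes of all buffers used by thunk i.
bool PreferSequentialExecution(
    const std::vector<std::vector<size_t>>& buffer_sizes_per_thunk,
    const SketchOptions& options) {
  // All thunks use only buffers at or below the size threshold.
  bool small_buffers = std::all_of(
      buffer_sizes_per_thunk.begin(), buffer_sizes_per_thunk.end(),
      [&](const std::vector<size_t>& sizes) {
        return std::all_of(sizes.begin(), sizes.end(), [&](size_t size) {
          return size <= options.execute_sequential_buffer_threshold;
        });
      });
  // The sequence is short enough that async overheads would dominate.
  bool few_thunks = buffer_sizes_per_thunk.size() <=
                    options.execute_sequential_num_thunks_threshold;
  return small_buffers || few_thunks;
}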
11 changes: 8 additions & 3 deletions xla/backends/cpu/runtime/thunk_executor.h
@@ -42,11 +42,16 @@ namespace internal {
// Clang does not allow defining a nested struct with member initializer, as
// a workaround we define a struct in internal namespace and create an alias.
struct ThunkExecutorOptions {
// If all thunks in a sequence use buffers of size less than or equal to
// `execute_sequential_buffer_threshold`, we mark execution as sequential, as
// concurrency overheads will likely dominate the overall execution time.
// If all thunks in a sequence use buffers of size less than or equal to the
// given threshold, we mark execution as sequential, as concurrency overheads
// will likely dominate the overall execution time.
size_t execute_sequential_buffer_threshold = 512;

// If thunk sequence length is less than or equal to the given threshold, we
// mark execution as sequential, as concurrency overheads will likely dominate
// the overall execution time.
size_t execute_sequential_num_thunks_threshold = 8;

// Use priority ready queue to execute nodes according to their priority. By
// default we use FIFO ready queue.
bool use_priority_ready_queue = false;
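Callers that construct executor options directly can tune the new field alongside the existing buffer threshold. A minimal sketch, assuming the options are passed to the executor at construction time as in the tests below (the exact plumbing elsewhere in XLA may differ):

ThunkExecutor::Options options;
options.execute_sequential_buffer_threshold = 512;     // keep the default
options.execute_sequential_num_thunks_threshold = 16;  // run up to 16 thunks sequentially
// The executor is then created from a thunk sequence and these options,
// e.g. (hypothetical call site):
// TF_ASSIGN_OR_RETURN(auto executor,
//                     ThunkExecutor::Create(std::move(thunks), options));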
6 changes: 2 additions & 4 deletions xla/backends/cpu/runtime/thunk_executor_test.cc
@@ -213,10 +213,8 @@ AddI32Thunk::ResourceUses AddI32Thunk::resource_uses() const {
}

static ThunkExecutor::Options OptionsForTest() {
// Override small buffers threshold to make sure that we test all execution
// paths, because in test we always use small buffers below the default
// threshold of `512`.
return ThunkExecutor::Options{/*execute_sequential_buffer_threshold=*/0};
return ThunkExecutor::Options{/*execute_sequential_buffer_threshold=*/0,
/*execute_sequential_num_thunks_threshold=*/0};
}

TEST(ThunkExecutorTest, FifoReadyQueueTest) {
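Zeroing both thresholds keeps the tests exercising the concurrent execution path, since the test thunks use buffers well below the default 512-byte threshold and the test sequences are short. A hypothetical counterpart that instead forces the sequential path (names are illustrative, not part of this commit) could look like:

#include <limits>

static ThunkExecutor::Options SequentialOptionsForTest() {
  // Maximal thresholds make every sequence qualify as "small", so the
  // executor always picks the sequential execution strategy.
  return ThunkExecutor::Options{
      /*execute_sequential_buffer_threshold=*/std::numeric_limits<size_t>::max(),
      /*execute_sequential_num_thunks_threshold=*/std::numeric_limits<size_t>::max()};
}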
