[xla:cpu] Prefer sequential execution for small thunk sequences
PiperOrigin-RevId: 678337715
ezhulenev authored and Google-ML-Automation committed Sep 27, 2024
1 parent b7dbeb6 commit 0a31a4d
Showing 3 changed files with 20 additions and 10 deletions.
13 changes: 10 additions & 3 deletions xla/backends/cpu/runtime/thunk_executor.cc
@@ -60,6 +60,7 @@ ThunkExecutor::ThunkExecutor(ThunkSequence thunk_sequence,
sink_.push_back(i);
}
}

// Erase redundant edges between nodes.
int64_t num_erased_edges = RunTransitiveReductionAndUpdatePriorities();

@@ -69,7 +70,7 @@ ThunkExecutor::ThunkExecutor(ThunkSequence thunk_sequence,
is_sequential_ &= (absl::c_count(nodes_defs_[i].in_edges, i - 1) != 0);
}

// Maybe mark execution as sequential if all thunks use small buffers.
// Prefer sequential execution if all thunks use small buffers.
auto uses_small_buffers = [&](const std::unique_ptr<Thunk>& thunk) {
return absl::c_all_of(thunk->buffer_uses(), [&](const BufferUse& use) {
return use.slice().size() <= options.execute_sequential_buffer_threshold;
@@ -79,6 +80,10 @@ ThunkExecutor::ThunkExecutor(ThunkSequence thunk_sequence,
bool small_buffers = absl::c_all_of(thunk_sequence_, uses_small_buffers);
is_sequential_ |= small_buffers;

// Prefer sequential execution for small thunk sequences.
is_sequential_ |=
thunk_sequence_.size() <= options.execute_sequential_num_thunks_threshold;

VLOG(2) << absl::StreamFormat(
"Constructed ThunkExecutor with %d nodes: #source_nodes=%d "
"#sink_nodes=%d, #erased_edges=%d, is_sequential=%v, small_buffers=%v",
@@ -159,8 +164,10 @@ tsl::AsyncValueRef<ThunkExecutor::ExecuteEvent> ThunkExecutor::Execute(
return thunk_sequence_[0]->Execute(params);
}

// If thunk sequence dependencies form a sequential execution graph, we skip
// expensive async execution and simply run thunks one by one.
// When we choose sequential execution strategy (we rely on heuristics and
// a cost model to make the decision), we skip expensive async execution and
// simply run thunks one by one. This minimizes runtime overheads from small
// XLA programs with many cheap operations.
if (is_sequential_) {
return ExecuteSequential(params);
}
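The decision logic above can be summarized with a small standalone sketch (illustrative only, not code from this commit): sequential execution is preferred when the thunk graph is already a chain, when every thunk touches only small buffers, or when the sequence itself is short.

// Illustrative sketch of the heuristic, assuming simplified stand-in types;
// the real ThunkExecutor works on Thunk::buffer_uses() and BufferUse slices.
#include <algorithm>
#include <cstddef>
#include <vector>

struct SketchOptions {
  size_t execute_sequential_buffer_threshold = 512;
  size_t execute_sequential_num_thunks_threshold = 8;
};

// buffer_sizes_per_thunk[i] holds the sizes of all buffers used by thunk i.
bool PreferSequentialExecution(
    const std::vector<std::vector<size_t>>& buffer_sizes_per_thunk,
    const SketchOptions& options) {
  // All thunks use only buffers at or below the size threshold.
  bool small_buffers = std::all_of(
      buffer_sizes_per_thunk.begin(), buffer_sizes_per_thunk.end(),
      [&](const std::vector<size_t>& sizes) {
        return std::all_of(sizes.begin(), sizes.end(), [&](size_t size) {
          return size <= options.execute_sequential_buffer_threshold;
        });
      });
  // The sequence is short enough that async overheads would dominate.
  bool few_thunks = buffer_sizes_per_thunk.size() <=
                    options.execute_sequential_num_thunks_threshold;
  return small_buffers || few_thunks;
}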
11 changes: 8 additions & 3 deletions xla/backends/cpu/runtime/thunk_executor.h
@@ -42,11 +42,16 @@ namespace internal {
// Clang does not allow defining a nested struct with member initializer, as
// a workaround we define a struct in internal namespace and create an alias.
struct ThunkExecutorOptions {
// If all thunks in a sequence use buffers of size less than or equal to
// `execute_sequential_buffer_threshold`, we mark execution as sequential, as
// concurrency overheads will likely dominate the overall execution time.
// If all thunks in a sequence use buffers of size less than or equal to the
// given threshold, we mark execution as sequential, as concurrency overheads
// will likely dominate the overall execution time.
size_t execute_sequential_buffer_threshold = 512;

// If thunk sequence length is less than or equal to the given threshold, we
// mark execution as sequential, as concurrency overheads will likely dominate
// the overall execution time.
size_t execute_sequential_num_thunks_threshold = 8;

// Use priority ready queue to execute nodes according to their priority. By
// default we use FIFO ready queue.
bool use_priority_ready_queue = false;
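Callers that construct executor options directly can tune the new field alongside the existing buffer threshold. A minimal sketch, assuming the options are passed to the executor at construction time as in the tests below (the exact plumbing elsewhere in XLA may differ):

ThunkExecutor::Options options;
options.execute_sequential_buffer_threshold = 512;     // keep the default
options.execute_sequential_num_thunks_threshold = 16;  // run up to 16 thunks sequentially
// The executor is then created from a thunk sequence and these options,
// e.g. (hypothetical call site):
// TF_ASSIGN_OR_RETURN(auto executor,
//                     ThunkExecutor::Create(std::move(thunks), options));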
6 changes: 2 additions & 4 deletions xla/backends/cpu/runtime/thunk_executor_test.cc
@@ -213,10 +213,8 @@ AddI32Thunk::ResourceUses AddI32Thunk::resource_uses() const {
}

static ThunkExecutor::Options OptionsForTest() {
// Override small buffers threshold to make sure that we test all execution
// paths, because in test we always use small buffers below the default
// threshold of `512`.
return ThunkExecutor::Options{/*execute_sequential_buffer_threshold=*/0};
return ThunkExecutor::Options{/*execute_sequential_buffer_threshold=*/0,
/*execute_sequential_num_thunks_threshold=*/0};
}

TEST(ThunkExecutorTest, FifoReadyQueueTest) {
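Zeroing both thresholds keeps the tests exercising the concurrent execution path, since the test thunks use buffers well below the default 512-byte threshold and the test sequences are short. A hypothetical counterpart that instead forces the sequential path (names are illustrative, not part of this commit) could look like:

#include <limits>

static ThunkExecutor::Options SequentialOptionsForTest() {
  // Maximal thresholds make every sequence qualify as "small", so the
  // executor always picks the sequential execution strategy.
  return ThunkExecutor::Options{
      /*execute_sequential_buffer_threshold=*/std::numeric_limits<size_t>::max(),
      /*execute_sequential_num_thunks_threshold=*/std::numeric_limits<size_t>::max()};
}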
