From 5b2cb64ab88aad57354c7b0f47b11ca9ae8dd73b Mon Sep 17 00:00:00 2001 From: bjacob Date: Fri, 17 Nov 2023 16:40:29 -0500 Subject: [PATCH] Fix intermittent failure - functions with `_try_` in their name may fail spuriously. (#15636) Fixes #15488. This is the fix discussed there. Hat tip to @RoboTux and @freddan80 for the tenacious debugging - we certainly wouldn't have been able to complete this without your help!! --- .../src/iree/base/internal/synchronization.h | 3 ++ runtime/src/iree/task/BUILD.bazel | 1 + runtime/src/iree/task/CMakeLists.txt | 1 + runtime/src/iree/task/queue.h | 12 ++++-- runtime/src/iree/task/queue_test.cc | 41 +++++++++++++------ 5 files changed, 43 insertions(+), 15 deletions(-) diff --git a/runtime/src/iree/base/internal/synchronization.h b/runtime/src/iree/base/internal/synchronization.h index e26b9b4484fa..ab8dfde3cf0b 100644 --- a/runtime/src/iree/base/internal/synchronization.h +++ b/runtime/src/iree/base/internal/synchronization.h @@ -317,6 +317,9 @@ void iree_slim_mutex_lock(iree_slim_mutex_t* mutex) IREE_THREAD_ANNOTATION_ATTRIBUTE(acquire_capability(mutex)); // Tries to lock the |mutex| and returns true if the caller holds the lock. +// +// This function is allowed to fail spuriously, i.e. even if the lock isn't +// held by another thread. bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex) IREE_THREAD_ANNOTATION_ATTRIBUTE(try_acquire_capability(true, mutex)); diff --git a/runtime/src/iree/task/BUILD.bazel b/runtime/src/iree/task/BUILD.bazel index 78df154deb72..4995d131e938 100644 --- a/runtime/src/iree/task/BUILD.bazel +++ b/runtime/src/iree/task/BUILD.bazel @@ -142,6 +142,7 @@ iree_runtime_cc_test( deps = [ ":task", "//runtime/src/iree/base", + "//runtime/src/iree/base/internal:threading", "//runtime/src/iree/task/testing:test_util", "//runtime/src/iree/testing:gtest", "//runtime/src/iree/testing:gtest_main", diff --git a/runtime/src/iree/task/CMakeLists.txt b/runtime/src/iree/task/CMakeLists.txt index 5c726052c28d..8501997b4746 100644 --- a/runtime/src/iree/task/CMakeLists.txt +++ b/runtime/src/iree/task/CMakeLists.txt @@ -143,6 +143,7 @@ iree_cc_test( DEPS ::task iree::base + iree::base::internal::threading iree::task::testing::test_util iree::testing::gtest iree::testing::gtest_main diff --git a/runtime/src/iree/task/queue.h b/runtime/src/iree/task/queue.h index 917b872c97a3..809ac33177d9 100644 --- a/runtime/src/iree/task/queue.h +++ b/runtime/src/iree/task/queue.h @@ -149,9 +149,15 @@ iree_task_t* iree_task_queue_flush_from_lifo_slist( iree_task_t* iree_task_queue_pop_front(iree_task_queue_t* queue); // Tries to steal up to |max_tasks| from the back of the queue. -// Returns NULL if no tasks are available and otherwise up to |max_tasks| tasks -// that were at the tail of the |source_queue| will be moved to the -// |target_queue| and the first of the stolen tasks is returned. +// +// On success, up to |max_tasks| tasks that were at the tail of the +// |source_queue| will be moved to the |target_queue| and the first of the +// stolen tasks is returned. +// +// On failure, NULL is returned. +// +// This function is allowed to fail spuriously, i.e. even if there are +// tasks to steal. // // It's expected this is not called from the queue's owning worker, though it's // valid to do so. diff --git a/runtime/src/iree/task/queue_test.cc b/runtime/src/iree/task/queue_test.cc index 53342fd4b899..88116e3fb16f 100644 --- a/runtime/src/iree/task/queue_test.cc +++ b/runtime/src/iree/task/queue_test.cc @@ -6,8 +6,25 @@ #include "iree/task/queue.h" +#include "iree/base/internal/threading.h" #include "iree/testing/gtest.h" +// Like iree_task_queue_try_steal but retries until success. +// This is used in this test as iree_task_queue_try_steal may (rarely) fail even +// in simple single-threaded tests, see #15488. +static iree_task_t* iree_task_queue_try_steal_until_success( + iree_task_queue_t* source_queue, iree_task_queue_t* target_queue, + iree_host_size_t max_tasks) { + while (true) { + iree_task_t* task = + iree_task_queue_try_steal(source_queue, target_queue, max_tasks); + if (task) { + return task; + } + iree_thread_yield(); + } +} + namespace { TEST(QueueTest, Lifetime) { @@ -186,8 +203,8 @@ TEST(QueueTest, TryStealEmpty) { iree_task_t task_c = {0}; iree_task_queue_push_front(&source_queue, &task_c); - EXPECT_EQ(&task_a, - iree_task_queue_try_steal(&source_queue, &target_queue, 1)); + EXPECT_EQ(&task_a, iree_task_queue_try_steal_until_success(&source_queue, + &target_queue, 1)); iree_task_queue_deinitialize(&source_queue); iree_task_queue_deinitialize(&target_queue); @@ -202,8 +219,8 @@ TEST(QueueTest, TryStealLast) { iree_task_t task_a = {0}; iree_task_queue_push_front(&source_queue, &task_a); - EXPECT_EQ(&task_a, - iree_task_queue_try_steal(&source_queue, &target_queue, 100)); + EXPECT_EQ(&task_a, iree_task_queue_try_steal_until_success( + &source_queue, &target_queue, 100)); EXPECT_TRUE(iree_task_queue_is_empty(&target_queue)); EXPECT_TRUE(iree_task_queue_is_empty(&source_queue)); @@ -224,8 +241,8 @@ TEST(QueueTest, TrySteal1) { iree_task_queue_push_front(&source_queue, &task_b); iree_task_queue_push_front(&source_queue, &task_a); - EXPECT_EQ(&task_c, - iree_task_queue_try_steal(&source_queue, &target_queue, 1)); + EXPECT_EQ(&task_c, iree_task_queue_try_steal_until_success(&source_queue, + &target_queue, 1)); EXPECT_TRUE(iree_task_queue_is_empty(&target_queue)); EXPECT_EQ(&task_a, iree_task_queue_pop_front(&source_queue)); @@ -250,8 +267,8 @@ TEST(QueueTest, TryStealIntoExisting) { iree_task_t task_existing = {0}; iree_task_queue_push_front(&target_queue, &task_existing); - EXPECT_EQ(&task_existing, - iree_task_queue_try_steal(&source_queue, &target_queue, 1)); + EXPECT_EQ(&task_existing, iree_task_queue_try_steal_until_success( + &source_queue, &target_queue, 1)); EXPECT_EQ(&task_a, iree_task_queue_pop_front(&source_queue)); EXPECT_TRUE(iree_task_queue_is_empty(&source_queue)); @@ -278,8 +295,8 @@ TEST(QueueTest, TryStealMany) { iree_task_queue_push_front(&source_queue, &task_b); iree_task_queue_push_front(&source_queue, &task_a); - EXPECT_EQ(&task_c, - iree_task_queue_try_steal(&source_queue, &target_queue, 2)); + EXPECT_EQ(&task_c, iree_task_queue_try_steal_until_success(&source_queue, + &target_queue, 2)); EXPECT_EQ(&task_d, iree_task_queue_pop_front(&target_queue)); EXPECT_TRUE(iree_task_queue_is_empty(&target_queue)); @@ -306,8 +323,8 @@ TEST(QueueTest, TryStealAll) { iree_task_queue_push_front(&source_queue, &task_b); iree_task_queue_push_front(&source_queue, &task_a); - EXPECT_EQ(&task_c, - iree_task_queue_try_steal(&source_queue, &target_queue, 1000)); + EXPECT_EQ(&task_c, iree_task_queue_try_steal_until_success( + &source_queue, &target_queue, 1000)); EXPECT_EQ(&task_d, iree_task_queue_pop_front(&target_queue)); EXPECT_TRUE(iree_task_queue_is_empty(&target_queue));