From cb6c262a190ab5a1c71a5dd45ec267e237f1f0ef Mon Sep 17 00:00:00 2001 From: Sergey Pershin Date: Wed, 25 Sep 2024 12:05:01 -0700 Subject: [PATCH] Change locaton of TableScan::getOutput() 'should exit' check. (#11085) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11085 In some queries the eval can take considerable time inside DataSource::next which is being called in the infinite loop in TableScan::getOutput(). We've seen cases (with 100K character regex) when driver thread spends 20+ seconds on 1600 rows. We had a check that would force TableScan::getOutput() exit to avoid spending too much time executing w/o exiting. But this check is under the if (needNewSplit_) {} condition and might not be called for quite a long time. This is what we've observed in our corner case query. After a while the operator (driver) is considered to be stuck and then worker can get taken off cluster and query can get cancelled, while nothing is really stuck. In additon the query will hog multiple execution threads and can starve other queries/tasks. This change is putting the 'should exit' check in the beginning of the loop, so we call it no matter if we need a new split or not. Reviewed By: Yuhta Differential Revision: D63363269 fbshipit-source-id: f943a904fb0725843bf5a0f7e755e782c247ba45 --- velox/exec/TableScan.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/velox/exec/TableScan.cpp b/velox/exec/TableScan.cpp index 1a61ba9be667..f93d4a718afb 100644 --- a/velox/exec/TableScan.cpp +++ b/velox/exec/TableScan.cpp @@ -82,22 +82,22 @@ RowVectorPtr TableScan::getOutput() { curStatus_ = "getOutput: enter"; const auto startTimeMs = getCurrentTimeMs(); for (;;) { - if (needNewSplit_) { - // Check if our Task needs us to yield or we've been running for too long - // w/o producing a result. In this case we return with the Yield blocking - // reason and an already fulfilled future. - curStatus_ = "getOutput: task->shouldStop"; - const StopReason taskStopReason = driverCtx_->task->shouldStop(); - if (shouldStop(taskStopReason) || - shouldYield(taskStopReason, startTimeMs)) { - blockingReason_ = BlockingReason::kYield; - blockingFuture_ = ContinueFuture{folly::Unit{}}; - // A point for test code injection. - TestValue::adjust( - "facebook::velox::exec::TableScan::getOutput::yield", this); - return nullptr; - } + // Check if our Task needs us to yield or we've been running for too long + // w/o producing a result. In this case we return with the Yield blocking + // reason and an already fulfilled future. + curStatus_ = "getOutput: task->shouldStop"; + const StopReason taskStopReason = driverCtx_->task->shouldStop(); + if (shouldStop(taskStopReason) || + shouldYield(taskStopReason, startTimeMs)) { + blockingReason_ = BlockingReason::kYield; + blockingFuture_ = ContinueFuture{folly::Unit{}}; + // A point for test code injection. + TestValue::adjust( + "facebook::velox::exec::TableScan::getOutput::yield", this); + return nullptr; + } + if (needNewSplit_) { // A point for test code injection. TestValue::adjust("facebook::velox::exec::TableScan::getOutput", this);