Skip to content

Commit

Permalink
YQ-3405 fixed endless retries for external error (ydb-platform#6626)
Browse files Browse the repository at this point in the history
  • Loading branch information
GrigoriyPA committed Jul 16, 2024
1 parent d52cc84 commit 8c6096d
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 7 deletions.
6 changes: 5 additions & 1 deletion ydb/core/fq/libs/config/protos/control_plane_storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,15 @@ message TQueryMapping {

// 1. StatusCode(s) are handled with defined policies, non-unique StatusCode(s) across all policies is UB
// 2. RetryCount and RetryPeriodMs are used to calculate actual RetryRate, if it exceeds RetryCount, query is aborted
// - Number of retries during RetryPeriod time less than 2 * RetryCount due to RetryRate
// 3. BackoffPeriodMs is factor of RetryRate to delay query execution before next retry
// 4. There are no default retry policy, all unhandled statuses are fatal
// 4. RetryLimit is hard limit for amount query retry count, after that query is aborted
// - If RetryLimit = 0, query can be abborted only by RetryRate
// 5. There are no default retry policy, all unhandled statuses are fatal

message TRetryPolicy {
uint64 RetryCount = 1;
uint64 RetryLimit = 4;
string RetryPeriod = 2;
string BackoffPeriod = 3;
}
Expand Down
3 changes: 2 additions & 1 deletion ydb/core/fq/libs/control_plane_storage/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,11 @@ TControlPlaneStorageConfig::TControlPlaneStorageConfig(const NConfig::TControlPl
for (const auto& mapping : Proto.GetRetryPolicyMapping()) {
auto& retryPolicy = mapping.GetPolicy();
auto retryCount = retryPolicy.GetRetryCount();
auto retryLimit = retryPolicy.GetRetryLimit();
auto retryPeriod = GetDuration(retryPolicy.GetRetryPeriod(), TDuration::Hours(1));
auto backoffPeriod = GetDuration(retryPolicy.GetBackoffPeriod(), TDuration::Zero());
for (const auto statusCode: mapping.GetStatusCode()) {
RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryPeriod, backoffPeriod));
RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryLimit, retryPeriod, backoffPeriod));
}
}

Expand Down
4 changes: 2 additions & 2 deletions ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ TPingTaskParams ConstructHardPingTask(
internal.clear_operation_id();
}

TRetryPolicyItem policy(0, TDuration::Seconds(1), TDuration::Zero());
TRetryPolicyItem policy(0, 0, TDuration::Seconds(1), TDuration::Zero());
auto it = retryPolicies.find(request.status_code());
auto policyFound = it != retryPolicies.end();
if (policyFound) {
Expand All @@ -183,7 +183,7 @@ TPingTaskParams ConstructHardPingTask(
TStringBuilder builder;
builder << "Query failed with code " << NYql::NDqProto::StatusIds_StatusCode_Name(request.status_code());
if (policy.RetryCount) {
builder << " (failure rate " << retryLimiter.RetryRate << " exceeds limit of " << policy.RetryCount << ")";
builder << " (" << retryLimiter.LastError << ")";
}
builder << " at " << Now();

Expand Down
12 changes: 11 additions & 1 deletion ydb/core/fq/libs/control_plane_storage/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,16 @@ bool TRetryLimiter::UpdateOnRetry(const TInstant& lastSeenAt, const TRetryPolicy
RetryRate = 0.0;
}
}
bool shouldRetry = RetryRate < policy.RetryCount;

bool shouldRetry = true;
if (RetryRate >= policy.RetryCount) {
shouldRetry = false;
LastError = TStringBuilder() << "failure rate " << RetryRate << " exceeds limit of " << policy.RetryCount;
} else if (policy.RetryLimit && RetryCount >= policy.RetryLimit) {
shouldRetry = false;
LastError = TStringBuilder() << "retry count reached limit of " << policy.RetryLimit;
}

if (shouldRetry) {
RetryCount++;
RetryCounterUpdatedAt = now;
Expand Down Expand Up @@ -140,6 +149,7 @@ NConfig::TControlPlaneStorageConfig FillDefaultParameters(NConfig::TControlPlane
policyMapping.AddStatusCode(NYql::NDqProto::StatusIds::EXTERNAL_ERROR);
auto& policy = *policyMapping.MutablePolicy();
policy.SetRetryCount(10);
policy.SetRetryLimit(40);
policy.SetRetryPeriod("1m");
policy.SetBackoffPeriod("1s");
}
Expand Down
6 changes: 4 additions & 2 deletions ydb/core/fq/libs/control_plane_storage/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ namespace NFq {
class TRetryPolicyItem {
public:
TRetryPolicyItem() = default;
TRetryPolicyItem(ui64 retryCount, const TDuration& retryPeriod, const TDuration& backoffPeriod)
: RetryCount(retryCount), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
TRetryPolicyItem(ui64 retryCount, ui64 retryLimit, const TDuration& retryPeriod, const TDuration& backoffPeriod)
: RetryCount(retryCount), RetryLimit(retryLimit), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
{ }
ui64 RetryCount = 0;
ui64 RetryLimit = 0;
TDuration RetryPeriod = TDuration::Zero();
TDuration BackoffPeriod = TDuration::Zero();
};
Expand All @@ -32,6 +33,7 @@ class TRetryLimiter {
ui64 RetryCount = 0;
TInstant RetryCounterUpdatedAt = TInstant::Zero();
double RetryRate = 0.0;
TString LastError;
};

bool IsTerminalStatus(FederatedQuery::QueryMeta::ComputeStatus status);
Expand Down

0 comments on commit 8c6096d

Please sign in to comment.