From c1ccdda7b8d83593bc9df402be2a3f159a2cf0d1 Mon Sep 17 00:00:00 2001 From: StekPerepolnen Date: Thu, 20 Jun 2024 13:58:46 +0000 Subject: [PATCH 1/3] move time difference up --- ydb/core/health_check/health_check.cpp | 48 ++++++++++++++++---------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 8851c2697b57..be810bd3386f 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -2579,6 +2579,7 @@ class TSelfCheckRequest : public TActorBootstrapped { databaseStatus.set_name(path); FillCompute(state, *databaseStatus.mutable_compute(), {&dbContext, "COMPUTE"}); FillStorage(state, *databaseStatus.mutable_storage(), {&dbContext, "STORAGE"}); + FillTimeDifference(state, {&dbContext, "NODES_TIME_DIFFERENCE"}); if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN && databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) { dbContext.ReportStatus(MaxStatus(databaseStatus.compute().overall(), databaseStatus.storage().overall()), @@ -2599,38 +2600,50 @@ class TSelfCheckRequest : public TActorBootstrapped { const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000); const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000); - void FillNodesSyncStatus(TOverallStateContext& context) { + void FillTimeDifference(TDatabaseState& databaseState, TSelfCheckContext context) { long maxClockSkewUs = 0; TNodeId maxClockSkewPeerId = 0; TNodeId maxClockSkewNodeId = 0; - for (auto& [nodeId, nodeSystemState] : MergedNodeSystemState) { - if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(nodeSystemState->GetMaxClockSkewPeerId()) - && abs(nodeSystemState->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) { - maxClockSkewUs = abs(nodeSystemState->GetMaxClockSkewWithPeerUs()); - maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId(); - maxClockSkewNodeId = nodeId; + + TVector* computeNodeIds = &databaseState.ComputeNodeIds; + if (databaseState.ResourcePathId + && databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive) + { + auto itDatabase = FilterDomainKey.find(TSubDomainKey(databaseState.ResourcePathId.OwnerId, databaseState.ResourcePathId.LocalPathId)); + if (itDatabase != FilterDomainKey.end()) { + const TString& sharedDatabaseName = itDatabase->second; + TDatabaseState& sharedDatabase = DatabaseState[sharedDatabaseName]; + computeNodeIds = &sharedDatabase.ComputeNodeIds; + } + } + + for (TNodeId nodeId : *computeNodeIds) { + auto itNodeSystemState = MergedNodeSystemState.find(nodeId); + if (itNodeSystemState != MergedNodeSystemState.end()) { + if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(itNodeSystemState->second->GetMaxClockSkewPeerId()) + && abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) { + maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()); + maxClockSkewPeerId = itNodeSystemState->second->GetMaxClockSkewPeerId(); + maxClockSkewNodeId = nodeId; + } } } + if (!maxClockSkewNodeId) { return; } - TSelfCheckResult syncContext; - syncContext.Type = "NODES_TIME_DIFFERENCE"; - FillNodeInfo(maxClockSkewNodeId, syncContext.Location.mutable_node()); - FillNodeInfo(maxClockSkewPeerId, syncContext.Location.mutable_peer()); + FillNodeInfo(maxClockSkewNodeId, context.Location.mutable_node()); + FillNodeInfo(maxClockSkewPeerId, context.Location.mutable_peer()); TDuration maxClockSkewTime = TDuration::MicroSeconds(maxClockSkewUs); if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) { - syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); + context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); } else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) { - syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); + context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); } else { - syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); + context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); } - - context.UpdateMaxStatus(syncContext.GetOverallStatus()); - context.AddIssues(syncContext.IssueRecords); } void FillResult(TOverallStateContext context) { @@ -2641,7 +2654,6 @@ class TSelfCheckRequest : public TActorBootstrapped { FillDatabaseResult(context, path, state); } } - FillNodesSyncStatus(context); if (DatabaseState.empty()) { Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status()); TSelfCheckResult tabletContext; From 0dc0c27555617e6dbb2280697c1733d557aff7a6 Mon Sep 17 00:00:00 2001 From: StekPerepolnen Date: Sun, 23 Jun 2024 12:18:36 +0000 Subject: [PATCH 2/3] move time difference issue --- ydb/core/health_check/health_check.cpp | 18 +++++++++++++----- ydb/public/api/protos/ydb_monitoring.proto | 8 ++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index be810bd3386f..fd08bff710e1 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -2579,15 +2579,17 @@ class TSelfCheckRequest : public TActorBootstrapped { databaseStatus.set_name(path); FillCompute(state, *databaseStatus.mutable_compute(), {&dbContext, "COMPUTE"}); FillStorage(state, *databaseStatus.mutable_storage(), {&dbContext, "STORAGE"}); - FillTimeDifference(state, {&dbContext, "NODES_TIME_DIFFERENCE"}); + FillTimeDifference(state, *databaseStatus.mutable_time_difference(), {&dbContext, "NODES_TIME_DIFFERENCE"}); if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN && databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) { dbContext.ReportStatus(MaxStatus(databaseStatus.compute().overall(), databaseStatus.storage().overall()), - "Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState}); + "Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState, ETags::SyncState }); } else if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN) { - dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState}); + dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState, ETags::SyncState}); } else if (databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) { - dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState}); + dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState, ETags::SyncState}); + } else if (databaseStatus.time_difference().overall() != Ydb::Monitoring::StatusFlag::GREEN) { + dbContext.ReportStatus(databaseStatus.time_difference().overall(), "Database has time difference issues", ETags::DBState, {ETags::SyncState}); } databaseStatus.set_overall(dbContext.GetOverallStatus()); context.UpdateMaxStatus(dbContext.GetOverallStatus()); @@ -2600,7 +2602,7 @@ class TSelfCheckRequest : public TActorBootstrapped { const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000); const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000); - void FillTimeDifference(TDatabaseState& databaseState, TSelfCheckContext context) { + void FillTimeDifference(TDatabaseState& databaseState, Ydb::Monitoring::TimeDifferenceStatus& timeDifferenceStatus, TSelfCheckContext context) { long maxClockSkewUs = 0; TNodeId maxClockSkewPeerId = 0; TNodeId maxClockSkewNodeId = 0; @@ -2630,6 +2632,7 @@ class TSelfCheckRequest : public TActorBootstrapped { } if (!maxClockSkewNodeId) { + timeDifferenceStatus.set_overall(Ydb::Monitoring::StatusFlag::GREEN); return; } @@ -2644,6 +2647,11 @@ class TSelfCheckRequest : public TActorBootstrapped { } else { context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); } + + timeDifferenceStatus.set_node(ToString(maxClockSkewNodeId)); + timeDifferenceStatus.set_peer(ToString(maxClockSkewPeerId)); + timeDifferenceStatus.set_max_difference_ms(maxClockSkewTime.MilliSeconds()); + timeDifferenceStatus.set_overall(context.GetOverallStatus()); } void FillResult(TOverallStateContext context) { diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto index dd99eb583f29..540a2c100fe6 100644 --- a/ydb/public/api/protos/ydb_monitoring.proto +++ b/ydb/public/api/protos/ydb_monitoring.proto @@ -122,6 +122,13 @@ message ComputeStatus { float shards_quota_usage = 5; } +message TimeDifferenceStatus { + StatusFlag.Status overall = 1; + int64 max_difference_ms = 2; + string node = 3; + string peer = 4; +} + message LocationNode { uint32 id = 1; string host = 2; @@ -198,6 +205,7 @@ message DatabaseStatus { StatusFlag.Status overall = 2; StorageStatus storage = 3; ComputeStatus compute = 4; + TimeDifferenceStatus time_difference = 5; } message SelfCheckResult { From 5e8f29df5bb0801b1eef9ef2daaeb260b39cbd21 Mon Sep 17 00:00:00 2001 From: StekPerepolnen Date: Fri, 28 Jun 2024 16:21:14 +0000 Subject: [PATCH 3/3] move time_difference to compute_node --- ydb/core/health_check/health_check.cpp | 106 +++++++++------------ ydb/public/api/protos/ydb_monitoring.proto | 16 ++-- 2 files changed, 54 insertions(+), 68 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index fd08bff710e1..46460e4de841 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -1450,7 +1450,7 @@ class TSelfCheckRequest : public TActorBootstrapped { } } - void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { + void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference) { FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node()); TSelfCheckContext rrContext(&context, "NODE_UPTIME"); @@ -1488,6 +1488,34 @@ class TSelfCheckRequest : public TActorBootstrapped { } loadAverageStatus.set_overall(laContext.GetOverallStatus()); } + + if (nodeSystemState.HasMaxClockSkewPeerId()) { + TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId(); + long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs(); + TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs)); + Ydb::Monitoring::StatusFlag::Status status; + if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) { + status = Ydb::Monitoring::StatusFlag::ORANGE; + } else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) { + status = Ydb::Monitoring::StatusFlag::YELLOW; + } else { + status = Ydb::Monitoring::StatusFlag::GREEN; + } + + computeNodeStatus.mutable_max_time_difference()->set_peer(ToString(peerId)); + computeNodeStatus.mutable_max_time_difference()->set_difference_ms(timeDifferenceDuration.MilliSeconds()); + computeNodeStatus.set_overall(status); + + if (reportTimeDifference) { + TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE"); + FillNodeInfo(peerId, tdContext.Location.mutable_compute()->mutable_peer()); + if (status == Ydb::Monitoring::StatusFlag::GREEN) { + tdContext.ReportStatus(status); + } else { + tdContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds() << " ms", ETags::SyncState); + } + } + } } else { // context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, // TStringBuilder() << "Compute node is not available", @@ -1552,14 +1580,27 @@ class TSelfCheckRequest : public TActorBootstrapped { if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) { context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState}); } + long maxClockSkewUs = 0; + TNodeId maxClockSkewNodeId = 0; + for (TNodeId nodeId : *computeNodeIds) { + auto itNodeSystemState = MergedNodeSystemState.find(nodeId); + if (itNodeSystemState != MergedNodeSystemState.end()) { + if (std::count(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) > 0 + && abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) { + maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()); + maxClockSkewNodeId = nodeId; + } + } + } for (TNodeId nodeId : *computeNodeIds) { auto& computeNode = *computeStatus.add_nodes(); - FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}); + FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}, maxClockSkewNodeId == nodeId); } FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"}); context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime}); context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState}); context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage}); + context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState}); Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN; computeNodeIds->push_back(0); // for tablets without node for (TNodeId nodeId : *computeNodeIds) { @@ -2579,17 +2620,14 @@ class TSelfCheckRequest : public TActorBootstrapped { databaseStatus.set_name(path); FillCompute(state, *databaseStatus.mutable_compute(), {&dbContext, "COMPUTE"}); FillStorage(state, *databaseStatus.mutable_storage(), {&dbContext, "STORAGE"}); - FillTimeDifference(state, *databaseStatus.mutable_time_difference(), {&dbContext, "NODES_TIME_DIFFERENCE"}); if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN && databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) { dbContext.ReportStatus(MaxStatus(databaseStatus.compute().overall(), databaseStatus.storage().overall()), - "Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState, ETags::SyncState }); + "Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState}); } else if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN) { - dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState, ETags::SyncState}); + dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState}); } else if (databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) { - dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState, ETags::SyncState}); - } else if (databaseStatus.time_difference().overall() != Ydb::Monitoring::StatusFlag::GREEN) { - dbContext.ReportStatus(databaseStatus.time_difference().overall(), "Database has time difference issues", ETags::DBState, {ETags::SyncState}); + dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState}); } databaseStatus.set_overall(dbContext.GetOverallStatus()); context.UpdateMaxStatus(dbContext.GetOverallStatus()); @@ -2602,58 +2640,6 @@ class TSelfCheckRequest : public TActorBootstrapped { const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000); const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000); - void FillTimeDifference(TDatabaseState& databaseState, Ydb::Monitoring::TimeDifferenceStatus& timeDifferenceStatus, TSelfCheckContext context) { - long maxClockSkewUs = 0; - TNodeId maxClockSkewPeerId = 0; - TNodeId maxClockSkewNodeId = 0; - - TVector* computeNodeIds = &databaseState.ComputeNodeIds; - if (databaseState.ResourcePathId - && databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive) - { - auto itDatabase = FilterDomainKey.find(TSubDomainKey(databaseState.ResourcePathId.OwnerId, databaseState.ResourcePathId.LocalPathId)); - if (itDatabase != FilterDomainKey.end()) { - const TString& sharedDatabaseName = itDatabase->second; - TDatabaseState& sharedDatabase = DatabaseState[sharedDatabaseName]; - computeNodeIds = &sharedDatabase.ComputeNodeIds; - } - } - - for (TNodeId nodeId : *computeNodeIds) { - auto itNodeSystemState = MergedNodeSystemState.find(nodeId); - if (itNodeSystemState != MergedNodeSystemState.end()) { - if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(itNodeSystemState->second->GetMaxClockSkewPeerId()) - && abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) { - maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()); - maxClockSkewPeerId = itNodeSystemState->second->GetMaxClockSkewPeerId(); - maxClockSkewNodeId = nodeId; - } - } - } - - if (!maxClockSkewNodeId) { - timeDifferenceStatus.set_overall(Ydb::Monitoring::StatusFlag::GREEN); - return; - } - - FillNodeInfo(maxClockSkewNodeId, context.Location.mutable_node()); - FillNodeInfo(maxClockSkewPeerId, context.Location.mutable_peer()); - - TDuration maxClockSkewTime = TDuration::MicroSeconds(maxClockSkewUs); - if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); - } else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); - } else { - context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); - } - - timeDifferenceStatus.set_node(ToString(maxClockSkewNodeId)); - timeDifferenceStatus.set_peer(ToString(maxClockSkewPeerId)); - timeDifferenceStatus.set_max_difference_ms(maxClockSkewTime.MilliSeconds()); - timeDifferenceStatus.set_overall(context.GetOverallStatus()); - } - void FillResult(TOverallStateContext context) { if (IsSpecificDatabaseFilter()) { FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]); diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto index 540a2c100fe6..dc47c4ecfb89 100644 --- a/ydb/public/api/protos/ydb_monitoring.proto +++ b/ydb/public/api/protos/ydb_monitoring.proto @@ -106,12 +106,19 @@ message LoadAverageStatus { uint32 cores = 3; } +message TimeDifferenceStatus { + StatusFlag.Status overall = 1; + int64 difference_ms = 2; + string peer = 3; +} + message ComputeNodeStatus { string id = 1; StatusFlag.Status overall = 2; repeated ComputeTabletStatus tablets = 3; repeated ThreadPoolStatus pools = 4; LoadAverageStatus load = 5; + TimeDifferenceStatus max_time_difference = 6; } message ComputeStatus { @@ -122,13 +129,6 @@ message ComputeStatus { float shards_quota_usage = 5; } -message TimeDifferenceStatus { - StatusFlag.Status overall = 1; - int64 max_difference_ms = 2; - string node = 3; - string peer = 4; -} - message LocationNode { uint32 id = 1; string host = 2; @@ -174,6 +174,7 @@ message LocationCompute { LocationNode node = 1; LocationComputePool pool = 2; LocationComputeTablet tablet = 3; + LocationNode peer = 4; } message LocationDatabase { @@ -205,7 +206,6 @@ message DatabaseStatus { StatusFlag.Status overall = 2; StorageStatus storage = 3; ComputeStatus compute = 4; - TimeDifferenceStatus time_difference = 5; } message SelfCheckResult {