Skip to content

Commit

Permalink
move time_difference to compute_node
Browse files Browse the repository at this point in the history
  • Loading branch information
StekPerepolnen committed Jul 1, 2024
1 parent 0dc0c27 commit 5e8f29d
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 68 deletions.
106 changes: 46 additions & 60 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1450,7 +1450,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}
}

void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference) {
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());

TSelfCheckContext rrContext(&context, "NODE_UPTIME");
Expand Down Expand Up @@ -1488,6 +1488,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}
loadAverageStatus.set_overall(laContext.GetOverallStatus());
}

if (nodeSystemState.HasMaxClockSkewPeerId()) {
TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
Ydb::Monitoring::StatusFlag::Status status;
if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
status = Ydb::Monitoring::StatusFlag::ORANGE;
} else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
status = Ydb::Monitoring::StatusFlag::YELLOW;
} else {
status = Ydb::Monitoring::StatusFlag::GREEN;
}

computeNodeStatus.mutable_max_time_difference()->set_peer(ToString(peerId));
computeNodeStatus.mutable_max_time_difference()->set_difference_ms(timeDifferenceDuration.MilliSeconds());
computeNodeStatus.set_overall(status);

if (reportTimeDifference) {
TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
FillNodeInfo(peerId, tdContext.Location.mutable_compute()->mutable_peer());
if (status == Ydb::Monitoring::StatusFlag::GREEN) {
tdContext.ReportStatus(status);
} else {
tdContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds() << " ms", ETags::SyncState);
}
}
}
} else {
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
// TStringBuilder() << "Compute node is not available",
Expand Down Expand Up @@ -1552,14 +1580,27 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
}
long maxClockSkewUs = 0;
TNodeId maxClockSkewNodeId = 0;
for (TNodeId nodeId : *computeNodeIds) {
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
if (itNodeSystemState != MergedNodeSystemState.end()) {
if (std::count(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) > 0
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
maxClockSkewNodeId = nodeId;
}
}
}
for (TNodeId nodeId : *computeNodeIds) {
auto& computeNode = *computeStatus.add_nodes();
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}, maxClockSkewNodeId == nodeId);
}
FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
computeNodeIds->push_back(0); // for tablets without node
for (TNodeId nodeId : *computeNodeIds) {
Expand Down Expand Up @@ -2579,17 +2620,14 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
databaseStatus.set_name(path);
FillCompute(state, *databaseStatus.mutable_compute(), {&dbContext, "COMPUTE"});
FillStorage(state, *databaseStatus.mutable_storage(), {&dbContext, "STORAGE"});
FillTimeDifference(state, *databaseStatus.mutable_time_difference(), {&dbContext, "NODES_TIME_DIFFERENCE"});
if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN
&& databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
dbContext.ReportStatus(MaxStatus(databaseStatus.compute().overall(), databaseStatus.storage().overall()),
"Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState, ETags::SyncState });
"Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState});
} else if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState, ETags::SyncState});
dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState});
} else if (databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState, ETags::SyncState});
} else if (databaseStatus.time_difference().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
dbContext.ReportStatus(databaseStatus.time_difference().overall(), "Database has time difference issues", ETags::DBState, {ETags::SyncState});
dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState});
}
databaseStatus.set_overall(dbContext.GetOverallStatus());
context.UpdateMaxStatus(dbContext.GetOverallStatus());
Expand All @@ -2602,58 +2640,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
// Clock-skew thresholds used when grading the time difference between nodes:
// a skew strictly greater than the ORANGE threshold (25000 us) is reported as ORANGE,
// otherwise a skew strictly greater than the YELLOW threshold (5000 us) is reported as YELLOW.
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);

// Fills `timeDifferenceStatus` with the largest absolute clock skew found among
// the database's compute nodes and reports the matching status through `context`.
//
// Only (node, peer) pairs where both ids pass IsTimeDifferenceCheckNode() are considered.
// When no such pair with a non-zero skew exists, the overall status is set to GREEN
// and nothing else is filled in or reported.
void FillTimeDifference(TDatabaseState& databaseState, Ydb::Monitoring::TimeDifferenceStatus& timeDifferenceStatus, TSelfCheckContext context) {
    // Node list to scan. For a database that has a ResourcePathId and is not in the
    // exclusive serverless compute mode, the node list of the database registered
    // in FilterDomainKey under that path id is used instead (when such an entry exists).
    TVector<TNodeId>* nodeIds = &databaseState.ComputeNodeIds;
    if (databaseState.ResourcePathId
        && databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive)
    {
        const TSubDomainKey resourceKey(databaseState.ResourcePathId.OwnerId, databaseState.ResourcePathId.LocalPathId);
        auto sharedIt = FilterDomainKey.find(resourceKey);
        if (sharedIt != FilterDomainKey.end()) {
            nodeIds = &DatabaseState[sharedIt->second].ComputeNodeIds;
        }
    }

    // Track the node/peer pair with the largest absolute skew seen so far.
    long worstSkewUs = 0;
    TNodeId worstNodeId = 0;
    TNodeId worstPeerId = 0;
    for (TNodeId candidateId : *nodeIds) {
        auto stateIt = MergedNodeSystemState.find(candidateId);
        if (stateIt == MergedNodeSystemState.end()) {
            continue; // no system state known for this node
        }
        if (!IsTimeDifferenceCheckNode(candidateId)) {
            continue;
        }
        const auto& nodeState = stateIt->second;
        const auto candidatePeerId = nodeState->GetMaxClockSkewPeerId();
        if (!IsTimeDifferenceCheckNode(candidatePeerId)) {
            continue;
        }
        const auto skewUs = abs(nodeState->GetMaxClockSkewWithPeerUs());
        if (skewUs > worstSkewUs) {
            worstSkewUs = skewUs;
            worstPeerId = candidatePeerId;
            worstNodeId = candidateId;
        }
    }

    // A zero node id means no qualifying pair was found.
    if (!worstNodeId) {
        timeDifferenceStatus.set_overall(Ydb::Monitoring::StatusFlag::GREEN);
        return;
    }

    FillNodeInfo(worstNodeId, context.Location.mutable_node());
    FillNodeInfo(worstPeerId, context.Location.mutable_peer());

    // Grade the skew against the thresholds, then report exactly once.
    const TDuration worstSkew = TDuration::MicroSeconds(worstSkewUs);
    Ydb::Monitoring::StatusFlag::Status severity = Ydb::Monitoring::StatusFlag::GREEN;
    if (worstSkew > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
        severity = Ydb::Monitoring::StatusFlag::ORANGE;
    } else if (worstSkew > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
        severity = Ydb::Monitoring::StatusFlag::YELLOW;
    }

    if (severity == Ydb::Monitoring::StatusFlag::GREEN) {
        context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
    } else {
        context.ReportStatus(severity, TStringBuilder() << "The nodes have a time difference of " << worstSkew.MilliSeconds() << " ms", ETags::SyncState);
    }

    timeDifferenceStatus.set_node(ToString(worstNodeId));
    timeDifferenceStatus.set_peer(ToString(worstPeerId));
    timeDifferenceStatus.set_max_difference_ms(worstSkew.MilliSeconds());
    timeDifferenceStatus.set_overall(context.GetOverallStatus());
}

void FillResult(TOverallStateContext context) {
if (IsSpecificDatabaseFilter()) {
FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);
Expand Down
16 changes: 8 additions & 8 deletions ydb/public/api/protos/ydb_monitoring.proto
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,19 @@ message LoadAverageStatus {
uint32 cores = 3;
}

// Clock difference between a compute node and one of its peers.
// Embedded in ComputeNodeStatus as `max_time_difference`.
message TimeDifferenceStatus {
// Status flag for this time-difference entry.
StatusFlag.Status overall = 1;
// Absolute clock difference with the peer, in milliseconds.
int64 difference_ms = 2;
// Id of the peer node the difference was measured against, formatted as a string.
string peer = 3;
}

// Health-check result for a single compute node.
message ComputeNodeStatus {
string id = 1;
// Overall status flag of the node.
StatusFlag.Status overall = 2;
repeated ComputeTabletStatus tablets = 3;
repeated ThreadPoolStatus pools = 4;
LoadAverageStatus load = 5;
// Largest clock difference this node reports against a peer node.
TimeDifferenceStatus max_time_difference = 6;
}

message ComputeStatus {
Expand All @@ -122,13 +129,6 @@ message ComputeStatus {
float shards_quota_usage = 5;
}

// Previous, database-level layout of the time-difference status:
// describes the single node/peer pair with the largest clock difference.
message TimeDifferenceStatus {
// Overall status flag derived from the reported time difference.
StatusFlag.Status overall = 1;
// Largest absolute clock difference found, in milliseconds.
int64 max_difference_ms = 2;
// Id of the node that reported the largest difference, formatted as a string.
string node = 3;
// Id of the peer node the difference was measured against, formatted as a string.
string peer = 4;
}

message LocationNode {
uint32 id = 1;
string host = 2;
Expand Down Expand Up @@ -174,6 +174,7 @@ message LocationCompute {
LocationNode node = 1;
LocationComputePool pool = 2;
LocationComputeTablet tablet = 3;
LocationNode peer = 4;
}

message LocationDatabase {
Expand Down Expand Up @@ -205,7 +206,6 @@ message DatabaseStatus {
StatusFlag.Status overall = 2;
StorageStatus storage = 3;
ComputeStatus compute = 4;
TimeDifferenceStatus time_difference = 5;
}

message SelfCheckResult {
Expand Down

0 comments on commit 5e8f29d

Please sign in to comment.