Skip to content

Commit

Permalink
added database filter for time difference - merge stable-24-2 (ydb-pl…
Browse files Browse the repository at this point in the history
  • Loading branch information
StekPerepolnen authored and uzhastik committed Jul 5, 2024
1 parent 4e06d9b commit fd75036
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 42 deletions.
6 changes: 4 additions & 2 deletions ydb/core/driver_lib/run/kikimr_services_initializers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -702,8 +702,10 @@ void TBasicServicesInitializer::InitializeServices(NActors::TActorSystemSetup* s
data.Yellow ? NKikimrWhiteboard::EFlag::Yellow :
data.Orange ? NKikimrWhiteboard::EFlag::Orange :
data.Red ? NKikimrWhiteboard::EFlag::Red : NKikimrWhiteboard::EFlag()));
data.ActorSystem->Send(whiteboardId, new NNodeWhiteboard::TEvWhiteboard::TEvClockSkewUpdate(
data.PeerId, data.ClockSkew));
if (data.ReportClockSkew) {
data.ActorSystem->Send(whiteboardId, new NNodeWhiteboard::TEvWhiteboard::TEvClockSkewUpdate(
data.PeerId, data.ClockSkew));
}
};
}

Expand Down
81 changes: 43 additions & 38 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
ui64 StorageQuota;
ui64 StorageUsage;
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
TNodeId MaxTimeDifferenceNodeId = 0;
};

struct TSelfCheckResult {
Expand Down Expand Up @@ -813,10 +814,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
ReplyAndPassAway();
}

bool IsStaticNode(const TEvInterconnect::TNodeInfo& nodeInfo) const {
bool IsStaticNode(const TNodeId nodeId) const {
TAppData* appData = AppData();
if (appData->DynamicNameserviceConfig) {
return nodeInfo.NodeId <= AppData()->DynamicNameserviceConfig->MaxStaticNodeId;
return nodeId <= AppData()->DynamicNameserviceConfig->MaxStaticNodeId;
} else {
return true;
}
Expand All @@ -827,7 +828,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
NodesInfo = ev->Release();
for (const auto& ni : NodesInfo->Nodes) {
MergedNodeInfo[ni.NodeId] = &ni;
if (IsStaticNode(ni) && needComputeFromStaticNodes) {
if (IsStaticNode(ni.NodeId) && needComputeFromStaticNodes) {
DatabaseState[DomainPath].ComputeNodeIds.push_back(ni.NodeId);
RequestComputeNode(ni.NodeId);
}
Expand Down Expand Up @@ -1251,7 +1252,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}
}

void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());

TSelfCheckContext rrContext(&context, "NODE_UPTIME");
Expand Down Expand Up @@ -1289,6 +1290,32 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}
loadAverageStatus.set_overall(laContext.GetOverallStatus());
}

if (nodeSystemState.HasMaxClockSkewPeerId()) {
TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
Ydb::Monitoring::StatusFlag::Status status;
if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
status = Ydb::Monitoring::StatusFlag::ORANGE;
} else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
status = Ydb::Monitoring::StatusFlag::YELLOW;
} else {
status = Ydb::Monitoring::StatusFlag::GREEN;
}

if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
if (status == Ydb::Monitoring::StatusFlag::GREEN) {
tdContext.ReportStatus(status);
} else {
tdContext.ReportStatus(status, TStringBuilder() << "Node is "
<< timeDifferenceDuration.MilliSeconds() << " ms "
<< (timeDifferenceUs > 0 ? "behind " : "ahead of ")
<< "peer [" << peerId << "]", ETags::SyncState);
}
}
}
} else {
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
// TStringBuilder() << "Compute node is not available",
Expand Down Expand Up @@ -1320,12 +1347,24 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
}
long maxTimeDifferenceUs = 0;
for (TNodeId nodeId : *computeNodeIds) {
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
if (itNodeSystemState != MergedNodeSystemState.end()) {
if (std::count(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) > 0
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxTimeDifferenceUs) {
maxTimeDifferenceUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
databaseState.MaxTimeDifferenceNodeId = nodeId;
}
}
}
for (TNodeId nodeId : *computeNodeIds) {
auto& computeNode = *computeStatus.add_nodes();
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
}
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
computeNodeIds->push_back(0); // for tablets without node
for (TNodeId nodeId : *computeNodeIds) {
Expand Down Expand Up @@ -2072,39 +2111,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);

// Reports the largest clock skew found in MergedNodeSystemState as a single
// "NODES_TIME_DIFFERENCE" check and merges it into the overall state.
//
// The node whose GetMaxClockSkewWithPeerUs() has the greatest absolute value is
// selected; that node and its peer are recorded in the issue location. The skew
// is rated against MAX_CLOCKSKEW_ORANGE_ISSUE_TIME and
// MAX_CLOCKSKEW_YELLOW_ISSUE_TIME; a skew that exceeds neither threshold is
// reported as GREEN without a message.
void FillNodesSyncStatus(TOverallStateContext& context) {
    long worstSkewUs = 0;
    TNodeId worstPeerId = 0;
    TNodeId worstNodeId = 0;
    for (auto& [id, systemState] : MergedNodeSystemState) {
        // `auto` keeps the exact result type of abs(), so the comparison below
        // is performed on the same types as before.
        const auto skewUs = abs(systemState->GetMaxClockSkewWithPeerUs());
        if (skewUs > worstSkewUs) {
            worstSkewUs = skewUs;
            worstPeerId = systemState->GetMaxClockSkewPeerId();
            worstNodeId = id;
        }
    }

    // No node was selected (worstNodeId is still 0): nothing to report.
    if (!worstNodeId) {
        return;
    }

    TSelfCheckResult syncContext;
    syncContext.Type = "NODES_TIME_DIFFERENCE";
    FillNodeInfo(worstNodeId, syncContext.Location.mutable_node());
    FillNodeInfo(worstPeerId, syncContext.Location.mutable_peer());

    // Pick the severity first, then report once.
    const TDuration worstSkew = TDuration::MicroSeconds(worstSkewUs);
    Ydb::Monitoring::StatusFlag::Status status = Ydb::Monitoring::StatusFlag::GREEN;
    if (worstSkew > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
        status = Ydb::Monitoring::StatusFlag::ORANGE;
    } else if (worstSkew > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
        status = Ydb::Monitoring::StatusFlag::YELLOW;
    }

    if (status == Ydb::Monitoring::StatusFlag::GREEN) {
        syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
    } else {
        syncContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << worstSkew.MilliSeconds() << " ms", ETags::SyncState);
    }

    context.UpdateMaxStatus(syncContext.GetOverallStatus());
    context.AddIssues(syncContext.IssueRecords);
}

void FillResult(TOverallStateContext context) {
if (IsSpecificDatabaseFilter()) {
FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);
Expand All @@ -2113,7 +2119,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
FillDatabaseResult(context, path, state);
}
}
FillNodesSyncStatus(context);
if (DatabaseState.empty()) {
Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status());
TSelfCheckResult tabletContext;
Expand Down
5 changes: 4 additions & 1 deletion ydb/library/actors/interconnect/interconnect_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,10 @@ namespace NActors {
bool Orange;
bool Red;
i64 ClockSkew;
bool ReportClockSkew;

TWhiteboardSessionStatus(TActorSystem* actorSystem, ui32 peerId, const TString& peer, bool connected, bool green, bool yellow, bool orange, bool red, i64 clockSkew)
TWhiteboardSessionStatus(TActorSystem* actorSystem, ui32 peerId, const TString& peer, bool connected,
bool green, bool yellow, bool orange, bool red, i64 clockSkew, bool reportClockSkew)
: ActorSystem(actorSystem)
, PeerId(peerId)
, Peer(peer)
Expand All @@ -82,6 +84,7 @@ namespace NActors {
, Orange(orange)
, Red(red)
, ClockSkew(clockSkew)
, ReportClockSkew(reportClockSkew)
{}
};

Expand Down
7 changes: 6 additions & 1 deletion ydb/library/actors/interconnect/interconnect_tcp_session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -999,6 +999,10 @@ namespace NActors {
} while (false);
}

// We need to track clock skew only for connections between nodes of the same tenant;
// in that case both nodes have the same scope.
bool reportClockSkew = Proxy->Common->LocalScopeId.first != 0 && Proxy->Common->LocalScopeId == Params.PeerScopeId;

callback({TlsActivationContext->ExecutorThread.ActorSystem,
Proxy->PeerNodeId,
Proxy->Metrics->GetHumanFriendlyPeerHostName(),
Expand All @@ -1007,7 +1011,8 @@ namespace NActors {
flagState == EFlag::YELLOW,
flagState == EFlag::ORANGE,
flagState == EFlag::RED,
ReceiveContext->ClockSkew_us.load()});
ReceiveContext->ClockSkew_us.load(),
reportClockSkew});
}

if (connected) {
Expand Down

0 comments on commit fd75036

Please sign in to comment.