Skip to content

Commit

Permalink
do not trigger emergency balancer when all nodes have high usage (#6532
Browse files Browse the repository at this point in the history
…) (#6720)
  • Loading branch information
vporyadke authored Jul 19, 2024
1 parent df15684 commit 56c68e1
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 1 deletion.
3 changes: 2 additions & 1 deletion ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2327,7 +2327,8 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
nodeUsageHistogram.IncrementFor(record.Usage * 100);
}

if (stats.MaxUsage >= GetMaxNodeUsageToKick()) {
double minUsageToKick = GetMaxNodeUsageToKick() - GetNodeUsageRangeToKick();
if (stats.MaxUsage >= GetMaxNodeUsageToKick() && stats.MinUsage < minUsageToKick) {
std::vector<TNodeId> overloadedNodes;
for (const auto& [nodeId, nodeInfo] : Nodes) {
if (nodeInfo.IsAlive() && !nodeInfo.Down && nodeInfo.IsOverloaded()) {
Expand Down
4 changes: 4 additions & 0 deletions ydb/core/mind/hive/hive_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
return CurrentConfig.GetStorageBalancerInflight();
}

// Returns the NodeUsageRangeToKick setting as stored in the current Hive config.
double GetNodeUsageRangeToKick() const {
    const double usageRange = CurrentConfig.GetNodeUsageRangeToKick();
    return usageRange;
}

static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static bool IsSystemTablet(TTabletTypes::EType type);
Expand Down
70 changes: 70 additions & 0 deletions ydb/core/mind/hive/hive_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3961,6 +3961,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
// this value of MaxNodeUsageToKick is chosen specifically to make the test scenario work
// in conjunction with the number of tablets and the network usage metric values used below
app.HiveConfig.SetMaxNodeUsageToKick(0.01);
app.HiveConfig.SetNodeUsageRangeToKick(0);
app.HiveConfig.SetEmergencyBalancerInflight(1); // to ensure fair distribution
});

Expand Down Expand Up @@ -4933,6 +4934,75 @@ Y_UNIT_TEST_SUITE(THiveTest) {
UNIT_ASSERT_VALUES_EQUAL(newDistribution[1].size(), TABLETS_PER_NODE - 1);
}

Y_UNIT_TEST(TestHiveBalancerHighUsage) {
    // Scenario: every node reports a high total usage (0.89 and 0.91).
    // Expectation: the tablet-to-node distribution is the same before and after
    // the balancer had a chance to run.
    static constexpr ui64 NUM_NODES = 2;
    using TDistribution = std::array<std::vector<ui64>, NUM_NODES>;

    TTestBasicRuntime runtime(NUM_NODES, false);
    Setup(runtime, true, 1, [](TAppPrepare& app) {
        // Zero both periods — presumably so that reported metrics are acted upon
        // without waiting; confirm against the Hive config semantics.
        app.HiveConfig.SetTabletKickCooldownPeriod(0);
        app.HiveConfig.SetResourceChangeReactionPeriod(0);
    });

    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);
    const int nodeBase = runtime.GetNodeId(0);
    TActorId senderA = runtime.AllocateEdgeActor();

    // Asks Hive for its tablet list and groups tablet ids by the index of the
    // node they are reported on (node id minus the id of node 0).
    auto getDistribution = [&]() -> TDistribution {
        TDistribution tabletsByNode = {};

        runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
        TAutoPtr<IEventHandle> handle;
        TEvHive::TEvResponseHiveInfo* hiveInfo = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);

        for (const NKikimrHive::TTabletInfo& tabletInfo : hiveInfo->Record.GetTablets()) {
            const auto nodeId = tabletInfo.GetNodeID();
            // Every tablet must be reported on one of the NUM_NODES test nodes.
            UNIT_ASSERT_C((static_cast<int>(nodeId) - nodeBase >= 0) && (nodeId - nodeBase < NUM_NODES),
                    "nodeId# " << nodeId << " nodeBase# " << nodeBase);
            tabletsByNode[nodeId - nodeBase].push_back(tabletInfo.GetTabletID());
        }

        return tabletsByNode;
    };

    CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);

    // wait for creation of nodes
    {
        TDispatchOptions waitNodes;
        waitNodes.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
        runtime.DispatchEvents(waitNodes);
    }

    // Create two dummy tablets, each with its own object id, and wait until each is up.
    TTabletTypes::EType tabletType = TTabletTypes::Dummy;
    for (size_t tabletIdx = 0; tabletIdx < 2; ++tabletIdx) {
        THolder<TEvHive::TEvCreateTablet> createTablet(
            new TEvHive::TEvCreateTablet(testerTablet, 100500 + tabletIdx, tabletType, BINDED_CHANNELS));
        createTablet->Record.SetObjectId(tabletIdx);
        ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(createTablet), 0, true);
        MakeSureTabletIsUp(runtime, tabletId, 0);
    }

    auto initialDistribution = getDistribution();

    // Report the per-node total usage to Hive; the whole set is sent twice.
    std::array<double, NUM_NODES> usages = {.89, .91};
    for (ui32 round = 0; round < 2; ++round) {
        for (ui32 nodeIdx = 0; nodeIdx < NUM_NODES; ++nodeIdx) {
            TActorId metricsSender = runtime.AllocateEdgeActor(nodeIdx);
            THolder<TEvHive::TEvTabletMetrics> usageReport = MakeHolder<TEvHive::TEvTabletMetrics>();
            usageReport->Record.SetTotalNodeUsage(usages[nodeIdx]);

            runtime.SendToPipe(hiveTablet, metricsSender, usageReport.Release(), nodeIdx);
        }
    }

    // Dispatch until a balancer-out event is seen, bounded by a 10 second limit.
    {
        TDispatchOptions waitBalancer;
        waitBalancer.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut);
        runtime.DispatchEvents(waitBalancer, TDuration::Seconds(10));
    }

    // Check that balancer moved no tablets
    auto newDistribution = getDistribution();

    UNIT_ASSERT_EQUAL(initialDistribution, newDistribution);
}

Y_UNIT_TEST(TestUpdateTabletsObjectUpdatesMetrics) {
TTestBasicRuntime runtime(1, false);
Setup(runtime, true);
Expand Down
2 changes: 2 additions & 0 deletions ydb/core/mind/hive/monitoring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
UpdateConfig(db, "MinNetworkScatterToBalance", configUpdates);
UpdateConfig(db, "MinCounterScatterToBalance", configUpdates);
UpdateConfig(db, "MaxNodeUsageToKick", configUpdates, TSchemeIds::State::MaxNodeUsageToKick);
UpdateConfig(db, "NodeUsageRangeToKick", configUpdates);
UpdateConfig(db, "ResourceChangeReactionPeriod", configUpdates, TSchemeIds::State::ResourceChangeReactionPeriod);
UpdateConfig(db, "TabletKickCooldownPeriod", configUpdates, TSchemeIds::State::TabletKickCooldownPeriod);
UpdateConfig(db, "SpreadNeighbours", configUpdates, TSchemeIds::State::SpreadNeighbours);
Expand Down Expand Up @@ -1140,6 +1141,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
ShowConfig(out, "MinCounterScatterToBalance");
ShowConfig(out, "MinNodeUsageToBalance");
ShowConfig(out, "MaxNodeUsageToKick");
ShowConfig(out, "NodeUsageRangeToKick");
ShowConfig(out, "ResourceChangeReactionPeriod");
ShowConfig(out, "TabletKickCooldownPeriod");
ShowConfig(out, "NodeSelectStrategy");
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1469,6 +1469,7 @@ message THiveConfig {
optional double MinGroupUsageToBalance = 72 [default = 0.1];
optional uint64 StorageBalancerInflight = 73 [default = 1];
optional bool EnableDestroyOperations = 74 [default = false];
optional double NodeUsageRangeToKick = 75 [default = 0.2];
}

message TBlobCacheConfig {
Expand Down

0 comments on commit 56c68e1

Please sign in to comment.