Skip to content

Commit

Permalink
restore old behaviour of mapping space disk issues to group issues (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke authored Sep 9, 2024
1 parent c5a4fa9 commit 1b318e1
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 8 deletions.
8 changes: 4 additions & 4 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2075,7 +2075,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
++DisksColors[status];
switch (status) {
case Ydb::Monitoring::StatusFlag::BLUE: // disk is good, but not available
case Ydb::Monitoring::StatusFlag::YELLOW: // disk is initializing, not currently available
// No yellow or orange status here - this is intentional - they are used when a disk is running out of space, but is currently available
case Ydb::Monitoring::StatusFlag::RED: // disk is bad, probably not available
case Ydb::Monitoring::StatusFlag::GREY: // the status is absent, the disk is not available
IncrementFor(realm);
Expand All @@ -2091,7 +2091,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
if (ErasureSpecies == NONE) {
if (FailedDisks > 0) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState});
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) {
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0 || DisksColors[Ydb::Monitoring::StatusFlag::ORANGE] > 0) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
} else if (ErasureSpecies == BLOCK_4_2) {
Expand All @@ -2105,7 +2105,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
} else {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) {
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0 || DisksColors[Ydb::Monitoring::StatusFlag::ORANGE] > 0) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
} else if (ErasureSpecies == MIRROR_3_DC) {
Expand All @@ -2119,7 +2119,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
} else {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) {
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0 || DisksColors[Ydb::Monitoring::StatusFlag::ORANGE] > 0) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
}
Expand Down
49 changes: 45 additions & 4 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {

const int GROUP_START_ID = 0x80000000;
const int VCARD_START_ID = 55;
const int PDISK_START_ID = 42;
const int DEFAULT_GROUP_GENERATION = 3;

const TPathId SUBDOMAIN_KEY = {7000000000, 1};
Expand Down Expand Up @@ -181,7 +182,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}

void AddVSlotsToSysViewResponse(NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr* ev, size_t groupCount,
const TVDisks& vslots, ui32 groupStartId = GROUP_START_ID) {
const TVDisks& vslots, ui32 groupStartId = GROUP_START_ID,
bool withPdisk = false) {
auto& record = (*ev)->Get()->Record;
auto entrySample = record.entries(0);
record.clear_entries();
Expand All @@ -190,10 +192,14 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
const auto *descriptor = NKikimrBlobStorage::EVDiskStatus_descriptor();
for (size_t i = 0; i < groupCount; ++i) {
auto vslotId = VCARD_START_ID;
auto pdiskId = PDISK_START_ID;
for (const auto& vslot : vslots) {
auto* entry = record.add_entries();
entry->CopyFrom(entrySample);
entry->mutable_key()->set_vslotid(vslotId);
if (withPdisk) {
entry->mutable_key()->set_pdiskid(pdiskId);
}
entry->mutable_info()->set_groupid(groupId);
entry->mutable_info()->set_failrealm(vslotId);
if (vslot.Status) {
Expand All @@ -202,6 +208,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
entry->mutable_info()->set_groupgeneration(vslot.Generation);
entry->mutable_info()->set_vdisk(vslotId);
++vslotId;
++pdiskId;
}
++groupId;
}
Expand All @@ -215,6 +222,22 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
entry->mutable_info()->set_name(STORAGE_POOL_NAME);
}

// Rewrites a SysView PDisks response so it contains exactly `count` synthetic
// PDisk entries. Each entry is cloned from the first entry of the original
// response (to keep any fields this helper does not set), assigned a
// sequential pdisk id starting at PDISK_START_ID, and sized so that the
// fraction `occupancy` of a fixed-size disk appears used
// (availablesize = (1 - occupancy) * totalsize).
void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, size_t count, double occupancy) {
    auto& record = (*ev)->Get()->Record;
    const auto sample = record.entries(0); // keep a template before wiping the list
    record.clear_entries();
    const size_t totalSize = 3'200'000'000'000ull; // fixed per-disk capacity used by the fake
    for (size_t idx = 0; idx < count; ++idx) {
        auto* pdisk = record.add_entries();
        pdisk->CopyFrom(sample);
        pdisk->mutable_key()->set_pdiskid(PDISK_START_ID + idx);
        pdisk->mutable_info()->set_totalsize(totalSize);
        pdisk->mutable_info()->set_availablesize((1 - occupancy) * totalSize);
    }
}

void AddGroupVSlotInControllerConfigResponseWithStaticGroup(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev,
const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVDisks& vslots)
{
Expand Down Expand Up @@ -415,7 +438,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
}

Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVDisks& vdisks, bool forStaticGroup = false) {
Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVDisks& vdisks, bool forStaticGroup = false, double occupancy = 0) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
Expand Down Expand Up @@ -451,12 +474,17 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
case NSysView::TEvSysView::EvGetVSlotsResponse: {
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr*>(&ev);
if (forStaticGroup) {
AddVSlotsToSysViewResponse(x, 1, vdisks, 0);
AddVSlotsToSysViewResponse(x, 1, vdisks, 0, true);
} else {
AddVSlotsToSysViewResponse(x, 1, vdisks);
AddVSlotsToSysViewResponse(x, 1, vdisks, GROUP_START_ID, true);
}
break;
}
case NSysView::TEvSysView::EvGetPDisksResponse: {
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetPDisksResponse::TPtr*>(&ev);
AddPDisksToSysViewResponse(x, vdisks.size(), occupancy);
break;
}
case NSysView::TEvSysView::EvGetGroupsResponse: {
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetGroupsResponse::TPtr*>(&ev);
AddGroupsToSysViewResponse(x);
Expand Down Expand Up @@ -669,6 +697,19 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::GOOD);
}

// All 3 vdisks are READY but the PDisks report 0.9 occupancy (presumably past
// the health check's "yellow space" threshold — confirm against
// health_check.cpp): the group must surface as degraded, i.e. exactly one
// YELLOW STORAGE_GROUP issue and no RED one.
Y_UNIT_TEST(YellowGroupIssueOnYellowSpace) {
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, TVDisks{3, NKikimrBlobStorage::READY}, false, 0.9);
Cerr << result.ShortDebugString() << Endl; // dump full result for debugging failed runs
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 0);
}

// Same setup but with 0.95 occupancy (presumably past the "red space"
// threshold — confirm against health_check.cpp): running this far out of
// space must escalate the group issue to RED.
Y_UNIT_TEST(RedGroupIssueOnRedSpace) {
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, TVDisks{3, NKikimrBlobStorage::READY}, false, 0.95);
Cerr << result.ShortDebugString() << Endl; // dump full result for debugging failed runs
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
}

/* HC currently infers group status on its own, so it's never unknown
Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});
Expand Down

0 comments on commit 1b318e1

Please sign in to comment.