From e53057764ad2eafb9893c9e26cb10d74983c7399 Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Thu, 5 Sep 2024 07:13:04 +0000 Subject: [PATCH] restore old behaviour of mapping disk space issues to group issues --- ydb/core/health_check/health_check.cpp | 8 ++-- ydb/core/health_check/health_check_ut.cpp | 49 +++++++++++++++++++++-- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index ddcaa5a3b7fd..979e85eb53ca 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -2075,7 +2075,7 @@ class TSelfCheckRequest : public TActorBootstrapped { ++DisksColors[status]; switch (status) { case Ydb::Monitoring::StatusFlag::BLUE: // disk is good, but not available - case Ydb::Monitoring::StatusFlag::YELLOW: // disk is initializing, not currently available + // No yellow or orange status here - this is intentional: they are used when a disk is running out of space but is still available case Ydb::Monitoring::StatusFlag::RED: // disk is bad, probably not available case Ydb::Monitoring::StatusFlag::GREY: // the status is absent, the disk is not available IncrementFor(realm); @@ -2091,7 +2091,7 @@ class TSelfCheckRequest : public TActorBootstrapped { if (ErasureSpecies == NONE) { if (FailedDisks > 0) { context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState}); - } else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) { + } else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0 || DisksColors[Ydb::Monitoring::StatusFlag::ORANGE] > 0) { context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState}); } } else if (ErasureSpecies == BLOCK_4_2) { @@ -2105,7 +2105,7 @@ class TSelfCheckRequest : public TActorBootstrapped { } else { context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", 
ETags::GroupState, {ETags::VDiskState}); } - } else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) { + } else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0 || DisksColors[Ydb::Monitoring::StatusFlag::ORANGE] > 0) { context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState}); } } else if (ErasureSpecies == MIRROR_3_DC) { @@ -2119,7 +2119,7 @@ class TSelfCheckRequest : public TActorBootstrapped { } else { context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState}); } - } else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) { + } else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0 || DisksColors[Ydb::Monitoring::StatusFlag::ORANGE] > 0) { context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState}); } } diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp index e28c968422ed..bbb00c7bfbfd 100644 --- a/ydb/core/health_check/health_check_ut.cpp +++ b/ydb/core/health_check/health_check_ut.cpp @@ -58,6 +58,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { const int GROUP_START_ID = 0x80000000; const int VCARD_START_ID = 55; + const int PDISK_START_ID = 42; const int DEFAULT_GROUP_GENERATION = 3; const TPathId SUBDOMAIN_KEY = {7000000000, 1}; @@ -181,7 +182,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } void AddVSlotsToSysViewResponse(NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr* ev, size_t groupCount, - const TVDisks& vslots, ui32 groupStartId = GROUP_START_ID) { + const TVDisks& vslots, ui32 groupStartId = GROUP_START_ID, + bool withPdisk = false) { auto& record = (*ev)->Get()->Record; auto entrySample = record.entries(0); record.clear_entries(); @@ -190,10 +192,14 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { const auto *descriptor = NKikimrBlobStorage::EVDiskStatus_descriptor(); for (size_t i = 0; i < groupCount; ++i) { auto 
vslotId = VCARD_START_ID; + auto pdiskId = PDISK_START_ID; for (const auto& vslot : vslots) { auto* entry = record.add_entries(); entry->CopyFrom(entrySample); entry->mutable_key()->set_vslotid(vslotId); + if (withPdisk) { + entry->mutable_key()->set_pdiskid(pdiskId); + } entry->mutable_info()->set_groupid(groupId); entry->mutable_info()->set_failrealm(vslotId); if (vslot.Status) { @@ -202,6 +208,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { entry->mutable_info()->set_groupgeneration(vslot.Generation); entry->mutable_info()->set_vdisk(vslotId); ++vslotId; + ++pdiskId; } ++groupId; } @@ -215,6 +222,22 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { entry->mutable_info()->set_name(STORAGE_POOL_NAME); } + void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, size_t count, double occupancy) { + auto& record = (*ev)->Get()->Record; + auto entrySample = record.entries(0); + record.clear_entries(); + auto pdiskId = PDISK_START_ID; + const size_t totalSize = 3'200'000'000'000ull; + for (size_t i = 0; i < count; ++i) { + auto* entry = record.add_entries(); + entry->CopyFrom(entrySample); + entry->mutable_key()->set_pdiskid(pdiskId); + entry->mutable_info()->set_totalsize(totalSize); + entry->mutable_info()->set_availablesize((1 - occupancy) * totalSize); + ++pdiskId; + } + } + void AddGroupVSlotInControllerConfigResponseWithStaticGroup(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev, const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVDisks& vslots) { @@ -415,7 +438,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords); } - Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVDisks& vdisks, bool forStaticGroup = false) { + Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVDisks& vdisks, bool forStaticGroup = false, double occupancy = 0) { 
TPortManager tp; ui16 port = tp.GetPort(2134); ui16 grpcPort = tp.GetPort(2135); @@ -451,12 +474,17 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { case NSysView::TEvSysView::EvGetVSlotsResponse: { auto* x = reinterpret_cast(&ev); if (forStaticGroup) { - AddVSlotsToSysViewResponse(x, 1, vdisks, 0); + AddVSlotsToSysViewResponse(x, 1, vdisks, 0, true); } else { - AddVSlotsToSysViewResponse(x, 1, vdisks); + AddVSlotsToSysViewResponse(x, 1, vdisks, GROUP_START_ID, true); } break; } + case NSysView::TEvSysView::EvGetPDisksResponse: { + auto* x = reinterpret_cast(&ev); + AddPDisksToSysViewResponse(x, vdisks.size(), occupancy); + break; + } case NSysView::TEvSysView::EvGetGroupsResponse: { auto* x = reinterpret_cast(&ev); AddGroupsToSysViewResponse(x); @@ -669,6 +697,19 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::GOOD); } + Y_UNIT_TEST(YellowGroupIssueOnYellowSpace) { + auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, TVDisks{3, NKikimrBlobStorage::READY}, false, 0.9); + Cerr << result.ShortDebugString() << Endl; + CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); + CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 0); + } + + Y_UNIT_TEST(RedGroupIssueOnRedSpace) { + auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, TVDisks{3, NKikimrBlobStorage::READY}, false, 0.95); + Cerr << result.ShortDebugString() << Endl; + CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1); + } + /* HC currently infers group status on its own, so it's never unknown Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) { auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});