Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restore old behaviour of mapping disk space issues to group issues #8767

Merged
merged 1 commit into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2075,7 +2075,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
++DisksColors[status];
switch (status) {
case Ydb::Monitoring::StatusFlag::BLUE: // disk is good, but not available
case Ydb::Monitoring::StatusFlag::YELLOW: // disk is initializing, not currently available
// No yellow or orange status here - this is intentional - they are used when a disk is running out of space, but is currently available
case Ydb::Monitoring::StatusFlag::RED: // disk is bad, probably not available
case Ydb::Monitoring::StatusFlag::GREY: // the status is absent, the disk is not available
IncrementFor(realm);
Expand All @@ -2091,7 +2091,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
if (ErasureSpecies == NONE) {
if (FailedDisks > 0) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState});
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) {
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0 || DisksColors[Ydb::Monitoring::StatusFlag::ORANGE] > 0) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
} else if (ErasureSpecies == BLOCK_4_2) {
Expand All @@ -2105,7 +2105,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
} else {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) {
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0 || DisksColors[Ydb::Monitoring::StatusFlag::ORANGE] > 0) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
} else if (ErasureSpecies == MIRROR_3_DC) {
Expand All @@ -2119,7 +2119,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
} else {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) {
} else if (DisksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0 || DisksColors[Ydb::Monitoring::StatusFlag::ORANGE] > 0) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
}
Expand Down
49 changes: 45 additions & 4 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {

const int GROUP_START_ID = 0x80000000;
const int VCARD_START_ID = 55;
const int PDISK_START_ID = 42;
const int DEFAULT_GROUP_GENERATION = 3;

const TPathId SUBDOMAIN_KEY = {7000000000, 1};
Expand Down Expand Up @@ -181,7 +182,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}

void AddVSlotsToSysViewResponse(NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr* ev, size_t groupCount,
const TVDisks& vslots, ui32 groupStartId = GROUP_START_ID) {
const TVDisks& vslots, ui32 groupStartId = GROUP_START_ID,
bool withPdisk = false) {
auto& record = (*ev)->Get()->Record;
auto entrySample = record.entries(0);
record.clear_entries();
Expand All @@ -190,10 +192,14 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
const auto *descriptor = NKikimrBlobStorage::EVDiskStatus_descriptor();
for (size_t i = 0; i < groupCount; ++i) {
auto vslotId = VCARD_START_ID;
auto pdiskId = PDISK_START_ID;
for (const auto& vslot : vslots) {
auto* entry = record.add_entries();
entry->CopyFrom(entrySample);
entry->mutable_key()->set_vslotid(vslotId);
if (withPdisk) {
entry->mutable_key()->set_pdiskid(pdiskId);
}
entry->mutable_info()->set_groupid(groupId);
entry->mutable_info()->set_failrealm(vslotId);
if (vslot.Status) {
Expand All @@ -202,6 +208,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
entry->mutable_info()->set_groupgeneration(vslot.Generation);
entry->mutable_info()->set_vdisk(vslotId);
++vslotId;
++pdiskId;
}
++groupId;
}
Expand All @@ -215,6 +222,22 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
entry->mutable_info()->set_name(STORAGE_POOL_NAME);
}

// Rewrites the PDisks sys-view response carried by `ev` so that it holds exactly
// `count` entries. Every entry is a copy of the response's original first entry,
// with the PDisk id assigned sequentially starting at PDISK_START_ID, a fixed
// total size, and an available size equal to the (1 - occupancy) share of that total.
// The incoming response must already contain at least one entry, because
// entries(0) is read as the template before the list is cleared.
void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, size_t count, double occupancy) {
    const size_t totalSize = 3'200'000'000'000ull;
    // Free space derived from the requested occupancy fraction.
    const auto availableSize = (1 - occupancy) * totalSize;

    auto& record = (*ev)->Get()->Record;
    // Take the template copy before wiping the entry list.
    auto prototype = record.entries(0);
    record.clear_entries();

    auto nextPDiskId = PDISK_START_ID;
    for (size_t added = 0; added != count; ++added, ++nextPDiskId) {
        auto& pdisk = *record.add_entries();
        pdisk.CopyFrom(prototype);
        pdisk.mutable_key()->set_pdiskid(nextPDiskId);

        auto& info = *pdisk.mutable_info();
        info.set_totalsize(totalSize);
        info.set_availablesize(availableSize);
    }
}

void AddGroupVSlotInControllerConfigResponseWithStaticGroup(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev,
const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVDisks& vslots)
{
Expand Down Expand Up @@ -415,7 +438,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
}

Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVDisks& vdisks, bool forStaticGroup = false) {
Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVDisks& vdisks, bool forStaticGroup = false, double occupancy = 0) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
Expand Down Expand Up @@ -451,12 +474,17 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
case NSysView::TEvSysView::EvGetVSlotsResponse: {
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr*>(&ev);
if (forStaticGroup) {
AddVSlotsToSysViewResponse(x, 1, vdisks, 0);
AddVSlotsToSysViewResponse(x, 1, vdisks, 0, true);
} else {
AddVSlotsToSysViewResponse(x, 1, vdisks);
AddVSlotsToSysViewResponse(x, 1, vdisks, GROUP_START_ID, true);
}
break;
}
case NSysView::TEvSysView::EvGetPDisksResponse: {
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetPDisksResponse::TPtr*>(&ev);
AddPDisksToSysViewResponse(x, vdisks.size(), occupancy);
break;
}
case NSysView::TEvSysView::EvGetGroupsResponse: {
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetGroupsResponse::TPtr*>(&ev);
AddGroupsToSysViewResponse(x);
Expand Down Expand Up @@ -669,6 +697,19 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::GOOD);
}

// Requests a health check for a non-static group of three READY VDisks with the
// occupancy argument set to 0.9, then checks the STORAGE_GROUP issues: YELLOW is
// checked against 1 and RED against 0 (the last argument is presumably the
// expected issue count — confirm against CheckHcResultHasIssuesWithStatus).
Y_UNIT_TEST(YellowGroupIssueOnYellowSpace) {
    const TVDisks readyVDisks{3, NKikimrBlobStorage::READY};
    const bool forStaticGroup = false;
    const double occupancy = 0.9;

    auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, readyVDisks, forStaticGroup, occupancy);
    // Dump the response so a failing run shows what the health check returned.
    Cerr << result.ShortDebugString() << Endl;

    CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
    CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 0);
}

// Requests a health check for a non-static group of three READY VDisks with the
// occupancy argument set to 0.95, then checks the RED STORAGE_GROUP issues
// against 1 (the last argument is presumably the expected issue count — confirm
// against CheckHcResultHasIssuesWithStatus).
Y_UNIT_TEST(RedGroupIssueOnRedSpace) {
    const TVDisks readyVDisks{3, NKikimrBlobStorage::READY};
    const bool forStaticGroup = false;
    const double occupancy = 0.95;

    auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, readyVDisks, forStaticGroup, occupancy);
    // Dump the response so a failing run shows what the health check returned.
    Cerr << result.ShortDebugString() << Endl;

    CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
}

/* HC currently infers group status on its own, so it's never unknown
Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});
Expand Down
Loading