Skip to content

Commit

Permalink
Fix pdisk restart race on vdisk log reading (#4190)
Browse files Browse the repository at this point in the history
  • Loading branch information
SammyVimes authored May 13, 2024
1 parent ce48e6c commit 8cbb144
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 3 deletions.
1 change: 1 addition & 0 deletions ydb/core/blobstorage/nodewarden/node_warden_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,7 @@ namespace NKikimr::NStorage {

std::map<TVSlotId, TVDiskRecord> LocalVDisks;
std::map<TVSlotId, ui64> SlayInFlight;
std::set<ui32> PDiskRestartInFlight;
TIntrusiveList<TVDiskRecord, TUnreportedMetricTag> VDisksWithUnreportedMetrics;

void DestroyLocalVDisk(TVDiskRecord& vdisk);
Expand Down
13 changes: 13 additions & 0 deletions ydb/core/blobstorage/nodewarden/node_warden_pdisk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ namespace NKikimr::NStorage {
TActivationContext::Send(new IEventHandle(TEvents::TSystem::Poison, 0, actorId, {}, nullptr, 0));
Send(WhiteboardId, new NNodeWhiteboard::TEvWhiteboard::TEvPDiskStateDelete(pdiskId));
LocalPDisks.erase(it);
PDiskRestartInFlight.erase(pdiskId);

// mark vdisks still living over this PDisk as destroyed ones
for (auto it = LocalVDisks.lower_bound({LocalNodeId, pdiskId, 0}); it != LocalVDisks.end() &&
Expand Down Expand Up @@ -224,6 +225,11 @@ namespace NKikimr::NStorage {
}

void TNodeWarden::OnPDiskRestartFinished(ui32 pdiskId, NKikimrProto::EReplyStatus status) {
if (PDiskRestartInFlight.erase(pdiskId) == 0) {
// There was no restart in progress.
return;
}

const TPDiskKey pdiskKey(LocalNodeId, pdiskId);

const TVSlotId from(pdiskKey.NodeId, pdiskKey.PDiskId, 0);
Expand Down Expand Up @@ -278,6 +284,13 @@ namespace NKikimr::NStorage {
return;
}

const auto [_, inserted] = PDiskRestartInFlight.emplace(pdiskId);

if (!inserted) {
// Restart is already in progress.
return;
}

TIntrusivePtr<TPDiskConfig> pdiskConfig = CreatePDiskConfig(it->second.Record);

Cfg->PDiskKey.Initialize();
Expand Down
4 changes: 4 additions & 0 deletions ydb/core/blobstorage/nodewarden/node_warden_vdisk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ namespace NKikimr::NStorage {
return;
}

if (PDiskRestartInFlight.contains(vslotId.PDiskId)) {
return;
}

// find underlying PDisk and determine its media type
auto pdiskIt = LocalPDisks.find({vslotId.NodeId, vslotId.PDiskId});
Y_VERIFY_S(pdiskIt != LocalPDisks.end(), "PDiskId# " << vslotId.NodeId << ":" << vslotId.PDiskId << " not found");
Expand Down
7 changes: 5 additions & 2 deletions ydb/core/blobstorage/pdisk/blobstorage_pdisk_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,8 @@ class TPDiskActor : public TActorBootstrapped<TPDiskActor> {
}

void Handle(TEvBlobStorage::TEvAskWardenRestartPDiskResult::TPtr &ev) {
bool restartAllowed = ev->Get()->RestartAllowed;

bool isReadingLog = PDisk->InitPhase == EInitPhase::ReadingSysLog || PDisk->InitPhase == EInitPhase::ReadingLog;

if ((isReadingLog && CurrentStateFunc() != &TPDiskActor::StateError) || IsFormattingNow) {
Expand All @@ -985,11 +987,12 @@ class TPDiskActor : public TActorBootstrapped<TPDiskActor> {
PendingRestartResponse(false, s);
PendingRestartResponse = {};
}

Send(ev->Sender, new TEvBlobStorage::TEvNotifyWardenPDiskRestarted(PDisk->PDiskId, NKikimrProto::EReplyStatus::NOTREADY));

return;
}

bool restartAllowed = ev->Get()->RestartAllowed;

if (restartAllowed) {
MainKey = ev->Get()->MainKey;
SecureWipeBuffer((ui8*)ev->Get()->MainKey.Keys.data(), sizeof(NPDisk::TKey) * ev->Get()->MainKey.Keys.size());
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/blobstorage/ut_blobstorage/restart_pdisk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ Y_UNIT_TEST_SUITE(BSCRestartPDisk) {
if (serviceSetUpdate) {
const auto &pdisks = serviceSetUpdate->Record.GetServiceSet().GetPDisks();
const auto &pdisk = pdisks[0];
UNIT_ASSERT(pdisk.GetEntityStatus() == NKikimrBlobStorage::RESTART);
UNIT_ASSERT_EQUAL(NKikimrBlobStorage::RESTART, pdisk.GetEntityStatus());
gotServiceSetUpdate = true;
}
}
Expand Down

0 comments on commit 8cbb144

Please sign in to comment.