From 4ff1942a4d5b0929f856c3c827dbd395b32b60c8 Mon Sep 17 00:00:00 2001 From: kungurtsev Date: Sat, 27 Apr 2024 11:06:09 +0200 Subject: [PATCH] Fix DataShard BuildStats error handling (#4159) --- ydb/core/tx/datashard/datashard__stats.cpp | 18 ++++++- ydb/core/tx/datashard/datashard_impl.h | 26 ++++++++++ ydb/core/tx/datashard/datashard_ut_stats.cpp | 54 +++++++++++++++++++- 3 files changed, 96 insertions(+), 2 deletions(-) diff --git a/ydb/core/tx/datashard/datashard__stats.cpp b/ydb/core/tx/datashard/datashard__stats.cpp index d19a415740f4..a7e09d8ffe02 100644 --- a/ydb/core/tx/datashard/datashard__stats.cpp +++ b/ydb/core/tx/datashard/datashard__stats.cpp @@ -179,7 +179,6 @@ class TAsyncTableStatsBuilder : public TActorBootstrappedSender); + + auto msg = ev->Get(); + + LOG_ERROR_S(ctx, NKikimrServices::TX_DATASHARD, "Stats rebuilt error '" << msg->Message + << "', code: " << ui32(msg->Code) << ", datashard " << TabletID() << ", tableId " << msg->TableId); + + auto it = TableInfos.find(msg->TableId); + if (it != TableInfos.end()) { + it->second->StatsUpdateInProgress = false; + // if we have got an error, a compaction should have happened so restart build stats anyway + it->second->StatsNeedUpdate = true; + } +} class TDataShard::TTxInitiateStatsUpdate : public NTabletFlatExecutor::TTransactionBase { private: diff --git a/ydb/core/tx/datashard/datashard_impl.h b/ydb/core/tx/datashard/datashard_impl.h index b277cfe1ef43..7f5c4622e694 100644 --- a/ydb/core/tx/datashard/datashard_impl.h +++ b/ydb/core/tx/datashard/datashard_impl.h @@ -364,6 +364,7 @@ class TDataShard EvConfirmReadonlyLease, EvReadonlyLeaseConfirmation, EvPlanPredictedTxs, + EvTableStatsError, EvEnd }; @@ -400,6 +401,29 @@ class TDataShard ui64 SearchHeight = 0; }; + struct TEvTableStatsError : public TEventLocal { + enum class ECode { + FETCH_PAGE_FAILED, + RESOURCE_ALLOCATION_FAILED, + ACTOR_DIED, + UNKNOWN + }; + + TEvTableStatsError(ui64 tableId, ECode code, const TString& msg) + : TableId(tableId) + , Code(code) + , Message(msg) + {} + + TEvTableStatsError(ui64 tableId, ECode code) + : TEvTableStatsError(tableId, code, "") + {} + + const ui64 TableId; + const ECode Code; + const TString Message; + }; + struct TEvRemoveOldInReadSets : public TEventLocal {}; struct TEvRegisterScanActor : public TEventLocal { @@ -1247,6 +1271,7 @@ class TDataShard void Handle(TEvDataShard::TEvSplitPartitioningChanged::TPtr& ev, const TActorContext& ctx); void Handle(TEvDataShard::TEvGetTableStats::TPtr& ev, const TActorContext& ctx); void Handle(TEvPrivate::TEvAsyncTableStats::TPtr& ev, const TActorContext& ctx); + void Handle(TEvPrivate::TEvTableStatsError::TPtr& ev, const TActorContext& ctx); void Handle(TEvDataShard::TEvKqpScan::TPtr& ev, const TActorContext& ctx); void HandleSafe(TEvDataShard::TEvKqpScan::TPtr& ev, const TActorContext& ctx); void Handle(TEvDataShard::TEvUploadRowsRequest::TPtr& ev, const TActorContext& ctx); @@ -2936,6 +2961,7 @@ class TDataShard HFunc(TEvDataShard::TEvSplitPartitioningChanged, Handle); HFunc(TEvDataShard::TEvGetTableStats, Handle); HFunc(TEvPrivate::TEvAsyncTableStats, Handle); + HFunc(TEvPrivate::TEvTableStatsError, Handle); HFunc(TEvDataShard::TEvKqpScan, Handle); HFunc(TEvDataShard::TEvUploadRowsRequest, Handle); HFunc(TEvDataShard::TEvEraseRowsRequest, Handle); diff --git a/ydb/core/tx/datashard/datashard_ut_stats.cpp b/ydb/core/tx/datashard/datashard_ut_stats.cpp index f8907f5ccbde..780fd911652e 100644 --- a/ydb/core/tx/datashard/datashard_ut_stats.cpp +++ b/ydb/core/tx/datashard/datashard_ut_stats.cpp @@ -1,5 +1,4 @@ #include -#include "datashard_ut_common_kqp.h" #include "ydb/core/tablet_flat/shared_sausagecache.h" namespace NKikimr { @@ -378,6 +377,59 @@ Y_UNIT_TEST_SUITE(DataShardStats) { UNIT_ASSERT_LE(counters->ActiveBytes->Val(), 800*1024); // one index } + Y_UNIT_TEST(NoData) { + TPortManager pm; + TServerSettings serverSettings(pm.GetPort(2134)); + serverSettings.SetDomainName("Root") + .SetUseRealThreads(false); + + TServer::TPtr server = new TServer(serverSettings); + auto& runtime = *server->GetRuntime(); + auto sender = runtime.AllocateEdgeActor(); + + runtime.SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_TRACE); + runtime.SetLogPriority(NKikimrServices::TABLET_SAUSAGECACHE, NLog::PRI_TRACE); + + InitRoot(server, sender); + + auto [shards, tableId1] = CreateShardedTable(server, sender, "/Root", "table-1", 1); + const auto shard1 = GetTableShards(server, sender, "/Root/table-1").at(0); + + ExecSQL(server, sender, "UPSERT INTO `/Root/table-1` (key, value) VALUES (1, 1), (2, 2), (3, 3)"); + + bool captured = false; + auto observer = runtime.AddObserver([&](NSharedCache::TEvResult::TPtr& event) { + IActor *actor = runtime.FindActor(event->Recipient); + + Cerr << "Got SchemeShard NSharedCache::TEvResult from " << event->Sender << " to " << event->Recipient << "(" << actor->GetActivityType() << ")"<< Endl; + + if (actor && actor->GetActivityType() == 288) { + auto& message = *event->Get(); + event.Reset(static_cast *>( + new IEventHandle(event->Recipient, event->Sender, + new NSharedCache::TEvResult(message.Origin, message.Cookie, NKikimrProto::NODATA)))); + captured = true; + } + }); + + CompactTable(runtime, shard1, tableId1, false); + + for (int i = 0; i < 5 && !captured; ++i) { + TDispatchOptions options; + options.CustomFinalCondition = [&]() { return captured; }; + runtime.DispatchEvents(options, TDuration::Seconds(5)); + } + observer.Remove(); + + { + Cerr << "Waiting stats.." << Endl; + auto stats = WaitTableStats(runtime, 1); + UNIT_ASSERT_VALUES_EQUAL(stats.GetDatashardId(), shard1); + UNIT_ASSERT_VALUES_EQUAL(stats.GetTableStats().GetRowCount(), 3); + UNIT_ASSERT_VALUES_EQUAL(stats.GetTableStats().GetPartCount(), 1); + } + } + } // Y_UNIT_TEST_SUITE(DataShardStats) } // namespace NKikimr