From da4da459e07280239f5867118aa30f515baed25c Mon Sep 17 00:00:00 2001 From: Amitanand Aiyer Date: Mon, 9 Sep 2024 10:05:46 +0800 Subject: [PATCH] [#23399] DocDB: Fix StopActiveTxnsPriorTo from using local variables in the call back Summary: use a shared_ptr instead of using variables on the stack, to handle the case where the callback may run after the function has exited (due to timeout) Jira: DB-12318 Test Plan: yb_build.sh --cxx-test yb-admin-snapshot-schedule-test --gtest_filter YbAdminSnapshotScheduleTestWithYsql.TransactionDuringPITRRepro23399 Reviewers: asrivastava Reviewed By: asrivastava Subscribers: ybase Differential Revision: https://phorge.dev.yugabyte.com/D37883 --- src/yb/tablet/transaction_participant.cc | 53 ++++++++++--------- .../tools/yb-admin-snapshot-schedule-test.cc | 22 +++++++- 2 files changed, 49 insertions(+), 26 deletions(-) diff --git a/src/yb/tablet/transaction_participant.cc b/src/yb/tablet/transaction_participant.cc index a1e6079794ae..78ba97bcf340 100644 --- a/src/yb/tablet/transaction_participant.cc +++ b/src/yb/tablet/transaction_participant.cc @@ -110,6 +110,8 @@ DEFINE_RUNTIME_AUTO_bool(cdc_write_post_apply_metadata, kLocalPersisted, false, DEFINE_RUNTIME_bool(cdc_immediate_transaction_cleanup, true, "Clean up transactions from memory after apply, even if its changes have not yet been " "streamed by CDC."); +DEFINE_test_flag(int32, stopactivetxns_sleep_in_abort_cb_ms, 0, + "Delays the abort callback in StopActiveTxns to repro GitHub #23399."); DECLARE_int64(transaction_abort_check_timeout_ms); @@ -1305,33 +1307,34 @@ class TransactionParticipant::Impl // It is ok to attempt to abort txns that have committed. We don't care // if our request succeeds or not. - CountDownLatch latch(ids_to_abort.size()); - std::atomic failed{false}; - Status return_status = Status::OK(); + struct CallbackInfo { + CountDownLatch latch{0}; + StatusHolder return_status; + }; + auto cb_info = std::make_shared(); + cb_info->latch.Reset(ids_to_abort.size()); for (const auto& id : ids_to_abort) { - Abort( - id, [this, id, &failed, &return_status, &latch](Result result) { - VLOG_WITH_PREFIX(2) << "Aborting " << id << " got " << result; - if (!result || - (result->status != TransactionStatus::COMMITTED && result->status != ABORTED)) { - LOG(INFO) << "Could not abort " << id << " got " << result; - - bool expected = false; - if (failed.compare_exchange_strong(expected, true)) { - if (!result) { - return_status = result.status(); - } else { - return_status = - STATUS_FORMAT(IllegalState, "Wrong status after abort: $0", result->status); - } - } - } - latch.CountDown(); - }); + // Do not pass anything by reference, as the callback can outlive this function. + Abort(id, [this, id, cb_info](Result result) { + VLOG_WITH_PREFIX(2) << "Aborting " << id << " got " << result; + if (FLAGS_TEST_stopactivetxns_sleep_in_abort_cb_ms > 0) { + VLOG(2) << "Sleeping for " << FLAGS_TEST_stopactivetxns_sleep_in_abort_cb_ms << "ms."; + SleepFor(MonoDelta::FromMilliseconds(FLAGS_TEST_stopactivetxns_sleep_in_abort_cb_ms)); + } + if (!result || + (result->status != TransactionStatus::COMMITTED && result->status != ABORTED)) { + LOG(INFO) << "Could not abort " << id << " got " << result; + cb_info->return_status.SetError( + !result + ? result.status() + : STATUS_FORMAT(IllegalState, "Wrong status after abort: $0", result->status)); + } + cb_info->latch.CountDown(); + }); } - - return latch.WaitUntil(deadline) ? return_status - : STATUS(TimedOut, "TimedOut while aborting old transactions"); + return cb_info->latch.WaitUntil(deadline) + ? cb_info->return_status.GetStatus() + : STATUS(TimedOut, "TimedOut while aborting old transactions"); } Result WaitForSafeTime(HybridTime safe_time, CoarseTimePoint deadline) { diff --git a/src/yb/tools/yb-admin-snapshot-schedule-test.cc b/src/yb/tools/yb-admin-snapshot-schedule-test.cc index 3c6b763d8b64..89fbfd818ba2 100644 --- a/src/yb/tools/yb-admin-snapshot-schedule-test.cc +++ b/src/yb/tools/yb-admin-snapshot-schedule-test.cc @@ -593,6 +593,7 @@ class YbAdminSnapshotScheduleTestWithYsql : public YbAdminSnapshotScheduleTest { } void TestPgsqlDropDefault(); + void TestTransactionDuringPITR(); }; class YbAdminSnapshotScheduleTestWithYsqlAndPackedRow : public YbAdminSnapshotScheduleTestWithYsql { @@ -5418,7 +5419,7 @@ TEST_F_EX(YbAdminSnapshotScheduleTest, CreateDuplicateSchedules, "already exists for the given keyspace ysql." + kTableName.namespace_name()); } -TEST_F(YbAdminSnapshotScheduleTestWithYsql, TransactionDuringPITR) { +void YbAdminSnapshotScheduleTestWithYsql::TestTransactionDuringPITR() { ASSERT_OK(PrepareCommon()); std::string db_name = "test_db_name"; std::string table_name = "test_table"; @@ -5441,6 +5442,25 @@ TEST_F(YbAdminSnapshotScheduleTestWithYsql, TransactionDuringPITR) { ASSERT_EQ(row_value, 3); } +TEST_F(YbAdminSnapshotScheduleTestWithYsql, TransactionDuringPITR) { + TestTransactionDuringPITR(); +} + +class YbAdminSnapshotScheduleTestWithYsqlRepro23399 : public YbAdminSnapshotScheduleTestWithYsql { + public: + std::vector ExtraTSFlags() override { + return { + "--TEST_stopactivetxns_sleep_in_abort_cb_ms=500" + }; + } +}; + +TEST_F_EX( + YbAdminSnapshotScheduleTestWithYsql, TransactionDuringPITRRepro23399, + YbAdminSnapshotScheduleTestWithYsqlRepro23399) { + TestTransactionDuringPITR(); +} + class YbAdminSnapshotScheduleTestWithYsqlTransactionalDDL : public YbAdminSnapshotScheduleTestWithYsql { public: