From bb93ebed59b490d1ce1456df3c26415f651be188 Mon Sep 17 00:00:00 2001 From: Minghui Yang Date: Thu, 19 Sep 2024 20:55:56 +0000 Subject: [PATCH] [#24021] YSQL: Add --TEST_check_catalog_version_overflow Summary: The bug appeared in a recent integration test run and had the following symptom: In ./Universe_logs/172.151.31.81/tserver/yb-tserver.ip-172-151-31-81.us-west-2.compute.internal.yugabyte.log.INFO.20240918-192635.1116559 ``` W0918 19:33:33.059890 1123973 tablet_rpc.cc:497] Query error (yb/tserver/service_util.h:330): Failed Read(tablet: 00000000000000000000000000000000, num_ops: 1, num_attempts: 2, txn: 00000000-0000-0000-0000-000000000000, subtxn: [none]) to tablet 00000000000000000000000000000000 on tablet server { uuid: b7b95ea542c642998d053ebba298a46a private: [host: "172.151.31.81" port: 7100] cloud_info: placement_cloud: "aws" placement_region: "us-west-2" placement_zone: "us-west-2a" after 2 attempt(s): The catalog snapshot used for this transaction has been invalidated: expected: 18446744073709551615, got: 131: MISMATCHED_SCHEMA (tablet server error 5) ``` Note the last breaking catalog version is 18446744073709551615 (-1 in int64) which is unreasonably big. The version check is done by tserver, the expected last breaking catalog version comes from the map `ysql_db_catalog_version_map_` by using `db_oid` as the key. The map gets its value from the tserver-master heartbeat response where we find the contents of the table `pg_yb_catalog_version`. The new contents of `pg_yb_catalog_version` are merged with the existing `ysql_db_catalog_version_map_` where we only insert/update the map when the new version is greater than the existing value. I added a new gflag `--TEST_check_catalog_version_overflow`, when set to true, will crash the tserver if the new version read from the heartbeat response is unreasonably big (i.e., becomes negative when casted to int64_t). Similar debugging logic is added to the master side as well. When the contents of `pg_yb_catalog_version` are read by yb-master to prepare the heartbeat response, if the version read from the table `pg_yb_catalog_version` is unreasonably big, we crash the master process. Also added `GUARDED_BY(lock_)` and `EXCLUDES(lock_)` to a few relevant functions. It is expected that this `--TEST_check_catalog_version_overflow` gflag is enabled in the integration test which showed the bug. If the bug has a repro, we may have a better clue on where the number 18446744073709551615 comes from. Jira: DB-12909 Test Plan: Manual test (1) create a local cluster and start the cluster with the new test gflag set: ``` ./bin/yb-ctl create --rf 1 --tserver_flags TEST_check_catalog_version_overflow=true --master_flags TEST_check_catalog_version_overflow=true ``` (2) run the following commands: ``` yugabyte=# select * from pg_yb_catalog_version; db_oid | current_version | last_breaking_version --------+-----------------+----------------------- 1 | 1 | 1 13254 | 1 | 1 13255 | 1 | 1 13257 | 1 | 1 13258 | 1 | 1 (5 rows) yugabyte=# SET yb_non_ddl_txn_for_sys_tables_allowed=1; SET yugabyte=# update pg_yb_catalog_version set current_version = -1, last_breaking_version = -1 where db_oid = 13257; UPDATE 1 yugabyte=# \q ``` Look into the yb-master log directory and saw a FATAL: ``` F0919 20:44:08.961647 2712654 sys_catalog.cc:1063] Check failed: static_cast(current_version) >= 0 (-1 vs. 0) 18446744073709551615 db_oid: 13257 ``` (3) Repeat the above test with the master side changed as: ``` + if (FLAGS_TEST_check_catalog_version_overflow && false) { ``` so that we can see the tserver FATAL: ``` F0919 20:52:29.832093 2715720 tablet_server.cc:968] Check failed: static_cast(new_version) >= 0 (-1 vs. 0) 18446744073709551615 db_oid: 13257 db_catalog_version_data: db_catalog_versions { db_oid: 1 current_version: 1 last_breaking_version: 1 } db_catalog_versions { db_oid: 13254 current_version: 1 last_breaking_version: 1 } db_catalog_versions { db_oid: 13255 current_version: 1 last_breaking_version: 1 } db_catalog_versions { db_oid: 13257 current_version: 18446744073709551615 last_breaking_version: 18446744073709551615 } db_catalog_versions { db_oid: 13258 current_version: 1 last_breaking_version: 1 } ``` Reviewers: fizaa Reviewed By: fizaa Subscribers: ybase, yql Differential Revision: https://phorge.dev.yugabyte.com/D38240 --- src/yb/common/common_flags.cc | 3 +++ src/yb/common/common_flags.h | 1 + src/yb/master/sys_catalog.cc | 10 +++++++++- src/yb/tserver/tablet_server.cc | 8 ++++++++ src/yb/tserver/tablet_server.h | 20 +++++++++++--------- 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/yb/common/common_flags.cc b/src/yb/common/common_flags.cc index 72265407b761..b5f9eef1c518 100644 --- a/src/yb/common/common_flags.cc +++ b/src/yb/common/common_flags.cc @@ -143,6 +143,9 @@ DEFINE_NON_RUNTIME_string(placement_region, "datacenter1", DEFINE_NON_RUNTIME_string(placement_zone, "rack1", "The cloud availability zone in which this instance is started."); +DEFINE_test_flag(bool, check_catalog_version_overflow, false, + "Check whether received catalog version is unreasonably too big"); + namespace { constexpr const auto kMinRpcThrottleThresholdBytes = 16; diff --git a/src/yb/common/common_flags.h b/src/yb/common/common_flags.h index 07b799cfee31..ddfd25436529 100644 --- a/src/yb/common/common_flags.h +++ b/src/yb/common/common_flags.h @@ -28,6 +28,7 @@ DECLARE_bool(yb_enable_cdc_consistent_snapshot_streams); DECLARE_bool(ysql_yb_enable_replication_slot_consumption); DECLARE_uint32(wait_for_ysql_backends_catalog_version_client_master_rpc_margin_ms); DECLARE_bool(TEST_ysql_hide_catalog_version_increment_log); +DECLARE_bool(TEST_check_catalog_version_overflow); DECLARE_int32(ysql_clone_pg_schema_rpc_timeout_ms); DECLARE_bool(ysql_enable_auto_analyze_service); diff --git a/src/yb/master/sys_catalog.cc b/src/yb/master/sys_catalog.cc index 5f75671a5693..12b8512cca86 100644 --- a/src/yb/master/sys_catalog.cc +++ b/src/yb/master/sys_catalog.cc @@ -145,6 +145,7 @@ METRIC_DEFINE_counter( DECLARE_bool(create_initial_sys_catalog_snapshot); DECLARE_int32(master_discovery_timeout_ms); DECLARE_int32(retryable_request_timeout_secs); +DECLARE_bool(TEST_check_catalog_version_overflow); DEFINE_UNKNOWN_int32(sys_catalog_write_timeout_ms, 60000, "Timeout for writes into system catalog"); DEFINE_UNKNOWN_uint64(copy_tables_batch_bytes, 500_KB, @@ -1053,12 +1054,19 @@ Status SysCatalogTable::ReadYsqlDBCatalogVersionImplWithReadTime( } if (versions) { // When 'versions' is set we read all rows. + const uint32_t db_oid = db_oid_value->uint32_value(); const uint64_t current_version = static_cast(version_col_value->int64_value()); const uint64_t last_breaking_version = static_cast(last_breaking_version_col_value->int64_value()); + if (FLAGS_TEST_check_catalog_version_overflow) { + CHECK_GE(static_cast(current_version), 0) + << current_version << " db_oid: " << db_oid; + CHECK_GE(static_cast(last_breaking_version), 0) + << last_breaking_version << " db_oid: " << db_oid; + } auto insert_result = versions->insert( - std::make_pair(db_oid_value->uint32_value(), PgCatalogVersion{ + std::make_pair(db_oid, PgCatalogVersion{ .current_version = current_version, .last_breaking_version = last_breaking_version})); // There should not be any duplicate db_oid because it is a primary key. diff --git a/src/yb/tserver/tablet_server.cc b/src/yb/tserver/tablet_server.cc index ef3361a366b5..ec41ee58c621 100644 --- a/src/yb/tserver/tablet_server.cc +++ b/src/yb/tserver/tablet_server.cc @@ -964,6 +964,14 @@ void TabletServer::SetYsqlDBCatalogVersions( const uint32_t db_oid = db_catalog_version.db_oid(); const uint64_t new_version = db_catalog_version.current_version(); const uint64_t new_breaking_version = db_catalog_version.last_breaking_version(); + if (FLAGS_TEST_check_catalog_version_overflow) { + CHECK_GE(static_cast(new_version), 0) + << new_version << " db_oid: " << db_oid + << " db_catalog_version_data: " << db_catalog_version_data.ShortDebugString(); + CHECK_GE(static_cast(new_breaking_version), 0) + << new_breaking_version << " db_oid: " << db_oid + << " db_catalog_version_data: " << db_catalog_version_data.ShortDebugString(); + } if (!db_oid_set.insert(db_oid).second) { LOG(DFATAL) << "Ignoring duplicate db oid " << db_oid; continue; diff --git a/src/yb/tserver/tablet_server.h b/src/yb/tserver/tablet_server.h index ea8d6cf05d83..0a89f3a2402a 100644 --- a/src/yb/tserver/tablet_server.h +++ b/src/yb/tserver/tablet_server.h @@ -234,11 +234,12 @@ class TabletServer : public DbServerBase, public TabletServerIf { return publish_service_ptr_.get(); } - void SetYsqlCatalogVersion(uint64_t new_version, uint64_t new_breaking_version); - void SetYsqlDBCatalogVersions(const master::DBCatalogVersionDataPB& db_catalog_version_data); + void SetYsqlCatalogVersion(uint64_t new_version, uint64_t new_breaking_version) EXCLUDES(lock_); + void SetYsqlDBCatalogVersions(const master::DBCatalogVersionDataPB& db_catalog_version_data) + EXCLUDES(lock_); void get_ysql_catalog_version(uint64_t* current_version, - uint64_t* last_breaking_version) const override { + uint64_t* last_breaking_version) const EXCLUDES(lock_) override { std::lock_guard l(lock_); if (current_version) { *current_version = ysql_catalog_version_; @@ -248,9 +249,10 @@ class TabletServer : public DbServerBase, public TabletServerIf { } } - void get_ysql_db_catalog_version(uint32_t db_oid, - uint64_t* current_version, - uint64_t* last_breaking_version) const override { + void get_ysql_db_catalog_version( + uint32_t db_oid, + uint64_t* current_version, + uint64_t* last_breaking_version) const EXCLUDES(lock_) override { std::lock_guard l(lock_); auto it = ysql_db_catalog_version_map_.find(db_oid); bool not_found = it == ysql_db_catalog_version_map_.end(); @@ -267,14 +269,14 @@ class TabletServer : public DbServerBase, public TabletServerIf { } } - std::optional catalog_version_table_in_perdb_mode() const { + std::optional catalog_version_table_in_perdb_mode() const EXCLUDES(lock_) { std::lock_guard l(lock_); return catalog_version_table_in_perdb_mode_; } Status get_ysql_db_oid_to_cat_version_info_map( const tserver::GetTserverCatalogVersionInfoRequestPB& req, - tserver::GetTserverCatalogVersionInfoResponsePB* resp) const override; + tserver::GetTserverCatalogVersionInfoResponsePB* resp) const EXCLUDES(lock_) override; void UpdateTransactionTablesVersion(uint64_t new_version); @@ -468,7 +470,7 @@ class TabletServer : public DbServerBase, public TabletServerIf { // Latest known version from the YSQL catalog (as reported by last heartbeat response). uint64_t ysql_catalog_version_ = 0; uint64_t ysql_last_breaking_catalog_version_ = 0; - tserver::DbOidToCatalogVersionInfoMap ysql_db_catalog_version_map_; + tserver::DbOidToCatalogVersionInfoMap ysql_db_catalog_version_map_ GUARDED_BY(lock_); // See same variable comments in CatalogManager. std::optional catalog_version_table_in_perdb_mode_{std::nullopt};