Skip to content

Commit

Permalink
[#7126] Fix YbAdminSnapshotScheduleTest.UndeleteIndex
Browse files Browse the repository at this point in the history
Summary:
Fixes issues uncovered by YbAdminSnapshotScheduleTest.UndeleteIndex test.
1) DeleteTableInMemory could be called multiple times in the case of the index table.
   There is a check that just does noop when the table was already deleted.
   Adjusted this check to do the same when the table is being hidden.
2) Don't remove the table from names map during delete, when it was previously hidden.
   Otherwise, it would crash with fatal during cleanup.
3) DeleteTabletListAndSendRequests executes delete on tablet before commiting tablet info changes.
   As a result tablet could be deleted before and callback called, before info changes in memory.
   So table would hang in delete state. Because callback would think that tablet is not being deleted.
4) Decreased log flooding when compactions are being enabled in RocksDB.
   When compactions are being enabled we call SetOptions twice for each RocksDB, and each of them dumps all current options values.
   So while we have regular and intents DB we have 4 dumps of all rocksdb options.

Also added debug logging to `RWCLock::WriteLock()`, when it takes a too long time to acquire this lock, it would log the stack trace of the successful write lock.

Test Plan: ybd --gtest_filter YbAdminSnapshotScheduleTest.UndeleteIndex -n 20

Reviewers: bogdan

Reviewed By: bogdan

Subscribers: amitanand, ybase

Differential Revision: https://phabricator.dev.yugabyte.com/D11614
  • Loading branch information
spolitov committed May 22, 2021
1 parent f6b271e commit 5fc9ce1
Show file tree
Hide file tree
Showing 33 changed files with 432 additions and 248 deletions.
62 changes: 32 additions & 30 deletions ent/src/yb/master/async_snapshot_tasks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ using tserver::TabletServerErrorPB;
// AsyncTabletSnapshotOp
////////////////////////////////////////////////////////////

namespace {

std::string SnapshotIdToString(const std::string& snapshot_id) {
auto uuid = TryFullyDecodeTxnSnapshotId(snapshot_id);
return uuid.IsNil() ? snapshot_id : uuid.ToString();
}

}

AsyncTabletSnapshotOp::AsyncTabletSnapshotOp(Master *master,
ThreadPool* callback_pool,
const scoped_refptr<TabletInfo>& tablet,
Expand All @@ -52,8 +61,9 @@ AsyncTabletSnapshotOp::AsyncTabletSnapshotOp(Master *master,
}

string AsyncTabletSnapshotOp::description() const {
return Format("$0 Tablet Snapshot Operation $1 RPC",
*tablet_, tserver::TabletSnapshotOpRequestPB::Operation_Name(operation_));
return Format("$0 Tablet Snapshot Operation $1 RPC $2",
*tablet_, tserver::TabletSnapshotOpRequestPB::Operation_Name(operation_),
SnapshotIdToString(snapshot_id_));
}

TabletId AsyncTabletSnapshotOp::tablet_id() const {
Expand All @@ -64,43 +74,36 @@ TabletServerId AsyncTabletSnapshotOp::permanent_uuid() const {
return target_ts_desc_ != nullptr ? target_ts_desc_->permanent_uuid() : "";
}

bool AsyncTabletSnapshotOp::RetryAllowed(TabletServerErrorPB::Code code, const Status& status) {
switch (code) {
case TabletServerErrorPB::TABLET_NOT_FOUND:
return false;
case TabletServerErrorPB::INVALID_SNAPSHOT:
return operation_ != tserver::TabletSnapshotOpRequestPB::RESTORE_ON_TABLET;
default:
return TransactionError(status) != TransactionErrorCode::kSnapshotTooOld;
}
}

void AsyncTabletSnapshotOp::HandleResponse(int attempt) {
server::UpdateClock(resp_, master_->clock());

if (resp_.has_error()) {
Status status = StatusFromPB(resp_.error().status());

// Do not retry on a fatal error.
switch (resp_.error().code()) {
case TabletServerErrorPB::TABLET_NOT_FOUND:
LOG(WARNING) << "TS " << permanent_uuid() << ": snapshot failed for tablet "
<< tablet_->ToString() << " no further retry: " << status;
TransitionToCompleteState();
break;
case TabletServerErrorPB::INVALID_SNAPSHOT:
LOG(WARNING) << "TS " << permanent_uuid() << ": snapshot failed for tablet "
<< tablet_->ToString() << ": " << status;
if (operation_ == tserver::TabletSnapshotOpRequestPB::RESTORE_ON_TABLET) {
LOG(WARNING) << "No further retry for RESTORE snapshot operation: " << status;
TransitionToCompleteState();
}
break;
default:
LOG(WARNING) << "TS " << permanent_uuid() << ": snapshot failed for tablet "
<< tablet_->ToString() << ": " << status;
if (TransactionError(status) == TransactionErrorCode::kSnapshotTooOld) {
TransitionToCompleteState();
}
break;
if (!RetryAllowed(resp_.error().code(), status)) {
LOG_WITH_PREFIX(WARNING) << "Failed, NO retry: " << status;
TransitionToCompleteState();
} else {
LOG_WITH_PREFIX(WARNING) << "Failed, will be retried: " << status;
}
} else {
TransitionToCompleteState();
VLOG(1) << "TS " << permanent_uuid() << ": snapshot complete on tablet "
<< tablet_->ToString();
VLOG_WITH_PREFIX(1) << "Complete";
}

if (state() != MonitoredTaskState::kComplete) {
VLOG(1) << "TabletSnapshotOp task is not completed";
VLOG_WITH_PREFIX(1) << "TabletSnapshotOp task is not completed";
return;
}

Expand Down Expand Up @@ -162,9 +165,8 @@ bool AsyncTabletSnapshotOp::SendRequest(int attempt) {
req.set_propagated_hybrid_time(master_->clock()->Now().ToUint64());

ts_backup_proxy_->TabletSnapshotOpAsync(req, &resp_, &rpc_, BindRpcCallback());
VLOG(1) << "Send tablet snapshot request " << operation_ << " to " << permanent_uuid()
<< " (attempt " << attempt << "):\n"
<< req.DebugString();
VLOG_WITH_PREFIX(1) << "Sent to " << permanent_uuid() << " (attempt " << attempt << "): "
<< (VLOG_IS_ON(4) ? req.ShortDebugString() : "");
return true;
}

Expand Down
3 changes: 2 additions & 1 deletion ent/src/yb/master/async_snapshot_tasks.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,9 @@ class AsyncTabletSnapshotOp : public enterprise::RetryingTSRpcTask {
void HandleResponse(int attempt) override;
bool SendRequest(int attempt) override;
void Finished(const Status& status) override;
bool RetryAllowed(tserver::TabletServerErrorPB::Code code, const Status& status);

scoped_refptr<TabletInfo> tablet_;
TabletInfoPtr tablet_;
const std::string snapshot_id_;
tserver::TabletSnapshotOpRequestPB::Operation operation_;
SnapshotScheduleId snapshot_schedule_id_ = SnapshotScheduleId::Nil();
Expand Down
5 changes: 5 additions & 0 deletions ent/src/yb/master/catalog_manager_ent.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,11 @@ Status CatalogManager::RestoreSysCatalog(SnapshotScheduleRestoration* restoratio
// Load objects to restore and determine obsolete objects.
RestoreSysCatalogState state(restoration);
RETURN_NOT_OK(state.LoadObjects(schema(), doc_db));
{
SharedLock lock(mutex_);
RETURN_NOT_OK(state.PatchVersions(*table_ids_map_));
}
RETURN_NOT_OK(state.DetermineEntries());
{
auto existing = VERIFY_RESULT(CollectEntriesForSnapshot(restoration->filter.tables().tables()));
RETURN_NOT_OK(state.DetermineObsoleteObjects(existing));
Expand Down
15 changes: 14 additions & 1 deletion ent/src/yb/master/restore_sys_catalog_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,20 @@ Status RestoreSysCatalogState::LoadObjects(const Schema& schema, const docdb::Do
RETURN_NOT_OK(IterateSysCatalog(schema, doc_db, &namespaces_));
RETURN_NOT_OK(IterateSysCatalog(schema, doc_db, &tables_));
RETURN_NOT_OK(IterateSysCatalog(schema, doc_db, &tablets_));
return DetermineEntries();
return Status::OK();
}

Status RestoreSysCatalogState::PatchVersions(const TableInfoMap& tables) {
for (auto& id_and_pb : tables_) {
auto it = tables.find(id_and_pb.first);
if (it == tables.end()) {
return STATUS_FORMAT(NotFound, "Not found restoring table: $0", id_and_pb.first);
}

// Force schema update after restoration.
id_and_pb.second.set_version(it->second->LockForRead()->pb.version() + 1);
}
return Status::OK();
}

Status RestoreSysCatalogState::DetermineObsoleteObjects(const SysRowEntries& existing) {
Expand Down
14 changes: 12 additions & 2 deletions ent/src/yb/master/restore_sys_catalog_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,28 @@ class RestoreSysCatalogState {
public:
explicit RestoreSysCatalogState(SnapshotScheduleRestoration* restoration);

// Load objects from DB snapshot.
CHECKED_STATUS LoadObjects(const Schema& schema, const docdb::DocDB& doc_db);

// Patch table versions, so restored tables will have greater schema version to force schema
// update.
CHECKED_STATUS PatchVersions(const TableInfoMap& tables);

// Determine entries that should be restored. I.e. apply filter and serialize.
CHECKED_STATUS DetermineEntries();

// Determine objects that should be removed, i.e. was created after restoration time.
CHECKED_STATUS DetermineObsoleteObjects(const SysRowEntries& existing);

// Prepare write batch with object changes.
CHECKED_STATUS PrepareWriteBatch(const Schema& schema, docdb::DocWriteBatch* write_batch);

// Prepare write batch to delete obsolete tablet.
CHECKED_STATUS PrepareTabletCleanup(
const TabletId& id, SysTabletsEntryPB pb, const Schema& schema,
docdb::DocWriteBatch* write_batch);

// Prepare write batch to delete obsolete table.
CHECKED_STATUS PrepareTableCleanup(
const TableId& id, SysTablesEntryPB pb, const Schema& schema,
docdb::DocWriteBatch* write_batch);
Expand All @@ -62,8 +74,6 @@ class RestoreSysCatalogState {
CHECKED_STATUS IterateSysCatalog(
const Schema& schema, const docdb::DocDB& doc_db, std::unordered_map<std::string, PB>* map);

CHECKED_STATUS DetermineEntries();

Result<bool> MatchTable(const TableId& id, const SysTablesEntryPB& table);
Result<bool> TableMatchesIdentifier(
const TableId& id, const SysTablesEntryPB& table, const TableIdentifierPB& table_identifier);
Expand Down
12 changes: 8 additions & 4 deletions ent/src/yb/tools/yb-admin_client_ent.cc
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,8 @@ Result<TxnSnapshotId> ClusterAdminClient::SuitableSnapshotId(
req.set_snapshot_schedule_id(schedule_id.data(), schedule_id.size());
}

RETURN_NOT_OK(master_backup_proxy_->ListSnapshotSchedules(req, &resp, &rpc));
RETURN_NOT_OK_PREPEND(master_backup_proxy_->ListSnapshotSchedules(req, &resp, &rpc),
"Failed to list snapshot schedules");

if (resp.has_error()) {
return StatusFromPB(resp.error().status());
Expand All @@ -446,7 +447,8 @@ Result<TxnSnapshotId> ClusterAdminClient::SuitableSnapshotId(
master::CreateSnapshotRequestPB req;
master::CreateSnapshotResponsePB resp;
req.set_schedule_id(schedule_id.data(), schedule_id.size());
RETURN_NOT_OK(master_backup_proxy_->CreateSnapshot(req, &resp, &rpc));
RETURN_NOT_OK_PREPEND(master_backup_proxy_->CreateSnapshot(req, &resp, &rpc),
"Failed to create snapshot");
if (resp.has_error()) {
auto status = StatusFromPB(resp.error().status());
if (master::MasterError(status) == master::MasterErrorPB::PARALLEL_SNAPSHOT_OPERATION) {
Expand All @@ -471,7 +473,8 @@ Result<rapidjson::Document> ClusterAdminClient::RestoreSnapshotSchedule(
master::ListSnapshotsRequestPB req;
req.set_snapshot_id(snapshot_id.data(), snapshot_id.size());
master::ListSnapshotsResponsePB resp;
RETURN_NOT_OK(master_backup_proxy_->ListSnapshots(req, &resp, &rpc));
RETURN_NOT_OK_PREPEND(master_backup_proxy_->ListSnapshots(req, &resp, &rpc),
"Failed to list snapshots");
if (resp.has_error()) {
return StatusFromPB(resp.error().status());
}
Expand Down Expand Up @@ -500,7 +503,8 @@ Result<rapidjson::Document> ClusterAdminClient::RestoreSnapshotSchedule(
RestoreSnapshotResponsePB resp;
req.set_snapshot_id(snapshot_id.data(), snapshot_id.size());
req.set_restore_ht(restore_at.ToUint64());
RETURN_NOT_OK(master_backup_proxy_->RestoreSnapshot(req, &resp, &rpc));
RETURN_NOT_OK_PREPEND(master_backup_proxy_->RestoreSnapshot(req, &resp, &rpc),
"Failed to restore snapshot");

if (resp.has_error()) {
return StatusFromPB(resp.error().status());
Expand Down
3 changes: 2 additions & 1 deletion ent/src/yb/tserver/backup_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ void TabletServiceBackupImpl::TabletSnapshotOp(const TabletSnapshotOpRequestPB*
TRACE_EVENT1("tserver", "TabletSnapshotOp", "tablet_id: ", tablet_id);

LOG(INFO) << "Processing TabletSnapshotOp for tablet " << tablet_id << " from "
<< context.requestor_string() << ": " << req->operation();
<< context.requestor_string() << ": "
<< TabletSnapshotOpRequestPB::Operation_Name(req->operation());
VLOG(1) << "Full request: " << req->DebugString();

auto tablet = LookupLeaderTabletOrRespond(tablet_manager_, tablet_id, resp, &context);
Expand Down
33 changes: 2 additions & 31 deletions src/yb/common/entity_ids.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,41 +18,12 @@
#include <set>
#include <utility>

#include "yb/common/entity_ids_types.h"

#include "yb/util/result.h"
#include "yb/util/strongly_typed_string.h"

namespace yb {

// TODO: switch many of these to opaque types for additional type safety and efficiency.

using NamespaceName = std::string;
using TableName = std::string;
using UDTypeName = std::string;
using RoleName = std::string;

using NamespaceId = std::string;
using TableId = std::string;
using UDTypeId = std::string;
using CDCStreamId = std::string;

using PeerId = std::string;
using SnapshotId = std::string;
using TabletServerId = PeerId;
using TabletId = std::string;
using TablegroupId = std::string;
using TablespaceId = std::string;

YB_STRONGLY_TYPED_STRING(KvStoreId);

// TODO(#79): switch to YB_STRONGLY_TYPED_STRING
using RaftGroupId = std::string;

using NamespaceIdTableNamePair = std::pair<NamespaceId, TableName>;

using FlushRequestId = std::string;

using RedisConfigKey = std::string;

static const uint32_t kPgSequencesDataTableOid = 0xFFFF;
static const uint32_t kPgSequencesDataDatabaseOid = 0xFFFF;

Expand Down
54 changes: 54 additions & 0 deletions src/yb/common/entity_ids_types.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations
// under the License.
//

#ifndef YB_COMMON_ENTITY_IDS_TYPES_H
#define YB_COMMON_ENTITY_IDS_TYPES_H

#include <string>

#include "yb/util/strongly_typed_string.h"

namespace yb {

// TODO: switch many of these to opaque types for additional type safety and efficiency.
using NamespaceName = std::string;
using TableName = std::string;
using UDTypeName = std::string;
using RoleName = std::string;

using NamespaceId = std::string;
using TableId = std::string;
using UDTypeId = std::string;
using CDCStreamId = std::string;

using PeerId = std::string;
using SnapshotId = std::string;
using TabletServerId = PeerId;
using TabletId = std::string;
using TablegroupId = std::string;
using TablespaceId = std::string;

YB_STRONGLY_TYPED_STRING(KvStoreId);

// TODO(#79): switch to YB_STRONGLY_TYPED_STRING
using RaftGroupId = std::string;

using NamespaceIdTableNamePair = std::pair<NamespaceId, TableName>;

using FlushRequestId = std::string;

using RedisConfigKey = std::string;

} // namespace yb

#endif // YB_COMMON_ENTITY_IDS_TYPES_H
Loading

0 comments on commit 5fc9ce1

Please sign in to comment.