Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow backup on master #4699

Merged
merged 2 commits into from
Mar 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 72 additions & 72 deletions go/vt/proto/query/query.pb.go

Large diffs are not rendered by default.

465 changes: 237 additions & 228 deletions go/vt/proto/tabletmanagerdata/tabletmanagerdata.pb.go

Large diffs are not rendered by default.

106 changes: 53 additions & 53 deletions go/vt/proto/vtgate/vtgate.pb.go

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion go/vt/schemamanager/schemaswap/schema_swap.go
Original file line number Diff line number Diff line change
Expand Up @@ -1092,7 +1092,7 @@ func (shardSwap *shardSchemaSwap) takeSeedBackup() (err error) {
}

shardSwap.addShardLog(fmt.Sprintf("Taking backup on the seed tablet %v", seedTablet.Alias))
eventStream, err := shardSwap.parent.tabletClient.Backup(shardSwap.parent.ctx, seedTablet, *backupConcurrency)
eventStream, err := shardSwap.parent.tabletClient.Backup(shardSwap.parent.ctx, seedTablet, *backupConcurrency, false)
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vtcombo/tablet_map.go
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ func (itmc *internalTabletManagerClient) PromoteSlave(ctx context.Context, table
return "", fmt.Errorf("not implemented in vtcombo")
}

func (itmc *internalTabletManagerClient) Backup(ctx context.Context, tablet *topodatapb.Tablet, concurrency int) (logutil.EventStream, error) {
// Backup is part of the tmclient.TabletManagerClient interface.
// vtcombo's internal tablet manager client does not support backups; the
// allowMaster flag is accepted only to satisfy the interface signature.
func (itmc *internalTabletManagerClient) Backup(ctx context.Context, tablet *topodatapb.Tablet, concurrency int, allowMaster bool) (logutil.EventStream, error) {
	return nil, fmt.Errorf("not implemented in vtcombo")
}

Expand Down
33 changes: 25 additions & 8 deletions go/vt/vtctl/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func init() {
addCommand("Shards", command{
"BackupShard",
commandBackupShard,
"<keyspace/shard>",
"[-allow_master=false] <keyspace/shard>",
"Chooses a tablet and creates a backup for a shard."})
addCommand("Shards", command{
"RemoveBackup",
Expand All @@ -50,7 +50,7 @@ func init() {
addCommand("Tablets", command{
"Backup",
commandBackup,
"[-concurrency=4] <tablet alias>",
"[-concurrency=4] [-allow_master=false] <tablet alias>",
"Stops mysqld and uses the BackupStorage service to store a new backup. This function also remembers if the tablet was replicating so that it can restore the same state after the backup completes."})
addCommand("Tablets", command{
"RestoreFromBackup",
Expand All @@ -61,6 +61,8 @@ func init() {

func commandBackup(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error {
concurrency := subFlags.Int("concurrency", 4, "Specifies the number of compression/checksum jobs to run simultaneously")
allowMaster := subFlags.Bool("allow_master", false, "Allows backups to be taken on master. Warning!! If you are using the builtin backup engine, this will shutdown your master mysql for as long as it takes to create a backup ")

if err := subFlags.Parse(args); err != nil {
return err
}
Expand All @@ -77,11 +79,13 @@ func commandBackup(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.Fl
return err
}

return execBackup(ctx, wr, tabletInfo.Tablet, *concurrency)
return execBackup(ctx, wr, tabletInfo.Tablet, *concurrency, *allowMaster)
}

func commandBackupShard(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error {
concurrency := subFlags.Int("concurrency", 4, "Specifies the number of compression/checksum jobs to run simultaneously")
allowMaster := subFlags.Bool("allow_master", false, "Whether to use master tablet for backup. Warning!! If you are using the builtin backup engine, this will shutdown your master mysql for as long as it takes to create a backup ")

if err := subFlags.Parse(args); err != nil {
return err
}
Expand All @@ -103,13 +107,12 @@ func commandBackupShard(ctx context.Context, wr *wrangler.Wrangler, subFlags *fl
var secondsBehind uint32

for i := range tablets {
// only run a backup on a replica, rdonly or spare tablet type
// find a replica, rdonly or spare tablet type to run the backup on
switch tablets[i].Type {
case topodatapb.TabletType_REPLICA, topodatapb.TabletType_RDONLY, topodatapb.TabletType_SPARE:
default:
continue
}

// choose the first tablet as the baseline
if tabletForBackup == nil {
tabletForBackup = tablets[i].Tablet
Expand All @@ -124,16 +127,30 @@ func commandBackupShard(ctx context.Context, wr *wrangler.Wrangler, subFlags *fl
}
}

// Fall back to the master only when no replica, rdonly, or spare tablet was
// found above AND the operator explicitly opted in with -allow_master.
if tabletForBackup == nil && *allowMaster {
	for i := range tablets {
		if tablets[i].Type == topodatapb.TabletType_MASTER {
			tabletForBackup = tablets[i].Tablet
			// The master is the replication source, so it has no lag.
			secondsBehind = 0
			// NOTE: the original used `break` inside a switch, which only
			// terminated the switch — not this loop. A plain if lets break
			// actually stop scanning once a master is found.
			break
		}
	}
}

if tabletForBackup == nil {
return errors.New("no tablet available for backup")
}

return execBackup(ctx, wr, tabletForBackup, *concurrency)
return execBackup(ctx, wr, tabletForBackup, *concurrency, *allowMaster)
}

// execBackup is shared by Backup and BackupShard
func execBackup(ctx context.Context, wr *wrangler.Wrangler, tablet *topodatapb.Tablet, concurrency int) error {
stream, err := wr.TabletManagerClient().Backup(ctx, tablet, concurrency)
func execBackup(ctx context.Context, wr *wrangler.Wrangler, tablet *topodatapb.Tablet, concurrency int, allowMaster bool) error {
stream, err := wr.TabletManagerClient().Backup(ctx, tablet, concurrency, allowMaster)
if err != nil {
return err
}
Expand Down
8 changes: 5 additions & 3 deletions go/vt/vttablet/agentrpctest/test_agent_rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -1156,21 +1156,23 @@ func agentRPCTestPromoteSlavePanic(ctx context.Context, t *testing.T, client tmc
//

var testBackupConcurrency = 24
var testBackupAllowMaster = false
var testBackupCalled = false
var testRestoreFromBackupCalled = false

func (fra *fakeRPCAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error {
// Backup is part of the RPCAgent interface. This fake implementation checks
// that the concurrency and allowMaster arguments arrive unchanged across the
// RPC boundary, emits a few log events, and records that it was invoked so
// the test driver can assert the call happened.
func (fra *fakeRPCAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger, allowMaster bool) error {
	if fra.panics {
		panic(fmt.Errorf("test-triggered panic"))
	}
	compare(fra.t, "Backup args", concurrency, testBackupConcurrency)
	compare(fra.t, "Backup args", allowMaster, testBackupAllowMaster)
	logStuff(logger, 10)
	testBackupCalled = true
	return nil
}

func agentRPCTestBackup(ctx context.Context, t *testing.T, client tmclient.TabletManagerClient, tablet *topodatapb.Tablet) {
stream, err := client.Backup(ctx, tablet, testBackupConcurrency)
stream, err := client.Backup(ctx, tablet, testBackupConcurrency, testBackupAllowMaster)
if err != nil {
t.Fatalf("Backup failed: %v", err)
}
Expand All @@ -1179,7 +1181,7 @@ func agentRPCTestBackup(ctx context.Context, t *testing.T, client tmclient.Table
}

func agentRPCTestBackupPanic(ctx context.Context, t *testing.T, client tmclient.TabletManagerClient, tablet *topodatapb.Tablet) {
stream, err := client.Backup(ctx, tablet, testBackupConcurrency)
stream, err := client.Backup(ctx, tablet, testBackupConcurrency, testBackupAllowMaster)
if err != nil {
t.Fatalf("Backup failed: %v", err)
}
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/faketmclient/fake_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ func (e *eofEventStream) Recv() (*logutilpb.Event, error) {
}

// Backup is part of the tmclient.TabletManagerClient interface.
func (client *FakeTabletManagerClient) Backup(ctx context.Context, tablet *topodatapb.Tablet, concurrency int) (logutil.EventStream, error) {
func (client *FakeTabletManagerClient) Backup(ctx context.Context, tablet *topodatapb.Tablet, concurrency int, allowMaster bool) (logutil.EventStream, error) {
	// The fake performs no work: it hands back an eofEventStream, so callers
	// observe an event stream that ends immediately with no error.
	return &eofEventStream{}, nil
}

Expand Down
3 changes: 2 additions & 1 deletion go/vt/vttablet/grpctmclient/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -775,14 +775,15 @@ func (e *backupStreamAdapter) Recv() (*logutilpb.Event, error) {
}

// Backup is part of the tmclient.TabletManagerClient interface.
func (client *Client) Backup(ctx context.Context, tablet *topodatapb.Tablet, concurrency int) (logutil.EventStream, error) {
func (client *Client) Backup(ctx context.Context, tablet *topodatapb.Tablet, concurrency int, allowMaster bool) (logutil.EventStream, error) {
cc, c, err := client.dial(tablet)
if err != nil {
return nil, err
}

stream, err := c.Backup(ctx, &tabletmanagerdatapb.BackupRequest{
Concurrency: int64(concurrency),
AllowMaster: bool(allowMaster),
})
if err != nil {
cc.Close()
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/grpctmserver/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ func (s *server) Backup(request *tabletmanagerdatapb.BackupRequest, stream table
})
})

return s.agent.Backup(ctx, int(request.Concurrency), logger)
return s.agent.Backup(ctx, int(request.Concurrency), logger, bool(request.AllowMaster))
}

func (s *server) RestoreFromBackup(request *tabletmanagerdatapb.RestoreFromBackupRequest, stream tabletmanagerservicepb.TabletManager_RestoreFromBackupServer) (err error) {
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletmanager/rpc_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ type RPCAgent interface {

// Backup / restore related methods

Backup(ctx context.Context, concurrency int, logger logutil.Logger) error
Backup(ctx context.Context, concurrency int, logger logutil.Logger, allowMaster bool) error

RestoreFromBackup(ctx context.Context, logger logutil.Logger) error

Expand Down
10 changes: 5 additions & 5 deletions go/vt/vttablet/tabletmanager/rpc_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import (
)

// Backup takes a db backup and sends it to the BackupStorage
func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error {
func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger, allowMaster bool) error {
if err := agent.lock(ctx); err != nil {
return err
}
Expand All @@ -45,16 +45,16 @@ func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger lo
// but the process didn't find out about this.
// It is not safe to take backups from tablet in this state
currentTablet := agent.Tablet()
if currentTablet.Type == topodatapb.TabletType_MASTER {
return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode")
if !allowMaster && currentTablet.Type == topodatapb.TabletType_MASTER {
return fmt.Errorf("type MASTER cannot take backup. if you really need to do this, rerun the backup command with -allow_master")
}

tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias)
if err != nil {
return err
}
if tablet.Type == topodatapb.TabletType_MASTER {
return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode")
if !allowMaster && tablet.Type == topodatapb.TabletType_MASTER {
return fmt.Errorf("type MASTER cannot take backup. if you really need to do this, rerun the backup command with -allow_master")
}
originalType := tablet.Type

Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tmclient/rpc_client_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ type TabletManagerClient interface {
//

// Backup creates a database backup
Backup(ctx context.Context, tablet *topodatapb.Tablet, concurrency int) (logutil.EventStream, error)
Backup(ctx context.Context, tablet *topodatapb.Tablet, concurrency int, allowMaster bool) (logutil.EventStream, error)

// RestoreFromBackup deletes local data and restores database from backup
RestoreFromBackup(ctx context.Context, tablet *topodatapb.Tablet) (logutil.EventStream, error)
Expand Down
1 change: 1 addition & 0 deletions proto/tabletmanagerdata.proto
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,7 @@ message PromoteSlaveResponse {

// BackupRequest asks a tablet to take a backup.
message BackupRequest {
  // Number of compression/checksum jobs to run simultaneously.
  int64 concurrency = 1;
  // Allow the backup to be taken on a MASTER tablet.
  // Renamed from allowMaster: proto3 style requires lower_snake_case field
  // names. The field number is unchanged, so this is wire-compatible, and
  // the generated Go accessor remains AllowMaster.
  bool allow_master = 2;
}

message BackupResponse {
Expand Down
23 changes: 15 additions & 8 deletions py/vtproto/tabletmanagerdata_pb2.py

Large diffs are not rendered by default.

64 changes: 64 additions & 0 deletions test/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,70 @@ def test_backup_rdonly(self):
def test_backup_replica(self):
self._test_backup('replica')

def test_backup_master(self):
    """Test the backup flow when the backup is taken on the master tablet.

    test_backup_master will:
    - insert some data on the master and wait for replica1 to receive it
    - verify a plain Backup on the master fails with an error directing the
      operator to rerun with -allow_master, and that no backup is left behind
    - take a backup on the master with -allow_master=true
    - insert more data on the master
    - bring up tablet_replica2 after the fact, let it restore the backup
    - check all data is right (before+after backup data)
    - check the restored replica's local_metadata rows
    - list the backup, remove it
    """
    # insert data on master, wait for slave to get it
    tablet_master.mquery('vt_test_keyspace', self._create_vt_insert_test)
    self._insert_data(tablet_master, 1)
    self._check_data(tablet_replica1, 1, 'replica1 tablet getting data')

    # This will fail, make sure we get the right error.
    _, err = utils.run_vtctl(['Backup', tablet_master.tablet_alias],
                             auto_log=True, expect_fail=True)
    self.assertIn('type MASTER cannot take backup. if you really need to do this, rerun the backup command with -allow_master', err)

    # And make sure there is no backup left.
    backups = self._list_backups()
    self.assertEqual(len(backups), 0, 'invalid backups: %s' % backups)

    # backup the master
    utils.run_vtctl(['Backup', '-allow_master=true', tablet_master.tablet_alias], auto_log=True)

    # check that the backup shows up in the listing
    backups = self._list_backups()
    logging.debug('list of backups: %s', backups)
    self.assertEqual(len(backups), 1)
    self.assertTrue(backups[0].endswith(tablet_master.tablet_alias))

    # insert more data on the master
    self._insert_data(tablet_master, 2)

    # now bring up the other slave, letting it restore from backup.
    self._restore(tablet_replica2, tablet_type='replica')

    # check the new slave has the data
    self._check_data(tablet_replica2, 2, 'replica2 tablet getting data')

    # check that the restored slave has the right local_metadata
    result = tablet_replica2.mquery('_vt', 'select * from local_metadata')
    metadata = {}
    for row in result:
        metadata[row[0]] = row[1]
    # NOTE(review): the alias below presumably matches the fixture's tablet
    # UID for replica2 — confirm against the test harness setup.
    self.assertEqual(metadata['Alias'], 'test_nj-0000062346')
    self.assertEqual(metadata['ClusterAlias'], 'test_keyspace.0')
    self.assertEqual(metadata['DataCenter'], 'test_nj')
    self.assertEqual(metadata['PromotionRule'], 'neutral')

    # remove the backup and check that the list is empty
    self._remove_backup(backups[0])
    backups = self._list_backups()
    logging.debug('list of backups after remove: %s', backups)
    self.assertEqual(len(backups), 0)

    tablet_replica2.kill_vttablet()

def _test_backup(self, tablet_type):
"""Test backup flow.

Expand Down