diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index f11d9cfc5fa..5e61b1f4fe8 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -278,6 +278,7 @@ func (c *RaftCluster) LoadClusterInfo() (*RaftCluster, error) { return nil, nil } + c.core.ResetStores() start := time.Now() if err := c.storage.LoadStores(c.core.PutStore); err != nil { return nil, err @@ -1007,10 +1008,10 @@ func (c *RaftCluster) RemoveStore(storeID uint64, physicallyDestroyed bool) erro return err } -// buryStore marks a store as tombstone in cluster. +// BuryStore marks a store as tombstone in cluster. // The store should be empty before calling this func // State transition: Offline -> Tombstone. -func (c *RaftCluster) buryStore(storeID uint64) error { +func (c *RaftCluster) BuryStore(storeID uint64) error { c.Lock() defer c.Unlock() @@ -1143,7 +1144,7 @@ func (c *RaftCluster) checkStores() { // If the store is empty, it can be buried. regionCount := c.core.GetStoreRegionCount(offlineStore.GetId()) if regionCount == 0 { - if err := c.buryStore(offlineStore.GetId()); err != nil { + if err := c.BuryStore(offlineStore.GetId()); err != nil { log.Error("bury store failed", zap.Stringer("store", offlineStore), errs.ZapError(err)) diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index a2ff06ba3c2..54302950810 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -240,9 +240,9 @@ func (s *testClusterInfoSuite) TestSetOfflineStore(c *C) { for storeID := uint64(0); storeID <= 4; storeID++ { store := cluster.GetStore(storeID) if store == nil || store.IsUp() { - c.Assert(cluster.buryStore(storeID), NotNil) + c.Assert(cluster.BuryStore(storeID), NotNil) } else { - c.Assert(cluster.buryStore(storeID), IsNil) + c.Assert(cluster.BuryStore(storeID), IsNil) } } } @@ -262,7 +262,7 @@ func (s *testClusterInfoSuite) TestReuseAddress(c *C) { c.Assert(cluster.RemoveStore(3, true), IsNil) // store 4: tombstone c.Assert(cluster.RemoveStore(4, true), IsNil) - c.Assert(cluster.buryStore(4), IsNil) + c.Assert(cluster.BuryStore(4), IsNil) for id := uint64(1); id <= 4; id++ { storeInfo := cluster.GetStore(id) diff --git a/server/core/basic_cluster.go b/server/core/basic_cluster.go index 5bc22d04eb5..1de55eb8dee 100644 --- a/server/core/basic_cluster.go +++ b/server/core/basic_cluster.go @@ -275,6 +275,13 @@ func (bc *BasicCluster) PutStore(store *StoreInfo) { bc.Stores.SetStore(store) } +// ResetStores resets the store cache. +func (bc *BasicCluster) ResetStores() { + bc.Lock() + defer bc.Unlock() + bc.Stores = NewStoresInfo() +} + // DeleteStore deletes a store. func (bc *BasicCluster) DeleteStore(store *StoreInfo) { bc.Lock() diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index 48dd36444f8..5a6e5d1eac5 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -1105,3 +1105,60 @@ func (s *clusterTestSuite) TestStaleTermHeartbeat(c *C) { err = rc.HandleRegionHeartbeat(region) c.Assert(err, NotNil) } + +// See https://github.com/tikv/pd/issues/4941 +func (s *clusterTestSuite) TestTransferLeaderBack(c *C) { + tc, err := tests.NewTestCluster(s.ctx, 2) + defer tc.Destroy() + c.Assert(err, IsNil) + err = tc.RunInitialServers() + c.Assert(err, IsNil) + tc.WaitLeader() + leaderServer := tc.GetServer(tc.GetLeader()) + svr := leaderServer.GetServer() + rc := cluster.NewRaftCluster(s.ctx, svr.GetClusterRootPath(), svr.ClusterID(), syncer.NewRegionSyncer(svr), svr.GetClient(), svr.GetHTTPClient()) + rc.InitCluster(svr.GetAllocator(), svr.GetPersistOptions(), svr.GetStorage(), svr.GetBasicCluster()) + storage := rc.GetStorage() + meta := &metapb.Cluster{Id: 123} + c.Assert(storage.SaveMeta(meta), IsNil) + n := 4 + stores := make([]*metapb.Store, 0, n) + for i := 1; i <= n; i++ { + store := &metapb.Store{Id: uint64(i), State: metapb.StoreState_Up} + stores = append(stores, store) + } + + for _, store := range stores { + c.Assert(storage.SaveStore(store), IsNil) + } + rc, err = rc.LoadClusterInfo() + c.Assert(err, IsNil) + c.Assert(rc, NotNil) + // offline a store + c.Assert(rc.RemoveStore(1, false), IsNil) + c.Assert(rc.GetStore(1).GetState(), Equals, metapb.StoreState_Offline) + + // transfer PD leader to another PD + tc.ResignLeader() + tc.WaitLeader() + leaderServer = tc.GetServer(tc.GetLeader()) + svr1 := leaderServer.GetServer() + rc1 := svr1.GetRaftCluster() + c.Assert(err, IsNil) + c.Assert(rc1, NotNil) + // tombstone a store, and remove its record + c.Assert(rc1.BuryStore(1), IsNil) + c.Assert(rc1.RemoveTombStoneRecords(), IsNil) + + // transfer PD leader back to the previous PD + tc.ResignLeader() + tc.WaitLeader() + leaderServer = tc.GetServer(tc.GetLeader()) + svr = leaderServer.GetServer() + rc = svr.GetRaftCluster() + c.Assert(rc, NotNil) + + // check store count + c.Assert(rc.GetConfig(), DeepEquals, meta) + c.Assert(rc.GetStoreCount(), Equals, 3) +}