Skip to content

Commit

Permalink
checker: when there are too many orphan peers, try to delete them (#5787
Browse files Browse the repository at this point in the history
) (#5817)

* This is an automated cherry-pick of #5787

ref #5786

Signed-off-by: ti-chi-bot <[email protected]>

* fix conflicts

Signed-off-by: nolouch <[email protected]>

Signed-off-by: ti-chi-bot <[email protected]>
Signed-off-by: nolouch <[email protected]>
Co-authored-by: 混沌DM <[email protected]>
Co-authored-by: nolouch <[email protected]>
  • Loading branch information
3 people authored Jan 11, 2023
1 parent 52cb349 commit 93f49ea
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 19 deletions.
51 changes: 36 additions & 15 deletions server/schedule/checker/rule_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,31 +316,52 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg
if len(fit.OrphanPeers) == 0 {
return nil, nil
}
isUnhealthyPeer := func(id uint64) bool {
for _, pendingPeer := range region.GetPendingPeers() {
if pendingPeer.GetId() == id {
return true
}
}
for _, downPeer := range region.GetDownPeers() {
if downPeer.Peer.GetId() == id {
return true
}
}
return false
}
// remove orphan peers only when all rules are satisfied (count+role) and all peers selected
// by RuleFits is not pending or down.
hasUnhealthyFit := false
loopFits:
for _, rf := range fit.RuleFits {
if !rf.IsSatisfied() {
checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc()
return nil, nil
hasUnhealthyFit = true
break
}
for _, p := range rf.Peers {
for _, pendingPeer := range region.GetPendingPeers() {
if pendingPeer.Id == p.Id {
checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc()
return nil, nil
}
if isUnhealthyPeer(p.GetId()) {
hasUnhealthyFit = true
break loopFits
}
for _, downPeer := range region.GetDownPeers() {
if downPeer.Peer.Id == p.Id {
checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc()
return nil, nil
}
}
}
// If hasUnhealthyFit is false, it is safe to delete the OrphanPeer.
if !hasUnhealthyFit {
checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc()
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, fit.OrphanPeers[0].StoreId)
}
// If hasUnhealthyFit is true, try to remove unhealthy orphan peers only if number of OrphanPeers is >= 2.
// Ref https://github.com/tikv/pd/issues/4045
if len(fit.OrphanPeers) >= 2 {
for _, orphanPeer := range fit.OrphanPeers {
if isUnhealthyPeer(orphanPeer.GetId()) {
checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc()
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
}
}
}
checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc()
peer := fit.OrphanPeers[0]
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, peer.StoreId)
checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc()
return nil, nil
}

func (c *RuleChecker) isDownPeer(region *core.RegionInfo, peer *metapb.Peer) bool {
Expand Down
28 changes: 24 additions & 4 deletions server/schedule/checker/rule_checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ func (s *testRuleCheckerSuite) TestIssue2419(c *C) {
c.Assert(op.Step(2).(operator.RemovePeer).FromStore, Equals, uint64(3))
}

// Ref https://github.com/tikv/pd/issues/3521
// Ref https://github.com/tikv/pd/issues/3521 https://github.com/tikv/pd/issues/5786
// The problem is when offline a store, we may add learner multiple times if
// the operator is timeout.
func (s *testRuleCheckerSuite) TestPriorityFixOrphanPeer(c *C) {
Expand All @@ -436,13 +436,33 @@ func (s *testRuleCheckerSuite) TestPriorityFixOrphanPeer(c *C) {
c.Assert(op, NotNil)
c.Assert(op.Step(0), FitsTypeOf, add)
c.Assert(op.Desc(), Equals, "replace-rule-offline-peer")
r := s.cluster.GetRegion(1).Clone(core.WithAddPeer(
// Ref 5786
originRegion := s.cluster.GetRegion(1)
learner4 := &metapb.Peer{Id: 114, StoreId: 4, Role: metapb.PeerRole_Learner}
testRegion := originRegion.Clone(
core.WithAddPeer(learner4),
core.WithAddPeer(&metapb.Peer{Id: 115, StoreId: 5, Role: metapb.PeerRole_Learner}),
core.WithPendingPeers([]*metapb.Peer{originRegion.GetStorePeer(2), learner4}),
)
s.cluster.PutRegion(testRegion)
op = s.rc.Check(s.cluster.GetRegion(1))
c.Assert(op, NotNil)
c.Assert("remove-orphan-peer", Equals, op.Desc())
c.Assert(op.Step(0).(operator.RemovePeer), FitsTypeOf, remove)
// Ref #3521
s.cluster.SetStoreOffline(2)
s.cluster.PutRegion(originRegion)
op = s.rc.Check(s.cluster.GetRegion(1))
c.Assert(op, NotNil)
c.Assert(op.Step(0).(operator.AddLearner), FitsTypeOf, add)
c.Assert("replace-rule-offline-peer", Equals, op.Desc())
testRegion = s.cluster.GetRegion(1).Clone(core.WithAddPeer(
&metapb.Peer{
Id: 5,
Id: 125,
StoreId: 4,
Role: metapb.PeerRole_Learner,
}))
s.cluster.PutRegion(r)
s.cluster.PutRegion(testRegion)
op = s.rc.Check(s.cluster.GetRegion(1))
c.Assert(op.Step(0), FitsTypeOf, remove)
c.Assert(op.Desc(), Equals, "remove-orphan-peer")
Expand Down

0 comments on commit 93f49ea

Please sign in to comment.