Skip to content

Commit

Permalink
This is an automated cherry-pick of tikv#5787
Browse files Browse the repository at this point in the history
ref tikv#5786

Signed-off-by: ti-chi-bot <[email protected]>
  • Loading branch information
HunDunDM authored and ti-chi-bot committed Dec 30, 2022
1 parent 1ced7dd commit f5d36a1
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 17 deletions.
51 changes: 36 additions & 15 deletions server/schedule/checker/rule_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,31 +273,52 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg
if len(fit.OrphanPeers) == 0 {
return nil, nil
}
isUnhealthyPeer := func(id uint64) bool {
for _, pendingPeer := range region.GetPendingPeers() {
if pendingPeer.GetId() == id {
return true
}
}
for _, downPeer := range region.GetDownPeers() {
if downPeer.Peer.GetId() == id {
return true
}
}
return false
}
// remove orphan peers only when all rules are satisfied (count+role) and all peers selected
// by RuleFits is not pending or down.
hasUnhealthyFit := false
loopFits:
for _, rf := range fit.RuleFits {
if !rf.IsSatisfied() {
checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc()
return nil, nil
hasUnhealthyFit = true
break
}
for _, p := range rf.Peers {
for _, pendingPeer := range region.GetPendingPeers() {
if pendingPeer.Id == p.Id {
checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc()
return nil, nil
}
if isUnhealthyPeer(p.GetId()) {
hasUnhealthyFit = true
break loopFits
}
for _, downPeer := range region.GetDownPeers() {
if downPeer.Peer.Id == p.Id {
checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc()
return nil, nil
}
}
}
// If hasUnhealthyFit is false, it is safe to delete the OrphanPeer.
if !hasUnhealthyFit {
checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc()
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, fit.OrphanPeers[0].StoreId)
}
// If hasUnhealthyFit is true, try to remove unhealthy orphan peers only if number of OrphanPeers is >= 2.
// Ref https://github.com/tikv/pd/issues/4045
if len(fit.OrphanPeers) >= 2 {
for _, orphanPeer := range fit.OrphanPeers {
if isUnhealthyPeer(orphanPeer.GetId()) {
checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc()
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
}
}
}
checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc()
peer := fit.OrphanPeers[0]
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, peer.StoreId)
checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc()
return nil, nil
}

func (c *RuleChecker) isDownPeer(region *core.RegionInfo, peer *metapb.Peer) bool {
Expand Down
35 changes: 33 additions & 2 deletions server/schedule/checker/rule_checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ func (s *testRuleCheckerSuite) TestIssue2419(c *C) {
c.Assert(op.Step(2).(operator.RemovePeer).FromStore, Equals, uint64(3))
}

// Ref https://github.com/tikv/pd/issues/3521
// Ref https://github.com/tikv/pd/issues/3521 https://github.com/tikv/pd/issues/5786
// The problem is when offline a store, we may add learner multiple times if
// the operator is timeout.
func (s *testRuleCheckerSuite) TestPriorityFixOrphanPeer(c *C) {
Expand All @@ -385,21 +385,52 @@ func (s *testRuleCheckerSuite) TestPriorityFixOrphanPeer(c *C) {
c.Assert(op, IsNil)
var add operator.AddLearner
var remove operator.RemovePeer
<<<<<<< HEAD
s.cluster.SetStoreOffline(2)
op = s.rc.Check(s.cluster.GetRegion(1))
c.Assert(op, NotNil)
c.Assert(op.Step(0), FitsTypeOf, add)
c.Assert(op.Desc(), Equals, "replace-rule-offline-peer")
r := s.cluster.GetRegion(1).Clone(core.WithAddPeer(
=======
// Ref 5786
originRegion := suite.cluster.GetRegion(1)
learner4 := &metapb.Peer{Id: 114, StoreId: 4, Role: metapb.PeerRole_Learner}
testRegion := originRegion.Clone(
core.WithAddPeer(learner4),
core.WithAddPeer(&metapb.Peer{Id: 115, StoreId: 5, Role: metapb.PeerRole_Learner}),
core.WithPendingPeers([]*metapb.Peer{originRegion.GetStorePeer(2), learner4}),
)
suite.cluster.PutRegion(testRegion)
op = suite.rc.Check(suite.cluster.GetRegion(1))
suite.NotNil(op)
suite.Equal("remove-orphan-peer", op.Desc())
suite.IsType(remove, op.Step(0))
// Ref #3521
suite.cluster.SetStoreOffline(2)
suite.cluster.PutRegion(originRegion)
op = suite.rc.Check(suite.cluster.GetRegion(1))
suite.NotNil(op)
suite.IsType(add, op.Step(0))
suite.Equal("replace-rule-offline-peer", op.Desc())
testRegion = suite.cluster.GetRegion(1).Clone(core.WithAddPeer(
>>>>>>> 531d9f32d (checker: when there are too many orphan peers, try to delete them (#5787))
&metapb.Peer{
Id: 5,
Id: 125,
StoreId: 4,
Role: metapb.PeerRole_Learner,
}))
<<<<<<< HEAD
s.cluster.PutRegion(r)
op = s.rc.Check(s.cluster.GetRegion(1))
c.Assert(op.Step(0), FitsTypeOf, remove)
c.Assert(op.Desc(), Equals, "remove-orphan-peer")
=======
suite.cluster.PutRegion(testRegion)
op = suite.rc.Check(suite.cluster.GetRegion(1))
suite.IsType(remove, op.Step(0))
suite.Equal("remove-orphan-peer", op.Desc())
>>>>>>> 531d9f32d (checker: when there are too many orphan peers, try to delete them (#5787))
}

func (s *testRuleCheckerSuite) TestIssue3293(c *C) {
Expand Down

0 comments on commit f5d36a1

Please sign in to comment.