From 2949a539177e1c6f5b2b40619595ca1b0726d639 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 26 Jun 2023 20:32:34 +0800 Subject: [PATCH] checker: fix the too many orphan peers cannot be removed (#6574) (#6576) close tikv/pd#6573, ref tikv/pd#6574 rule-checker: fix the case where too many orphan peers cannot be removed - allow a healthy orphan peer to be removed once redundant healthy orphan peers exist Signed-off-by: nolouch Co-authored-by: nolouch Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/schedule/checker/rule_checker.go | 7 ++++ pkg/schedule/checker/rule_checker_test.go | 47 ++++++++++++++++++++--- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/pkg/schedule/checker/rule_checker.go b/pkg/schedule/checker/rule_checker.go index 13b69484b04..e950044728e 100644 --- a/pkg/schedule/checker/rule_checker.go +++ b/pkg/schedule/checker/rule_checker.go @@ -455,11 +455,18 @@ loopFits: // If hasUnhealthyFit is true, try to remove unhealthy orphan peers only if number of OrphanPeers is >= 2. // Ref https://github.com/tikv/pd/issues/4045 if len(fit.OrphanPeers) >= 2 { + hasHealthPeer := false for _, orphanPeer := range fit.OrphanPeers { if isUnhealthyPeer(orphanPeer.GetId()) { ruleCheckerRemoveOrphanPeerCounter.Inc() return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } + if hasHealthPeer { + // there already exists a healthy orphan peer, so we can remove other orphan Peers.
+ ruleCheckerRemoveOrphanPeerCounter.Inc() + return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) + } + hasHealthPeer = true } } ruleCheckerSkipRemoveOrphanPeerCounter.Inc() diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index 0593a2224d1..c204345faa3 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -168,6 +168,39 @@ func (suite *ruleCheckerTestSuite) TestFixOrphanPeers() { suite.Equal(uint64(4), op.Step(0).(operator.RemovePeer).FromStore) } +func (suite *ruleCheckerTestSuite) TestFixToManyOrphanPeers() { + suite.cluster.AddLeaderStore(1, 1) + suite.cluster.AddLeaderStore(2, 1) + suite.cluster.AddLeaderStore(3, 1) + suite.cluster.AddLeaderStore(4, 1) + suite.cluster.AddLeaderStore(5, 1) + suite.cluster.AddLeaderStore(6, 1) + suite.cluster.AddRegionWithLearner(1, 1, []uint64{2, 3}, []uint64{4, 5, 6}) + // Case1: + // store 4, 5, 6 are orphan peers, and peer on store 3 is pending and down peer. + region := suite.cluster.GetRegion(1) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(3)})) + suite.cluster.PutRegion(region) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal(uint64(5), op.Step(0).(operator.RemovePeer).FromStore) + + // Case2: + // store 4, 5, 6 are orphan peers, and peer on store 3 is down peer. and peer on store 4, 5 are pending. 
+ region = suite.cluster.GetRegion(1) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(4), region.GetStorePeer(5)})) + suite.cluster.PutRegion(region) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal(uint64(4), op.Step(0).(operator.RemovePeer).FromStore) +} + func (suite *ruleCheckerTestSuite) TestFixOrphanPeers2() { // check orphan peers can only be handled when all rules are satisfied. suite.cluster.AddLabelsStore(1, 1, map[string]string{"foo": "bar"}) @@ -312,7 +345,7 @@ func (suite *ruleCheckerTestSuite) TestFixRuleWitness() { suite.cluster.AddLabelsStore(1, 1, map[string]string{"A": "leader"}) suite.cluster.AddLabelsStore(2, 1, map[string]string{"B": "follower"}) suite.cluster.AddLabelsStore(3, 1, map[string]string{"C": "voter"}) - suite.cluster.AddLeaderRegion(1, 1, 2) + suite.cluster.AddLeaderRegion(1, 1) suite.ruleManager.SetRule(&placement.Rule{ GroupID: "pd", @@ -329,6 +362,7 @@ func (suite *ruleCheckerTestSuite) TestFixRuleWitness() { op := suite.rc.Check(suite.cluster.GetRegion(1)) suite.NotNil(op) suite.Equal("add-rule-peer", op.Desc()) + fmt.Println(op) suite.Equal(uint64(3), op.Step(0).(operator.AddLearner).ToStore) suite.True(op.Step(0).(operator.AddLearner).IsWitness) } @@ -337,24 +371,25 @@ func (suite *ruleCheckerTestSuite) TestFixRuleWitness2() { suite.cluster.AddLabelsStore(1, 1, map[string]string{"A": "leader"}) suite.cluster.AddLabelsStore(2, 1, map[string]string{"B": "voter"}) suite.cluster.AddLabelsStore(3, 1, map[string]string{"C": "voter"}) - suite.cluster.AddLeaderRegion(1, 1, 2, 3) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"D": "voter"}) + suite.cluster.AddLeaderRegion(1, 1, 2, 3, 4) suite.ruleManager.SetRule(&placement.Rule{ GroupID: "pd", ID: "r1", Index: 100, - Override: true, + Override: false, Role: 
placement.Voter, Count: 1, IsWitness: true, LabelConstraints: []placement.LabelConstraint{ - {Key: "C", Op: "in", Values: []string{"voter"}}, + {Key: "D", Op: "in", Values: []string{"voter"}}, }, }) op := suite.rc.Check(suite.cluster.GetRegion(1)) suite.NotNil(op) suite.Equal("fix-witness-peer", op.Desc()) - suite.Equal(uint64(3), op.Step(0).(operator.BecomeWitness).StoreID) + suite.Equal(uint64(4), op.Step(0).(operator.BecomeWitness).StoreID) } func (suite *ruleCheckerTestSuite) TestFixRuleWitness3() { @@ -366,7 +401,7 @@ func (suite *ruleCheckerTestSuite) TestFixRuleWitness3() { r := suite.cluster.GetRegion(1) // set peer3 to witness r = r.Clone(core.WithWitnesses([]*metapb.Peer{r.GetPeer(3)})) - + suite.cluster.PutRegion(r) op := suite.rc.Check(r) suite.NotNil(op) suite.Equal("fix-non-witness-peer", op.Desc())