diff --git a/server/cluster/unsafe_recovery_controller.go b/server/cluster/unsafe_recovery_controller.go
index 01021069179..e640c9a2eb6 100644
--- a/server/cluster/unsafe_recovery_controller.go
+++ b/server/cluster/unsafe_recovery_controller.go
@@ -706,9 +706,6 @@ func (u *unsafeRecoveryController) getFailedPeers(region *metapb.Region) []*metapb.Peer {
 
 	var failedPeers []*metapb.Peer
 	for _, peer := range region.Peers {
-		if peer.Role == metapb.PeerRole_Learner || peer.Role == metapb.PeerRole_DemotingVoter {
-			continue
-		}
 		if u.isFailed(peer) {
 			failedPeers = append(failedPeers, peer)
 		}
diff --git a/server/cluster/unsafe_recovery_controller_test.go b/server/cluster/unsafe_recovery_controller_test.go
index 1209b5cd0c4..aa9d84384d8 100644
--- a/server/cluster/unsafe_recovery_controller_test.go
+++ b/server/cluster/unsafe_recovery_controller_test.go
@@ -606,6 +606,48 @@ func TestAutoDetectMode(t *testing.T) {
 	}
 }
 
+// Failed learner replica/store should be considered by auto-recover.
+func TestAutoDetectModeWithOneLearner(t *testing.T) {
+	re := require.New(t)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	_, opt, _ := newTestScheduleConfig()
+	cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend(), core.NewBasicCluster())
+	cluster.coordinator = newCoordinator(ctx, cluster, hbstream.NewTestHeartbeatStreams(ctx, cluster.meta.GetId(), cluster, true))
+	cluster.coordinator.run()
+	for _, store := range newTestStores(1, "6.0.0") {
+		re.NoError(cluster.PutStore(store.GetMeta()))
+	}
+	recoveryController := newUnsafeRecoveryController(cluster)
+	re.NoError(recoveryController.RemoveFailedStores(nil, 60, true))
+
+	storeReport := pdpb.StoreReport{
+		PeerReports: []*pdpb.PeerReport{
+			{
+				RaftState: &raft_serverpb.RaftLocalState{LastIndex: 10, HardState: &eraftpb.HardState{Term: 1, Commit: 10}},
+				RegionState: &raft_serverpb.RegionLocalState{
+					Region: &metapb.Region{
+						Id:          1001,
+						RegionEpoch: &metapb.RegionEpoch{ConfVer: 7, Version: 10},
+						Peers: []*metapb.Peer{
+							{Id: 11, StoreId: 1}, {Id: 12, StoreId: 2}, {Id: 13, StoreId: 3, Role: metapb.PeerRole_Learner}}}}},
+		},
+	}
+	req := newStoreHeartbeat(1, &storeReport)
+	req.StoreReport.Step = 1
+	resp := &pdpb.StoreHeartbeatResponse{}
+	recoveryController.HandleStoreHeartbeat(req, resp)
+	hasStore3AsFailedStore := false
+	for _, failedStore := range resp.RecoveryPlan.ForceLeader.FailedStores {
+		if failedStore == 3 {
+			hasStore3AsFailedStore = true
+			break
+		}
+	}
+	re.True(hasStore3AsFailedStore)
+}
+
 func TestOneLearner(t *testing.T) {
 	re := require.New(t)
 	ctx, cancel := context.WithCancel(context.Background())
diff --git a/server/election/leadership.go b/server/election/leadership.go
index c9215318c64..0489a6c7e4e 100644
--- a/server/election/leadership.go
+++ b/server/election/leadership.go
@@ -16,6 +16,7 @@ package election
 
 import (
 	"context"
+	"sync"
 	"sync/atomic"
 
 	"github.com/pingcap/failpoint"
@@ -54,8 +55,9 @@ type Leadership struct {
 	leaderKey   string
 	leaderValue string
 
-	keepAliveCtx        context.Context
-	keepAliveCancelFunc context.CancelFunc
+	keepAliveCtx            context.Context
+	keepAliveCancelFunc     context.CancelFunc
+	keepAliveCancelFuncLock sync.Mutex
 }
 
 // NewLeadership creates a new Leadership.
@@ -137,7 +139,9 @@ func (ls *Leadership) Keep(ctx context.Context) {
 	if ls == nil {
 		return
 	}
+	ls.keepAliveCancelFuncLock.Lock()
 	ls.keepAliveCtx, ls.keepAliveCancelFunc = context.WithCancel(ctx)
+	ls.keepAliveCancelFuncLock.Unlock()
 	go ls.getLease().KeepAlive(ls.keepAliveCtx)
 }
 
@@ -230,8 +234,10 @@ func (ls *Leadership) Reset() {
 	if ls == nil || ls.getLease() == nil {
 		return
 	}
+	ls.keepAliveCancelFuncLock.Lock()
 	if ls.keepAliveCancelFunc != nil {
 		ls.keepAliveCancelFunc()
 	}
+	ls.keepAliveCancelFuncLock.Unlock()
 	ls.getLease().Close()
 }
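
The two leadership.go hunks above make Keep and Reset take keepAliveCancelFuncLock around every access to keepAliveCancelFunc, so the write in Keep cannot race with the read-and-invoke in Reset when they run on different goroutines. Below is a minimal, self-contained sketch of the same locking pattern under illustrative names (keeper, keep, and reset are not PD identifiers); it shows the shape of the fix, not the PD implementation.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// keeper mimics the pattern in the diff: one goroutine installs a cancel
// function, another may invoke it, so every access is guarded by a mutex
// (playing the role of keepAliveCancelFuncLock).
type keeper struct {
	mu         sync.Mutex
	cancelFunc context.CancelFunc
}

// keep installs a fresh cancelable context under the lock, then starts the
// background work (a stand-in for the lease keep-alive loop).
func (k *keeper) keep(ctx context.Context) {
	k.mu.Lock()
	var keepCtx context.Context
	keepCtx, k.cancelFunc = context.WithCancel(ctx)
	k.mu.Unlock()

	go func() {
		<-keepCtx.Done()
		fmt.Println("keep-alive stopped")
	}()
}

// reset cancels the in-flight keep-alive, if any, under the same lock.
func (k *keeper) reset() {
	k.mu.Lock()
	if k.cancelFunc != nil {
		k.cancelFunc()
	}
	k.mu.Unlock()
}

func main() {
	k := &keeper{}

	// Call keep and reset concurrently; without the mutex these two goroutines
	// would access cancelFunc without synchronization.
	var wg sync.WaitGroup
	wg.Add(2)
	go func() { defer wg.Done(); k.keep(context.Background()) }()
	go func() { defer wg.Done(); k.reset() }()
	wg.Wait()

	k.reset() // ensure the background goroutine is released before exiting
	time.Sleep(50 * time.Millisecond)
}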