Skip to content

Commit

Permalink
This is an automated cherry-pick of tikv#6804
Browse files Browse the repository at this point in the history
close tikv#6560

Signed-off-by: ti-chi-bot <[email protected]>
  • Loading branch information
JmPotato authored and ti-chi-bot committed Jul 17, 2023
1 parent f8bf1d7 commit cf93563
Show file tree
Hide file tree
Showing 9 changed files with 1,611 additions and 244 deletions.
7 changes: 0 additions & 7 deletions metrics/grafana/pd.json
Original file line number Diff line number Diff line change
Expand Up @@ -796,13 +796,6 @@
"intervalFactor": 2,
"legendFormat": "{{type}}",
"refId": "B"
},
{
"expr": "pd_regions_offline_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"offline-peer-region-count\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{type}}",
"refId": "C"
}
],
"thresholds": [
Expand Down
9 changes: 0 additions & 9 deletions pkg/statistics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,6 @@ var (
Help: "Status of the regions.",
}, []string{"type"})

offlineRegionStatusGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "pd",
Subsystem: "regions",
Name: "offline_status",
Help: "Status of the offline regions.",
}, []string{"type"})

clusterStatusGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "pd",
Expand Down Expand Up @@ -190,7 +182,6 @@ func init() {
prometheus.MustRegister(hotCacheStatusGauge)
prometheus.MustRegister(storeStatusGauge)
prometheus.MustRegister(regionStatusGauge)
prometheus.MustRegister(offlineRegionStatusGauge)
prometheus.MustRegister(clusterStatusGauge)
prometheus.MustRegister(placementStatusGauge)
prometheus.MustRegister(configStatusGauge)
Expand Down
200 changes: 80 additions & 120 deletions pkg/statistics/region_collection.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ import (
"github.com/tikv/pd/server/config"
)

// RegionInfoProvider is an interface to provide the region information.
// RegionStatistics keeps one in its rip field and calls GetRegion from
// GetRegionStatsByType to turn the region IDs it tracks into *core.RegionInfo values.
type RegionInfoProvider interface {
// GetRegion returns the region information according to the given region ID.
// NOTE(review): GetRegionStatsByType calls Clone on the returned pointer without a
// nil check; confirm what implementations return for an unknown region ID.
GetRegion(regionID uint64) *core.RegionInfo
}

// RegionStatisticType represents the type of the region's status.
type RegionStatisticType uint32

Expand All @@ -42,84 +48,92 @@ const (
WitnessLeader
)

// regionStatisticTypes enumerates the statistic types that NewRegionStatistics
// walks to allocate one per-type map in RegionStatistics.stats.
var regionStatisticTypes = []RegionStatisticType{
MissPeer,
ExtraPeer,
DownPeer,
PendingPeer,
OfflinePeer,
LearnerPeer,
EmptyRegion,
OversizedRegion,
UndersizedRegion,
WitnessLeader,
}

const nonIsolation = "none"

var (
// WithLabelValues is a heavy operation, define variable to avoid call it every time.
regionMissPeerRegionCounter = regionStatusGauge.WithLabelValues("miss-peer-region-count")
regionExtraPeerRegionCounter = regionStatusGauge.WithLabelValues("extra-peer-region-count")
regionDownPeerRegionCounter = regionStatusGauge.WithLabelValues("down-peer-region-count")
regionPendingPeerRegionCounter = regionStatusGauge.WithLabelValues("pending-peer-region-count")
regionLearnerPeerRegionCounter = regionStatusGauge.WithLabelValues("learner-peer-region-count")
regionEmptyRegionCounter = regionStatusGauge.WithLabelValues("empty-region-count")
regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count")
regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count")
regionWitnesssLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count")

offlineMissPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("miss-peer-region-count")
offlineExtraPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("extra-peer-region-count")
offlineDownPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("down-peer-region-count")
offlinePendingPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("pending-peer-region-count")
offlineLearnerPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("learner-peer-region-count")
offlineOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count")
regionMissPeerRegionCounter = regionStatusGauge.WithLabelValues("miss-peer-region-count")
regionExtraPeerRegionCounter = regionStatusGauge.WithLabelValues("extra-peer-region-count")
regionDownPeerRegionCounter = regionStatusGauge.WithLabelValues("down-peer-region-count")
regionPendingPeerRegionCounter = regionStatusGauge.WithLabelValues("pending-peer-region-count")
regionOfflinePeerRegionCounter = regionStatusGauge.WithLabelValues("offline-peer-region-count")
regionLearnerPeerRegionCounter = regionStatusGauge.WithLabelValues("learner-peer-region-count")
regionEmptyRegionCounter = regionStatusGauge.WithLabelValues("empty-region-count")
regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count")
regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count")
regionWitnessLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count")
)

// RegionInfo is used to record the status of region.
type RegionInfo struct {
*core.RegionInfo
// RegionInfoWithTS is used to record the extra timestamp status of a region.
type RegionInfoWithTS struct {
id uint64
startMissVoterPeerTS int64
startDownPeerTS int64
}

// RegionStatistics is used to record the status of regions.
type RegionStatistics struct {
sync.RWMutex
<<<<<<< HEAD
conf sc.Config
stats map[RegionStatisticType]map[uint64]*RegionInfo
offlineStats map[RegionStatisticType]map[uint64]*core.RegionInfo
=======
rip RegionInfoProvider
conf sc.CheckerConfig
stats map[RegionStatisticType]map[uint64]*RegionInfoWithTS
>>>>>>> 40eaa35f2 (statistics: get region info via core cluster inside RegionStatistics (#6804))
index map[uint64]RegionStatisticType
offlineIndex map[uint64]RegionStatisticType
ruleManager *placement.RuleManager
storeConfigManager *config.StoreConfigManager
}

// NewRegionStatistics creates a new RegionStatistics.
<<<<<<< HEAD
func NewRegionStatistics(conf sc.Config, ruleManager *placement.RuleManager, storeConfigManager *config.StoreConfigManager) *RegionStatistics {
=======
func NewRegionStatistics(
rip RegionInfoProvider,
conf sc.CheckerConfig,
ruleManager *placement.RuleManager,
storeConfigManager *config.StoreConfigManager,
) *RegionStatistics {
>>>>>>> 40eaa35f2 (statistics: get region info via core cluster inside RegionStatistics (#6804))
r := &RegionStatistics{
rip: rip,
conf: conf,
ruleManager: ruleManager,
storeConfigManager: storeConfigManager,
stats: make(map[RegionStatisticType]map[uint64]*RegionInfo),
offlineStats: make(map[RegionStatisticType]map[uint64]*core.RegionInfo),
stats: make(map[RegionStatisticType]map[uint64]*RegionInfoWithTS),
index: make(map[uint64]RegionStatisticType),
offlineIndex: make(map[uint64]RegionStatisticType),
}
r.stats[MissPeer] = make(map[uint64]*RegionInfo)
r.stats[ExtraPeer] = make(map[uint64]*RegionInfo)
r.stats[DownPeer] = make(map[uint64]*RegionInfo)
r.stats[PendingPeer] = make(map[uint64]*RegionInfo)
r.stats[LearnerPeer] = make(map[uint64]*RegionInfo)
r.stats[EmptyRegion] = make(map[uint64]*RegionInfo)
r.stats[OversizedRegion] = make(map[uint64]*RegionInfo)
r.stats[UndersizedRegion] = make(map[uint64]*RegionInfo)
r.stats[WitnessLeader] = make(map[uint64]*RegionInfo)

r.offlineStats[MissPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[ExtraPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[DownPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[PendingPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[LearnerPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[OfflinePeer] = make(map[uint64]*core.RegionInfo)
for _, typ := range regionStatisticTypes {
r.stats[typ] = make(map[uint64]*RegionInfoWithTS)
}
return r
}

// GetRegionStatsByType gets the status of the region by types. The regions here need to be cloned, otherwise, it may cause data race problems.
// GetRegionStatsByType gets the status of the region by types.
// The regions here need to be cloned, otherwise, it may cause data race problems.
func (r *RegionStatistics) GetRegionStatsByType(typ RegionStatisticType) []*core.RegionInfo {
r.RLock()
defer r.RUnlock()
res := make([]*core.RegionInfo, 0, len(r.stats[typ]))
for _, r := range r.stats[typ] {
res = append(res, r.RegionInfo.Clone())
for regionID := range r.stats[typ] {
res = append(res, r.rip.GetRegion(regionID).Clone())
}
return res
}
Expand All @@ -132,17 +146,6 @@ func (r *RegionStatistics) IsRegionStatsType(regionID uint64, typ RegionStatisti
return exist
}

// GetOfflineRegionStatsByType gets the status of the offline region by types.
// It holds the read lock while it clones every region recorded in offlineStats under typ.
// The regions here need to be cloned, otherwise, it may cause data race problems.
func (r *RegionStatistics) GetOfflineRegionStatsByType(typ RegionStatisticType) []*core.RegionInfo {
	r.RLock()
	defer r.RUnlock()
	offlineRegions := r.offlineStats[typ]
	// Size the result from the map that is actually iterated (offlineStats, not stats),
	// so the capacity hint matches the number of appended elements.
	res := make([]*core.RegionInfo, 0, len(offlineRegions))
	// Use a distinct loop variable name so the receiver r is not shadowed.
	for _, region := range offlineRegions {
		res = append(res, region.Clone())
	}
	return res
}

func (r *RegionStatistics) deleteEntry(deleteIndex RegionStatisticType, regionID uint64) {
for typ := RegionStatisticType(1); typ <= deleteIndex; typ <<= 1 {
if deleteIndex&typ != 0 {
Expand All @@ -151,14 +154,6 @@ func (r *RegionStatistics) deleteEntry(deleteIndex RegionStatisticType, regionID
}
}

// deleteOfflineEntry removes regionID from the offlineStats map of every
// statistic type whose bit is set in deleteIndex. Bits are probed one at a
// time from the lowest upwards until the probe exceeds deleteIndex.
func (r *RegionStatistics) deleteOfflineEntry(deleteIndex RegionStatisticType, regionID uint64) {
	for bit := RegionStatisticType(1); bit <= deleteIndex; bit <<= 1 {
		if deleteIndex&bit == 0 {
			// This type is not flagged for deletion.
			continue
		}
		delete(r.offlineStats[bit], regionID)
	}
}

// RegionStatsNeedUpdate checks whether the region's status need to be updated
// due to some special state types.
func (r *RegionStatistics) RegionStatsNeedUpdate(region *core.RegionInfo) bool {
Expand All @@ -175,15 +170,13 @@ func (r *RegionStatistics) RegionStatsNeedUpdate(region *core.RegionInfo) bool {
func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.StoreInfo) {
r.Lock()
defer r.Unlock()
// Region state.
regionID := region.GetID()
var (
peerTypeIndex RegionStatisticType
offlinePeerTypeIndex RegionStatisticType
deleteIndex RegionStatisticType
desiredReplicas = r.conf.GetMaxReplicas()
desiredVoters = desiredReplicas
peerTypeIndex RegionStatisticType
deleteIndex RegionStatisticType
)
desiredReplicas := r.conf.GetMaxReplicas()
desiredVoters := desiredReplicas
// Check if the region meets count requirements of its rules.
if r.conf.IsPlacementRulesEnabled() {
if !r.ruleManager.IsInitialized() {
log.Warn("ruleManager haven't been initialized")
Expand All @@ -199,19 +192,6 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store
}
}
}

var isRemoving bool

for _, store := range stores {
if store.IsRemoving() {
peer := region.GetStorePeer(store.GetID())
if peer != nil {
isRemoving = true
break
}
}
}

// Better to make sure once any of these conditions changes, it will trigger the heartbeat `save_cache`.
// Otherwise, the state may be out-of-date for a long time, which needs another way to apply the change ASAP.
// For example, see `RegionStatsNeedUpdate` above to know how `OversizedRegion` and `UndersizedRegion` are updated.
Expand All @@ -220,6 +200,17 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store
ExtraPeer: len(region.GetPeers()) > desiredReplicas,
DownPeer: len(region.GetDownPeers()) > 0,
PendingPeer: len(region.GetPendingPeers()) > 0,
OfflinePeer: func() bool {
for _, store := range stores {
if store.IsRemoving() {
peer := region.GetStorePeer(store.GetID())
if peer != nil {
return true
}
}
}
return false
}(),
LearnerPeer: len(region.GetLearners()) > 0,
EmptyRegion: region.GetApproximateSize() <= core.EmptyRegionApproximateSize,
OversizedRegion: region.IsOversized(
Expand All @@ -232,18 +223,13 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store
),
WitnessLeader: region.GetLeader().GetIsWitness(),
}

// Check if the region meets any of the conditions and update the corresponding info.
regionID := region.GetID()
for typ, c := range conditions {
if c {
if isRemoving && typ < EmptyRegion {
r.offlineStats[typ][regionID] = region
offlinePeerTypeIndex |= typ
}
info := r.stats[typ][regionID]
if info == nil {
info = &RegionInfo{
RegionInfo: region,
}
info = &RegionInfoWithTS{id: regionID}
}
if typ == DownPeer {
if info.startDownPeerTS != 0 {
Expand All @@ -263,18 +249,7 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store
peerTypeIndex |= typ
}
}

if isRemoving {
r.offlineStats[OfflinePeer][regionID] = region
offlinePeerTypeIndex |= OfflinePeer
}

if oldIndex, ok := r.offlineIndex[regionID]; ok {
deleteIndex = oldIndex &^ offlinePeerTypeIndex
}
r.deleteOfflineEntry(deleteIndex, regionID)
r.offlineIndex[regionID] = offlinePeerTypeIndex

// Remove the info if any of the conditions are not met any more.
if oldIndex, ok := r.index[regionID]; ok {
deleteIndex = oldIndex &^ peerTypeIndex
}
Expand All @@ -289,9 +264,6 @@ func (r *RegionStatistics) ClearDefunctRegion(regionID uint64) {
if oldIndex, ok := r.index[regionID]; ok {
r.deleteEntry(oldIndex, regionID)
}
if oldIndex, ok := r.offlineIndex[regionID]; ok {
r.deleteOfflineEntry(oldIndex, regionID)
}
}

// Collect collects the metrics of the regions' status.
Expand All @@ -302,18 +274,12 @@ func (r *RegionStatistics) Collect() {
regionExtraPeerRegionCounter.Set(float64(len(r.stats[ExtraPeer])))
regionDownPeerRegionCounter.Set(float64(len(r.stats[DownPeer])))
regionPendingPeerRegionCounter.Set(float64(len(r.stats[PendingPeer])))
regionOfflinePeerRegionCounter.Set(float64(len(r.stats[OfflinePeer])))
regionLearnerPeerRegionCounter.Set(float64(len(r.stats[LearnerPeer])))
regionEmptyRegionCounter.Set(float64(len(r.stats[EmptyRegion])))
regionOversizedRegionCounter.Set(float64(len(r.stats[OversizedRegion])))
regionUndersizedRegionCounter.Set(float64(len(r.stats[UndersizedRegion])))
regionWitnesssLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader])))

offlineMissPeerRegionCounter.Set(float64(len(r.offlineStats[MissPeer])))
offlineExtraPeerRegionCounter.Set(float64(len(r.offlineStats[ExtraPeer])))
offlineDownPeerRegionCounter.Set(float64(len(r.offlineStats[DownPeer])))
offlinePendingPeerRegionCounter.Set(float64(len(r.offlineStats[PendingPeer])))
offlineLearnerPeerRegionCounter.Set(float64(len(r.offlineStats[LearnerPeer])))
offlineOfflinePeerRegionCounter.Set(float64(len(r.offlineStats[OfflinePeer])))
regionWitnessLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader])))
}

// Reset resets the metrics of the regions' status.
Expand All @@ -322,18 +288,12 @@ func (r *RegionStatistics) Reset() {
regionExtraPeerRegionCounter.Set(0)
regionDownPeerRegionCounter.Set(0)
regionPendingPeerRegionCounter.Set(0)
regionOfflinePeerRegionCounter.Set(0)
regionLearnerPeerRegionCounter.Set(0)
regionEmptyRegionCounter.Set(0)
regionOversizedRegionCounter.Set(0)
regionUndersizedRegionCounter.Set(0)
regionWitnesssLeaderRegionCounter.Set(0)

offlineMissPeerRegionCounter.Set(0)
offlineExtraPeerRegionCounter.Set(0)
offlineDownPeerRegionCounter.Set(0)
offlinePendingPeerRegionCounter.Set(0)
offlineLearnerPeerRegionCounter.Set(0)
offlineOfflinePeerRegionCounter.Set(0)
regionWitnessLeaderRegionCounter.Set(0)
}

// LabelStatistics is the statistics of the level of labels.
Expand Down
Loading

0 comments on commit cf93563

Please sign in to comment.