Skip to content

Commit

Permalink
[horus] Organize downtime recovery logic (#445)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfordjody authored Oct 10, 2024
1 parent 1be672d commit 06b5e00
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 9 deletions.
1 change: 1 addition & 0 deletions app/horus/base/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ type DowntimeConfiguration struct {
AllSystemUser string `yaml:"allSystemUser"`
AllSystemPassword string `yaml:"allSystemPassword"`
DingTalk *DingTalkConfiguration `yaml:"dingTalk"`
Slack *SlackConfiguration `yaml:"slack"`
}

type CleanerConfiguration struct {
Expand Down
16 changes: 14 additions & 2 deletions app/horus/base/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ type NodeDataInfo struct {
UpdateTime time.Time `json:"update_time" xorm:"update_time updated"`
RecoveryMark int64 `json:"recovery_mark" xorm:"recovery_mark"`
RecoveryQL string `json:"recovery_ql" xorm:"recovery_ql"`
DownTimeRecoveryQL []string `json:"downtime_recovery_ql xorm:downtime_recovery_ql"`
DownTimeRecoveryMark int64 `json:"downtime_recovery_mark xorm:downtime_recovery_mark"`
DownTimeRecoveryMark int64 `json:"downtime_recovery_mark" xorm:"downtime_recovery_mark"`
DownTimeRecoveryQL string `json:"downtime_recovery_ql" xorm:"downtime_recovery_ql"`
}

type PodDataInfo struct {
Expand Down Expand Up @@ -122,6 +122,11 @@ func (n *NodeDataInfo) RecoveryMarker() (bool, error) {
return n.Update()
}

// DownTimeRecoveryMarker sets DownTimeRecoveryMark to 1 on the receiver and
// then calls n.Update, returning its results unchanged.
func (n *NodeDataInfo) DownTimeRecoveryMarker() (bool, error) {
	const marked int64 = 1
	n.DownTimeRecoveryMark = marked
	updated, err := n.Update()
	return updated, err
}

func (n *NodeDataInfo) RestartMarker() (bool, error) {
n.Restart = 1
return n.Update()
Expand Down Expand Up @@ -149,6 +154,13 @@ func GetRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) {
return ndi, err
}

// GetDownTimeRecoveryNodeDataInfoDate queries NodeDataInfo rows whose
// downtime_recovery_mark is 0 and whose first_date is later than
// DATE_SUB(CURDATE(), INTERVAL day DAY).
//
// The slice filled by Find is returned together with Find's error, so the
// caller must check the error before trusting the slice.
func GetDownTimeRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) {
	condition := fmt.Sprintf("downtime_recovery_mark = 0 AND first_date > DATE_SUB(CURDATE(), INTERVAL %d DAY)", day)

	var nodes []NodeDataInfo
	err := db.Where(condition).Find(&nodes)
	return nodes, err
}

func GetRestartNodeDataInfoDate() ([]NodeDataInfo, error) {
var ndi []NodeDataInfo
session := db.Where("restart = 0 and repair = 0 and module_name = ?", "nodeDown")
Expand Down
40 changes: 35 additions & 5 deletions app/horus/core/horuser/node_downtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
klog.Infof("clusterName:%v\n", clusterName)

nodeDownTimeRes := make(map[string]int)
cq := len(h.cc.NodeDownTime.AbnormalityQL)
aq := len(h.cc.NodeDownTime.AbnormalityQL)
rq := len(h.cc.NodeDownTime.AbnormalRecoveryQL)
for _, ql := range h.cc.NodeDownTime.AbnormalityQL {
ql := ql
res, err := h.InstantQuery(addr, ql, clusterName, h.cc.NodeDownTime.PromQueryTimeSecond)
Expand All @@ -92,12 +93,34 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
nodeDownTimeRes[nodeName]++
}
}

for _, ql := range h.cc.NodeDownTime.AbnormalRecoveryQL {
ql := ql
res, err := h.InstantQuery(addr, ql, clusterName, h.cc.NodeDownTime.PromQueryTimeSecond)
if err != nil {
klog.Errorf("downtimeNodes InstantQuery err:%v", err)
klog.Infof("clusterName:%v\n", clusterName)
continue
}

for _, v := range res {
v := v
nodeName := string(v.Metric["node"])
if nodeName == "" {
klog.Error("downtimeNodes InstantQuery nodeName empty.")
klog.Infof("clusterName:%v\n metric:%v\n", clusterName, v.Metric)
continue
}
nodeDownTimeRes[nodeName]++
}
}

WithDownNodeIPs := make(map[string]string)

for node, count := range nodeDownTimeRes {
if count < cq {
klog.Error("downtimeNodes node not reach threshold")
klog.Infof("clusterName:%v\n nodeName:%v\n threshold:%v count:%v", clusterName, node, cq, count)
if count < aq {
klog.Error("downtimeNodes not reach threshold")
klog.Infof("clusterName:%v\n nodeName:%v\n threshold:%v count:%v", clusterName, node, aq, count)
continue
}
abnormalInfoSystemQL := fmt.Sprintf(h.cc.NodeDownTime.AbnormalInfoSystemQL, node)
Expand Down Expand Up @@ -142,12 +165,19 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
return "", nil
}()

for node, count := range nodeDownTimeRes {
if count < rq {
klog.Error("downtimeNodes not reach recovery threshold")
klog.Infof("clusterName:%v\n nodeName:%v\n threshold:%v count:%v", clusterName, node, aq, count)
continue
}
}

write := db.NodeDataInfo{
NodeName: nodeName,
NodeIP: nodeIP,
ClusterName: clusterName,
ModuleName: NODE_DOWN,
RecoveryQL: ,
}
exist, _ := write.Check()
if exist {
Expand Down
41 changes: 41 additions & 0 deletions app/horus/core/horuser/node_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,44 @@ func (h *Horuser) recoveryNodes(n db.NodeDataInfo) {
}
klog.Infof("RecoveryMarker result success:%v", success)
}

// DownTimeRecoveryNodes processes one down-time node record: it evaluates the
// record's recovery query and, when the query yields exactly one result,
// uncordons the node, sends notifications and marks the record.
//
// Flow:
//   - Look up the Prometheus address for n.ClusterName in h.cc.PromMultiple;
//     return if none is configured.
//   - Run n.DownTimeRecoveryQL through h.InstantQuery; return on error or when
//     the number of returned results is not exactly 1.
//   - Call h.UnCordon for the node; the outcome is embedded in the
//     notification message rather than aborting the flow.
//   - Send the message via DingTalk and Slack.
//   - Call n.DownTimeRecoveryMarker and log its result.
//
// n is received by value, so the mark set by DownTimeRecoveryMarker is applied
// to this function's local copy (and to whatever Update persists), not to the
// caller's struct.
func (h *Horuser) DownTimeRecoveryNodes(n db.NodeDataInfo) {
	promAddr := h.cc.PromMultiple[n.ClusterName]
	if promAddr == "" {
		klog.Error("DownTimeRecoveryNodes promAddr by clusterName empty.")
		klog.Infof("clusterName:%v nodeName:%v", n.ClusterName, n.NodeName)
		return
	}

	vecs, err := h.InstantQuery(promAddr, n.DownTimeRecoveryQL, n.ClusterName, h.cc.NodeDownTime.PromQueryTimeSecond)
	if err != nil {
		klog.Errorf("DownTimeRecoveryNodes InstantQuery err:%v", err)
		klog.Infof("recoveryQL:%v", n.DownTimeRecoveryQL)
		return
	}
	// Only a single matching result is accepted; anything else leaves the
	// record untouched so it can be re-evaluated later.
	if len(vecs) != 1 {
		klog.Infof("Expected 1 result, but got:%d", len(vecs))
		return
	}
	klog.Info("DownTimeRecoveryNodes InstantQuery success.")

	// NOTE(review): an UnCordon failure is only reported in the notification;
	// the record is still marked below. Confirm this is the intended behavior.
	err = h.UnCordon(n.NodeName, n.ClusterName)
	res := "Success"
	if err != nil {
		res = fmt.Sprintf("result failed:%v", err)
	}
	msg := fmt.Sprintf("\n【集群: %v】\n【宕机节点已达到恢复临界点】\n【已恢复调度节点: %v】\n【处理结果:%v】\n【日期: %v】\n", n.ClusterName, n.NodeName, res, n.CreateTime)
	alerter.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
	alerter.SlackSend(h.cc.NodeDownTime.Slack, msg)

	success, err := n.DownTimeRecoveryMarker()
	if err != nil {
		klog.Errorf("DownTimeRecoveryMarker result failed err:%v", err)
		return
	}
	klog.Infof("DownTimeRecoveryMarker result success:%v", success)
}
4 changes: 2 additions & 2 deletions manifests/horus/horus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ customModular:
title: "自定义通知"

nodeDownTime:
enabled: false
enabled: true
intervalSecond: 15
promQueryTimeSecond: 60
abnormalityQL:
Expand All @@ -91,7 +91,7 @@ nodeDownTime:
title: "自定义通知"

podStagnationCleaner:
enabled: true
enabled: false
intervalSecond: 15
doubleSecond: 60
fieldSelector: "status.phase!=Running"
Expand Down

0 comments on commit 06b5e00

Please sign in to comment.