Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[horus] Organize downtime recovery logic #445

Merged
merged 1 commit into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions app/horus/base/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ type DowntimeConfiguration struct {
AllSystemUser string `yaml:"allSystemUser"`
AllSystemPassword string `yaml:"allSystemPassword"`
DingTalk *DingTalkConfiguration `yaml:"dingTalk"`
Slack *SlackConfiguration `yaml:"slack"`
}

type CleanerConfiguration struct {
Expand Down
16 changes: 14 additions & 2 deletions app/horus/base/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ type NodeDataInfo struct {
UpdateTime time.Time `json:"update_time" xorm:"update_time updated"`
RecoveryMark int64 `json:"recovery_mark" xorm:"recovery_mark"`
RecoveryQL string `json:"recovery_ql" xorm:"recovery_ql"`
DownTimeRecoveryQL []string `json:"downtime_recovery_ql xorm:downtime_recovery_ql"`
DownTimeRecoveryMark int64 `json:"downtime_recovery_mark xorm:downtime_recovery_mark"`
DownTimeRecoveryMark int64 `json:"downtime_recovery_mark" xorm:"downtime_recovery_mark"`
DownTimeRecoveryQL string `json:"downtime_recovery_ql" xorm:"downtime_recovery_ql"`
}

type PodDataInfo struct {
Expand Down Expand Up @@ -122,6 +122,11 @@ func (n *NodeDataInfo) RecoveryMarker() (bool, error) {
return n.Update()
}

// DownTimeRecoveryMarker sets DownTimeRecoveryMark to 1 on the receiver and
// then calls Update, returning Update's results unchanged.
func (n *NodeDataInfo) DownTimeRecoveryMarker() (bool, error) {
	n.DownTimeRecoveryMark = 1
	updated, err := n.Update()
	return updated, err
}

func (n *NodeDataInfo) RestartMarker() (bool, error) {
n.Restart = 1
return n.Update()
Expand Down Expand Up @@ -149,6 +154,13 @@ func GetRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) {
return ndi, err
}

// GetDownTimeRecoveryNodeDataInfoDate returns the NodeDataInfo rows whose
// downtime_recovery_mark is 0 and whose first_date is later than the current
// date minus the given number of days.
//
// day is formatted into the SQL condition as an integer. The rows found and
// any error from the query are returned together.
func GetDownTimeRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) {
	condition := fmt.Sprintf("downtime_recovery_mark = 0 AND first_date > DATE_SUB(CURDATE(), INTERVAL %d DAY)", day)

	var nodes []NodeDataInfo
	err := db.Where(condition).Find(&nodes)
	return nodes, err
}

func GetRestartNodeDataInfoDate() ([]NodeDataInfo, error) {
var ndi []NodeDataInfo
session := db.Where("restart = 0 and repair = 0 and module_name = ?", "nodeDown")
Expand Down
40 changes: 35 additions & 5 deletions app/horus/core/horuser/node_downtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
klog.Infof("clusterName:%v\n", clusterName)

nodeDownTimeRes := make(map[string]int)
cq := len(h.cc.NodeDownTime.AbnormalityQL)
aq := len(h.cc.NodeDownTime.AbnormalityQL)
rq := len(h.cc.NodeDownTime.AbnormalRecoveryQL)
for _, ql := range h.cc.NodeDownTime.AbnormalityQL {
ql := ql
res, err := h.InstantQuery(addr, ql, clusterName, h.cc.NodeDownTime.PromQueryTimeSecond)
Expand All @@ -92,12 +93,34 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
nodeDownTimeRes[nodeName]++
}
}

for _, ql := range h.cc.NodeDownTime.AbnormalRecoveryQL {
ql := ql
res, err := h.InstantQuery(addr, ql, clusterName, h.cc.NodeDownTime.PromQueryTimeSecond)
if err != nil {
klog.Errorf("downtimeNodes InstantQuery err:%v", err)
klog.Infof("clusterName:%v\n", clusterName)
continue
}

for _, v := range res {
v := v
nodeName := string(v.Metric["node"])
if nodeName == "" {
klog.Error("downtimeNodes InstantQuery nodeName empty.")
klog.Infof("clusterName:%v\n metric:%v\n", clusterName, v.Metric)
continue
}
nodeDownTimeRes[nodeName]++
}
}

WithDownNodeIPs := make(map[string]string)

for node, count := range nodeDownTimeRes {
if count < cq {
klog.Error("downtimeNodes node not reach threshold")
klog.Infof("clusterName:%v\n nodeName:%v\n threshold:%v count:%v", clusterName, node, cq, count)
if count < aq {
klog.Error("downtimeNodes not reach threshold")
klog.Infof("clusterName:%v\n nodeName:%v\n threshold:%v count:%v", clusterName, node, aq, count)
continue
}
abnormalInfoSystemQL := fmt.Sprintf(h.cc.NodeDownTime.AbnormalInfoSystemQL, node)
Expand Down Expand Up @@ -142,12 +165,19 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
return "", nil
}()

for node, count := range nodeDownTimeRes {
if count < rq {
klog.Error("downtimeNodes not reach recovery threshold")
klog.Infof("clusterName:%v\n nodeName:%v\n threshold:%v count:%v", clusterName, node, aq, count)
continue
}
}

write := db.NodeDataInfo{
NodeName: nodeName,
NodeIP: nodeIP,
ClusterName: clusterName,
ModuleName: NODE_DOWN,
RecoveryQL: ,
}
exist, _ := write.Check()
if exist {
Expand Down
41 changes: 41 additions & 0 deletions app/horus/core/horuser/node_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,44 @@ func (h *Horuser) recoveryNodes(n db.NodeDataInfo) {
}
klog.Infof("RecoveryMarker result success:%v", success)
}

// DownTimeRecoveryNodes runs the node's DownTimeRecoveryQL as an instant query
// against the Prometheus address configured for n.ClusterName and, when the
// query returns exactly one result, uncordons the node.
//
// After the uncordon attempt it sends the outcome to the DingTalk and Slack
// targets configured under NodeDownTime and sets the downtime recovery mark on
// the record. The function returns early, doing nothing further, when no
// Prometheus address is configured for the cluster, when the query fails, or
// when the query returns anything other than one result.
func (h *Horuser) DownTimeRecoveryNodes(n db.NodeDataInfo) {
	promAddr := h.cc.PromMultiple[n.ClusterName]
	if promAddr == "" {
		klog.Error("downTimeRecoveryNodes promAddr by clusterName empty.")
		klog.Infof("clusterName:%v nodeName:%v", n.ClusterName, n.NodeName)
		return
	}

	vecs, err := h.InstantQuery(promAddr, n.DownTimeRecoveryQL, n.ClusterName, h.cc.NodeDownTime.PromQueryTimeSecond)
	if err != nil {
		klog.Errorf("downTimeRecoveryNodes InstantQuery err:%v", err)
		klog.Infof("recoveryQL:%v", n.DownTimeRecoveryQL)
		return
	}
	if len(vecs) != 1 {
		klog.Infof("Expected 1 result, but got:%d", len(vecs))
		return
	}
	klog.Info("downTimeRecoveryNodes InstantQuery success.")

	// The uncordon outcome is reported in the notification either way.
	res := "Success"
	if err := h.UnCordon(n.NodeName, n.ClusterName); err != nil {
		klog.Errorf("downTimeRecoveryNodes UnCordon err:%v", err)
		klog.Infof("clusterName:%v nodeName:%v", n.ClusterName, n.NodeName)
		res = fmt.Sprintf("result failed:%v", err)
	}
	msg := fmt.Sprintf("\n【集群: %v】\n【宕机节点已达到恢复临界点】\n【已恢复调度节点: %v】\n【处理结果:%v】\n【日期: %v】\n", n.ClusterName, n.NodeName, res, n.CreateTime)
	alerter.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
	alerter.SlackSend(h.cc.NodeDownTime.Slack, msg)

	// NOTE(review): the record is marked even when UnCordon failed, so a failed
	// uncordon does not appear to be retried — confirm this is intended.
	success, err := n.DownTimeRecoveryMarker()
	if err != nil {
		klog.Errorf("DownTimeRecoveryMarker result failed err:%v", err)
		return
	}
	klog.Infof("DownTimeRecoveryMarker result success:%v", success)
}
4 changes: 2 additions & 2 deletions manifests/horus/horus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ customModular:
title: "自定义通知"

nodeDownTime:
enabled: false
enabled: true
intervalSecond: 15
promQueryTimeSecond: 60
abnormalityQL:
Expand All @@ -91,7 +91,7 @@ nodeDownTime:
title: "自定义通知"

podStagnationCleaner:
enabled: true
enabled: false
intervalSecond: 15
doubleSecond: 60
fieldSelector: "status.phase!=Running"
Expand Down
Loading