diff --git a/app/horus/base/config/config.go b/app/horus/base/config/config.go index 1001e630..fd553966 100644 --- a/app/horus/base/config/config.go +++ b/app/horus/base/config/config.go @@ -77,6 +77,7 @@ type DowntimeConfiguration struct { AllSystemUser string `yaml:"allSystemUser"` AllSystemPassword string `yaml:"allSystemPassword"` DingTalk *DingTalkConfiguration `yaml:"dingTalk"` + Slack *SlackConfiguration `yaml:"slack"` } type CleanerConfiguration struct { diff --git a/app/horus/base/db/db.go b/app/horus/base/db/db.go index 75a43ddd..b1afe246 100644 --- a/app/horus/base/db/db.go +++ b/app/horus/base/db/db.go @@ -40,8 +40,8 @@ type NodeDataInfo struct { UpdateTime time.Time `json:"update_time" xorm:"update_time updated"` RecoveryMark int64 `json:"recovery_mark" xorm:"recovery_mark"` RecoveryQL string `json:"recovery_ql" xorm:"recovery_ql"` - DownTimeRecoveryQL []string `json:"downtime_recovery_ql xorm:downtime_recovery_ql"` - DownTimeRecoveryMark int64 `json:"downtime_recovery_mark xorm:downtime_recovery_mark"` + DownTimeRecoveryMark int64 `json:"downtime_recovery_mark" xorm:"downtime_recovery_mark"` + DownTimeRecoveryQL string `json:"downtime_recovery_ql" xorm:"downtime_recovery_ql"` } type PodDataInfo struct { @@ -122,6 +122,11 @@ func (n *NodeDataInfo) RecoveryMarker() (bool, error) { return n.Update() } +func (n *NodeDataInfo) DownTimeRecoveryMarker() (bool, error) { + n.DownTimeRecoveryMark = 1 + return n.Update() +} + func (n *NodeDataInfo) RestartMarker() (bool, error) { n.Restart = 1 return n.Update() @@ -149,6 +154,13 @@ func GetRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) { return ndi, err } +func GetDownTimeRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) { + var ndi []NodeDataInfo + session := db.Where(fmt.Sprintf("downtime_recovery_mark = 0 AND first_date > DATE_SUB(CURDATE(), INTERVAL %d DAY)", day)) + err := session.Find(&ndi) + return ndi, err +} + func GetRestartNodeDataInfoDate() ([]NodeDataInfo, error) { var ndi []NodeDataInfo session := db.Where("restart = 0 and repair = 0 and module_name = ?", "nodeDown") diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go index 0ddeacdb..c1ce6bd1 100644 --- a/app/horus/core/horuser/node_downtime.go +++ b/app/horus/core/horuser/node_downtime.go @@ -71,7 +71,8 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) { klog.Infof("clusterName:%v\n", clusterName) nodeDownTimeRes := make(map[string]int) - cq := len(h.cc.NodeDownTime.AbnormalityQL) + aq := len(h.cc.NodeDownTime.AbnormalityQL) + rq := len(h.cc.NodeDownTime.AbnormalRecoveryQL) for _, ql := range h.cc.NodeDownTime.AbnormalityQL { ql := ql res, err := h.InstantQuery(addr, ql, clusterName, h.cc.NodeDownTime.PromQueryTimeSecond) @@ -92,12 +93,34 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) { nodeDownTimeRes[nodeName]++ } } + + for _, ql := range h.cc.NodeDownTime.AbnormalRecoveryQL { + ql := ql + res, err := h.InstantQuery(addr, ql, clusterName, h.cc.NodeDownTime.PromQueryTimeSecond) + if err != nil { + klog.Errorf("downtimeNodes InstantQuery err:%v", err) + klog.Infof("clusterName:%v\n", clusterName) + continue + } + + for _, v := range res { + v := v + nodeName := string(v.Metric["node"]) + if nodeName == "" { + klog.Error("downtimeNodes InstantQuery nodeName empty.") + klog.Infof("clusterName:%v\n metric:%v\n", clusterName, v.Metric) + continue + } + nodeDownTimeRes[nodeName]++ + } + } + WithDownNodeIPs := make(map[string]string) for node, count := range nodeDownTimeRes { - if count < cq { - klog.Error("downtimeNodes node not reach threshold") - klog.Infof("clusterName:%v\n nodeName:%v\n threshold:%v count:%v", clusterName, node, cq, count) + if count < aq { + klog.Error("downtimeNodes not reach threshold") + klog.Infof("clusterName:%v\n nodeName:%v\n threshold:%v count:%v", clusterName, node, aq, count) continue } abnormalInfoSystemQL := fmt.Sprintf(h.cc.NodeDownTime.AbnormalInfoSystemQL, node) @@ -142,12 +165,19 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) { return "", nil }() + for node, count := range nodeDownTimeRes { + if count < rq { + klog.Error("downtimeNodes not reach recovery threshold") + klog.Infof("clusterName:%v\n nodeName:%v\n threshold:%v count:%v", clusterName, node, aq, count) + continue + } + } + write := db.NodeDataInfo{ NodeName: nodeName, NodeIP: nodeIP, ClusterName: clusterName, ModuleName: NODE_DOWN, - RecoveryQL: , } exist, _ := write.Check() if exist { diff --git a/app/horus/core/horuser/node_recovery.go b/app/horus/core/horuser/node_recovery.go index cdda8932..91650618 100644 --- a/app/horus/core/horuser/node_recovery.go +++ b/app/horus/core/horuser/node_recovery.go @@ -93,3 +93,44 @@ func (h *Horuser) recoveryNodes(n db.NodeDataInfo) { } klog.Infof("RecoveryMarker result success:%v", success) } + +func (h *Horuser) DownTimeRecoveryNodes(n db.NodeDataInfo) { + promAddr := h.cc.PromMultiple[n.ClusterName] + if promAddr == "" { + klog.Error("recoveryNodes promAddr by clusterName empty.") + klog.Infof("clusterName:%v nodeName:%v", n.ClusterName, n.NodeName) + return + } + vecs, err := h.InstantQuery(promAddr, n.DownTimeRecoveryQL, n.ClusterName, h.cc.NodeDownTime.PromQueryTimeSecond) + if err != nil { + klog.Errorf("recoveryNodes InstantQuery err:%v", err) + klog.Infof("recoveryQL:%v", n.DownTimeRecoveryQL) + return + } + if len(vecs) != 1 { + klog.Infof("Expected 1 result, but got:%d", len(vecs)) + return + } + if err != nil { + klog.Errorf("recoveryNodes InstantQuery err:%v", err) + klog.Infof("recoveryQL:%v", n.DownTimeRecoveryQL) + return + } + klog.Info("recoveryNodes InstantQuery success.") + + err = h.UnCordon(n.NodeName, n.ClusterName) + res := "Success" + if err != nil { + res = fmt.Sprintf("result failed:%v", err) + } + msg := fmt.Sprintf("\n【集群: %v】\n【宕机节点已达到恢复临界点】\n【已恢复调度节点: %v】\n【处理结果:%v】\n【日期: %v】\n", n.ClusterName, n.NodeName, res, n.CreateTime) + alerter.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg) + alerter.SlackSend(h.cc.NodeDownTime.Slack, msg) + + success, err := n.DownTimeRecoveryMarker() + if err != nil { + klog.Errorf("DownTimeRecoveryMarker result failed err:%v", err) + return + } + klog.Infof("DownTimeRecoveryMarker result success:%v", success) +} diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml index 22eb575f..02ce3bc0 100644 --- a/manifests/horus/horus.yaml +++ b/manifests/horus/horus.yaml @@ -64,7 +64,7 @@ customModular: title: "自定义通知" nodeDownTime: - enabled: false + enabled: true intervalSecond: 15 promQueryTimeSecond: 60 abnormalityQL: @@ -91,7 +91,7 @@ nodeDownTime: title: "自定义通知" podStagnationCleaner: - enabled: true + enabled: false intervalSecond: 15 doubleSecond: 60 fieldSelector: "status.phase!=Running"