From 6118649162fbe2eb23f323d4baefa96bd59f3e4f Mon Sep 17 00:00:00 2001 From: mfordjody <11638005@qq.com> Date: Wed, 9 Oct 2024 09:49:35 +0800 Subject: [PATCH] [horus] Fixes and optimizations (#441) --- app/horus/base/config/config.go | 1 - app/horus/cmd/main.go | 27 ++++++++++++++----------- app/horus/core/horuser/node_downtime.go | 4 ++-- app/horus/core/horuser/node_restart.go | 4 ++-- manifests/horus/horus.yaml | 11 +++++----- 5 files changed, 24 insertions(+), 23 deletions(-) diff --git a/app/horus/base/config/config.go b/app/horus/base/config/config.go index 34d813a9..fdb38d1c 100644 --- a/app/horus/base/config/config.go +++ b/app/horus/base/config/config.go @@ -47,7 +47,6 @@ type SlackConfiguration struct { } type RecoveryConfiguration struct { - Enabled bool `yaml:"enabled"` DayNumber int `yaml:"dayNumber"` IntervalSecond int `yaml:"intervalSecond"` PromQueryTimeSecond int64 `yaml:"promQueryTimeSecond"` diff --git a/app/horus/cmd/main.go b/app/horus/cmd/main.go index 8b3e4a76..cb15d52e 100644 --- a/app/horus/cmd/main.go +++ b/app/horus/cmd/main.go @@ -79,6 +79,7 @@ func main() { err := srv.ListenAndServe() if err != nil { klog.Errorf("horus metrics err:%v", err) + return err } return nil }) @@ -87,16 +88,16 @@ func main() { err := ticker.Manager(ctx) if err != nil { klog.Errorf("horus ticker manager start failed err:%v", err) + return err } return nil }) group.Add(func() error { - if c.NodeRecovery.Enabled { - klog.Info("horus node recovery manager start success.") - err := horus.RecoveryManager(ctx) - if err != nil { - klog.Errorf("horus node recovery manager start failed err:%v", err) - } + klog.Info("horus node recovery manager start success.") + err := horus.RecoveryManager(ctx) + if err != nil { + klog.Errorf("horus node recovery manager start failed err:%v", err) + return err } return nil }) @@ -106,6 +107,7 @@ func main() { err := horus.CustomizeModularManager(ctx) if err != nil { klog.Errorf("horus node customize modular manager start failed err:%v", err) + return err } } return nil @@ -116,17 +118,17 @@ func main() { err := horus.DownTimeManager(ctx) if err != nil { klog.Errorf("horus node downtime manager start failed err:%v", err) + return err } } return nil }) group.Add(func() error { - if c.NodeDownTime.Enabled { - klog.Info("horus node downtime restart manager start success.") - err := horus.DowntimeRestartManager(ctx) - if err != nil { - klog.Errorf("horus node downtime restart manager start failed err:%v", err) - } + klog.Info("horus node downtime restart manager start success.") + err := horus.DowntimeRestartManager(ctx) + if err != nil { + klog.Errorf("horus node downtime restart manager start failed err:%v", err) + return err } return nil }) @@ -136,6 +138,7 @@ func main() { err := horus.PodStagnationCleanManager(ctx) if err != nil { klog.Errorf("horus pod stagnation clean manager start failed err:%v", err) + return err } } return nil diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go index 51b66a4c..6cb57404 100644 --- a/app/horus/core/horuser/node_downtime.go +++ b/app/horus/core/horuser/node_downtime.go @@ -28,7 +28,7 @@ import ( ) const ( - NODE_DOWN = "node_down" + NODE_DOWN = "nodeDown" NODE_DOWN_REASON = "downtime" ) @@ -118,7 +118,7 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) { WithDownNodeIPs[node] = str } - msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机标准:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs)) + msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机临界点:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs)) newfound := 0 for nodeName, _ := range WithDownNodeIPs { diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go index c331fe0e..d8aedbec 100644 --- a/app/horus/core/horuser/node_restart.go +++ b/app/horus/core/horuser/node_restart.go @@ -83,8 +83,8 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) { klog.Infof("RestartMarker did not success for node %v", node.NodeName) } - if node.Restart < 2 { - klog.Info("It's been rebooted once.") + if node.Restart > 2 { + klog.Error("It's been rebooted once.") return } } diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml index ba1218bf..6a52d9a4 100644 --- a/manifests/horus/horus.yaml +++ b/manifests/horus/horus.yaml @@ -28,7 +28,6 @@ promMultiple: cluster: http://192.168.15.134:31974 nodeRecovery: - enabled: false dayNumber: 1 intervalSecond: 15 promQueryTimeSecond: 60 @@ -65,12 +64,12 @@ customModular: title: "自定义通知" nodeDownTime: - enabled: false - intervalSecond: 5 + enabled: true + intervalSecond: 15 promQueryTimeSecond: 60 abnormalityQL: - - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 13 -# - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes )) * 100 < 20 + - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 20 + - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes )) * 100 > 25 # - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 15 abnormalInfoSystemQL: node_os_info{node="%s"} @@ -88,7 +87,7 @@ nodeDownTime: title: "自定义通知" podStagnationCleaner: - enabled: true + enabled: false intervalSecond: 15 doubleSecond: 60 fieldSelector: "status.phase!=Running"