Skip to content

Commit

Permalink
[horus] Fixes and optimizations (#441)
Browse files: browse the repository at this point in the history
  • Loading branch information
mfordjody authored Oct 9, 2024
1 parent f94f3f5 commit 6118649
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 23 deletions.
1 change: 0 additions & 1 deletion app/horus/base/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ type SlackConfiguration struct {
}

type RecoveryConfiguration struct {
Enabled bool `yaml:"enabled"`
DayNumber int `yaml:"dayNumber"`
IntervalSecond int `yaml:"intervalSecond"`
PromQueryTimeSecond int64 `yaml:"promQueryTimeSecond"`
Expand Down
27 changes: 15 additions & 12 deletions app/horus/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ func main() {
err := srv.ListenAndServe()
if err != nil {
klog.Errorf("horus metrics err:%v", err)
return err
}
return nil
})
Expand All @@ -87,16 +88,16 @@ func main() {
err := ticker.Manager(ctx)
if err != nil {
klog.Errorf("horus ticker manager start failed err:%v", err)
return err
}
return nil
})
group.Add(func() error {
if c.NodeRecovery.Enabled {
klog.Info("horus node recovery manager start success.")
err := horus.RecoveryManager(ctx)
if err != nil {
klog.Errorf("horus node recovery manager start failed err:%v", err)
}
klog.Info("horus node recovery manager start success.")
err := horus.RecoveryManager(ctx)
if err != nil {
klog.Errorf("horus node recovery manager start failed err:%v", err)
return err
}
return nil
})
Expand All @@ -106,6 +107,7 @@ func main() {
err := horus.CustomizeModularManager(ctx)
if err != nil {
klog.Errorf("horus node customize modular manager start failed err:%v", err)
return err
}
}
return nil
Expand All @@ -116,17 +118,17 @@ func main() {
err := horus.DownTimeManager(ctx)
if err != nil {
klog.Errorf("horus node downtime manager start failed err:%v", err)
return err
}
}
return nil
})
group.Add(func() error {
if c.NodeDownTime.Enabled {
klog.Info("horus node downtime restart manager start success.")
err := horus.DowntimeRestartManager(ctx)
if err != nil {
klog.Errorf("horus node downtime restart manager start failed err:%v", err)
}
klog.Info("horus node downtime restart manager start success.")
err := horus.DowntimeRestartManager(ctx)
if err != nil {
klog.Errorf("horus node downtime restart manager start failed err:%v", err)
return err
}
return nil
})
Expand All @@ -136,6 +138,7 @@ func main() {
err := horus.PodStagnationCleanManager(ctx)
if err != nil {
klog.Errorf("horus pod stagnation clean manager start failed err:%v", err)
return err
}
}
return nil
Expand Down
4 changes: 2 additions & 2 deletions app/horus/core/horuser/node_downtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (
)

const (
NODE_DOWN = "node_down"
NODE_DOWN = "nodeDown"
NODE_DOWN_REASON = "downtime"
)

Expand Down Expand Up @@ -118,7 +118,7 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
WithDownNodeIPs[node] = str
}

msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n已达到宕机标准:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n已达到宕机临界点:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
newfound := 0

for nodeName, _ := range WithDownNodeIPs {
Expand Down
4 changes: 2 additions & 2 deletions app/horus/core/horuser/node_restart.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) {
klog.Infof("RestartMarker did not success for node %v", node.NodeName)
}

if node.Restart < 2 {
klog.Info("It's been rebooted once.")
if node.Restart > 2 {
klog.Error("It's been rebooted once.")
return
}
}
11 changes: 5 additions & 6 deletions manifests/horus/horus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ promMultiple:
cluster: http://192.168.15.134:31974

nodeRecovery:
enabled: false
dayNumber: 1
intervalSecond: 15
promQueryTimeSecond: 60
Expand Down Expand Up @@ -65,12 +64,12 @@ customModular:
title: "自定义通知"

nodeDownTime:
enabled: false
intervalSecond: 5
enabled: true
intervalSecond: 15
promQueryTimeSecond: 60
abnormalityQL:
- 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 13
# - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes )) * 100 < 20
- 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 20
- (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes )) * 100 > 25
# - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 15
abnormalInfoSystemQL:
node_os_info{node="%s"}
Expand All @@ -88,7 +87,7 @@ nodeDownTime:
title: "自定义通知"

podStagnationCleaner:
enabled: true
enabled: false
intervalSecond: 15
doubleSecond: 60
fieldSelector: "status.phase!=Running"
Expand Down

0 comments on commit 6118649

Please sign in to comment.