Skip to content

Commit

Permalink
[horus] Downtime function test results completed (#442)
Browse files: browse the repository at this point in the history
  • Loading branch information
mfordjody authored Oct 9, 2024
1 parent 6118649 commit def734b
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 30 deletions.
2 changes: 1 addition & 1 deletion app/horus/base/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ func GetRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) {

func GetRestartNodeDataInfoDate() ([]NodeDataInfo, error) {
var ndi []NodeDataInfo
session := db.Where("restart = 0 and repair = 0 and module_name = ?", "node_down")
session := db.Where("restart = 0 and repair = 0 and module_name = ?", "nodeDown")
err := session.Find(&ndi)
return ndi, err
}
Expand Down
12 changes: 7 additions & 5 deletions app/horus/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,13 @@ func main() {
return nil
})
group.Add(func() error {
klog.Info("horus node recovery manager start success.")
err := horus.RecoveryManager(ctx)
if err != nil {
klog.Errorf("horus node recovery manager start failed err:%v", err)
return err
if c.CustomModular.Enabled {
klog.Info("horus node recovery manager start success.")
err := horus.RecoveryManager(ctx)
if err != nil {
klog.Errorf("horus node recovery manager start failed err:%v", err)
return err
}
}
return nil
})
Expand Down
1 change: 1 addition & 0 deletions app/horus/core/horuser/node_downtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
}

msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机临界点:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))

newfound := 0

for nodeName, _ := range WithDownNodeIPs {
Expand Down
23 changes: 2 additions & 21 deletions app/horus/core/horuser/node_drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
package horuser

import (
"context"
"fmt"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
)
Expand Down Expand Up @@ -54,33 +52,16 @@ func (h *Horuser) Drain(nodeName, clusterName string) (err error) {
break
}
}
klog.Errorf("node Drain evict pod result items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)
if ds {
continue
}
klog.Errorf("node Drain evict pod result items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)

err = h.Evict(pods.Name, pods.Namespace, clusterName)
if err != nil {
klog.Errorf("node Drain evict pod err:%v items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", err, items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)
return err
}
err = h.Finalizer(clusterName, pods.Name, pods.Namespace)
if err != nil {
klog.Errorf("node Drain finalizer pod err:%v items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", err, items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)
return err
}

var oldPod *corev1.Pod
var _ = h.Terminating(clusterName, oldPod)
newPod, _ := kubeClient.CoreV1().Pods(oldPod.Namespace).Get(context.Background(), oldPod.Name, v1.GetOptions{})
if newPod == nil {
return err
}
if newPod.UID != oldPod.UID {
return err
}
if newPod.DeletionTimestamp.IsZero() {
return err
}
}
return nil
}
2 changes: 1 addition & 1 deletion app/horus/core/horuser/node_restart.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) {
klog.Infof("RestartMarker result success:%v", success)

if success {
msg := fmt.Sprintf("\n等待宕机节点腾空后重启\n【节点:%v】\n【日期:%v】\n【集群:%v】\n", node.NodeName, node.FirstDate, node.ClusterName)
msg := fmt.Sprintf("\n宕机节点等待腾空后重启\n【节点:%v】\n【日期:%v】\n【集群:%v】\n", node.NodeName, node.FirstDate, node.ClusterName)
alerter.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)

cmd := exec.Command("/bin/bash", "core/horuser/restart.sh", node.NodeIP, h.cc.NodeDownTime.AllSystemUser, h.cc.NodeDownTime.AllSystemPassword)
Expand Down
4 changes: 2 additions & 2 deletions manifests/horus/horus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ kubeMultiple:
cluster: config.1

promMultiple:
cluster: http://192.168.15.134:31974
cluster: http://192.168.15.133:31974

nodeRecovery:
dayNumber: 1
Expand Down Expand Up @@ -64,7 +64,7 @@ customModular:
title: "自定义通知"

nodeDownTime:
enabled: true
enabled: false
intervalSecond: 15
promQueryTimeSecond: 60
abnormalityQL:
Expand Down

0 comments on commit def734b

Please sign in to comment.