diff --git a/app/horus/base/db/db.go b/app/horus/base/db/db.go index 112fc656..d552ad0d 100644 --- a/app/horus/base/db/db.go +++ b/app/horus/base/db/db.go @@ -149,7 +149,7 @@ func GetRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) { func GetRestartNodeDataInfoDate() ([]NodeDataInfo, error) { var ndi []NodeDataInfo - session := db.Where("restart = 0 and repair = 0 and module_name = ?", "node_down") + session := db.Where("restart = 0 and repair = 0 and module_name = ?", "nodeDown") err := session.Find(&ndi) return ndi, err } diff --git a/app/horus/cmd/main.go b/app/horus/cmd/main.go index cb15d52e..f67786a4 100644 --- a/app/horus/cmd/main.go +++ b/app/horus/cmd/main.go @@ -93,11 +93,13 @@ func main() { return nil }) group.Add(func() error { - klog.Info("horus node recovery manager start success.") - err := horus.RecoveryManager(ctx) - if err != nil { - klog.Errorf("horus node recovery manager start failed err:%v", err) - return err + if c.CustomModular.Enabled { + klog.Info("horus node recovery manager start success.") + err := horus.RecoveryManager(ctx) + if err != nil { + klog.Errorf("horus node recovery manager start failed err:%v", err) + return err + } } return nil }) diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go index 6cb57404..68c96beb 100644 --- a/app/horus/core/horuser/node_downtime.go +++ b/app/horus/core/horuser/node_downtime.go @@ -119,6 +119,7 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) { } msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机临界点:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs)) + newfound := 0 for nodeName, _ := range WithDownNodeIPs { diff --git a/app/horus/core/horuser/node_drain.go b/app/horus/core/horuser/node_drain.go index fd01562e..92434535 100644 --- a/app/horus/core/horuser/node_drain.go +++ b/app/horus/core/horuser/node_drain.go @@ -16,9 +16,7 @@ package horuser import ( - "context" "fmt" - corev1 "k8s.io/api/core/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog/v2" ) @@ -54,33 +52,16 @@ func (h *Horuser) Drain(nodeName, clusterName string) (err error) { break } } - klog.Errorf("node Drain evict pod result items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", items+1, count, nodeName, clusterName, pods.Name, pods.Namespace) if ds { continue } + klog.Errorf("node Drain evict pod result items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", items+1, count, nodeName, clusterName, pods.Name, pods.Namespace) + err = h.Evict(pods.Name, pods.Namespace, clusterName) if err != nil { klog.Errorf("node Drain evict pod err:%v items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", err, items+1, count, nodeName, clusterName, pods.Name, pods.Namespace) return err } - err = h.Finalizer(clusterName, pods.Name, pods.Namespace) - if err != nil { - klog.Errorf("node Drain finalizer pod err:%v items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", err, items+1, count, nodeName, clusterName, pods.Name, pods.Namespace) - return err - } - - var oldPod *corev1.Pod - var _ = h.Terminating(clusterName, oldPod) - newPod, _ := kubeClient.CoreV1().Pods(oldPod.Namespace).Get(context.Background(), oldPod.Name, v1.GetOptions{}) - if newPod == nil { - return err - } - if newPod.UID != oldPod.UID { - return err - } - if newPod.DeletionTimestamp.IsZero() { - return err - } } return nil } diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go index d8aedbec..b5b08faa 100644 --- a/app/horus/core/horuser/node_restart.go +++ b/app/horus/core/horuser/node_restart.go @@ -69,7 +69,7 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) { klog.Infof("RestartMarker result success:%v", success) if success { - msg := fmt.Sprintf("\n【等待宕机节点腾空后重启】\n【节点:%v】\n【日期:%v】\n【集群:%v】\n", node.NodeName, node.FirstDate, node.ClusterName) + msg := fmt.Sprintf("\n【宕机节点等待腾空后重启】\n【节点:%v】\n【日期:%v】\n【集群:%v】\n", node.NodeName, node.FirstDate, node.ClusterName) alerter.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg) cmd := exec.Command("/bin/bash", "core/horuser/restart.sh", node.NodeIP, h.cc.NodeDownTime.AllSystemUser, h.cc.NodeDownTime.AllSystemPassword) diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml index 6a52d9a4..fdac8695 100644 --- a/manifests/horus/horus.yaml +++ b/manifests/horus/horus.yaml @@ -25,7 +25,7 @@ kubeMultiple: cluster: config.1 promMultiple: - cluster: http://192.168.15.134:31974 + cluster: http://192.168.15.133:31974 nodeRecovery: dayNumber: 1 @@ -64,7 +64,7 @@ customModular: title: "自定义通知" nodeDownTime: - enabled: true + enabled: false intervalSecond: 15 promQueryTimeSecond: 60 abnormalityQL: