[horus] Simulate and fix test scenarios #401

Merged · 1 commit · Sep 29, 2024
app/horus/basic/config/file.go (2 changes: 1 addition & 1 deletion)
@@ -59,7 +59,7 @@ type ModularConfiguration struct {
     CordonDailyLimit    map[string]int    `yaml:"cordonDailyLimit"`
     AbnormalityQL       map[string]string `yaml:"abnormalityQL"`
     RecoveryQL          map[string]string `yaml:"recoveryQL"`
-    CheckIntervalSecond int               `yaml:"checkIntervalSecond"`
+    IntervalSecond      int               `yaml:"intervalSecond"`
     PromQueryTimeSecond int64             `yaml:"promQueryTimeSecond"`
     KubeMultiple        map[string]string `yaml:"kubeMultiple"`
     DingTalk            *DingTalkConfiguration `yaml:"dingTalk"`
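Note: the only change in this file is renaming CheckIntervalSecond to IntervalSecond so the struct's yaml tag matches the intervalSecond key used throughout manifests/horus/horus.yaml; judging from the yaml in this same PR, the old checkIntervalSecond tag never matched that key, which would have left the field at its zero value. A minimal sketch of how the tag drives parsing (assumes gopkg.in/yaml.v3 and a trimmed-down struct, not the project's actual loader):

// Minimal sketch (not the project's loader; assumes gopkg.in/yaml.v3):
// only a field whose yaml tag matches the key gets populated.
package main

import (
    "fmt"

    "gopkg.in/yaml.v3"
)

type ModularConfiguration struct {
    IntervalSecond      int   `yaml:"intervalSecond"`
    PromQueryTimeSecond int64 `yaml:"promQueryTimeSecond"`
}

func main() {
    raw := []byte("intervalSecond: 15\npromQueryTimeSecond: 60\n")

    var cfg ModularConfiguration
    if err := yaml.Unmarshal(raw, &cfg); err != nil {
        panic(err)
    }
    fmt.Printf("%+v\n", cfg) // {IntervalSecond:15 PromQueryTimeSecond:60}
}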
app/horus/core/horuser/node_cordon.go (8 changes: 4 additions & 4 deletions)
@@ -42,10 +42,10 @@ func (h *Horuser) Cordon(nodeName, clusterName, moduleName string) (err error) {
     annotations["dubbo.apache.org/disable-by"] = "horus"

     node.Spec.Unschedulable = true
-    if node.Spec.Unschedulable {
-        klog.Infof("Node %v is already cordoned.", nodeName)
-        return nil
-    }
+    //if node.Spec.Unschedulable != false {
+    //    klog.Infof("Node %v is already cordoned.", nodeName)
+    //    return
+    //}
     ctxSecond, cancelSecond := h.GetK8sContext()
     defer cancelSecond()
     node, err = kubeClient.CoreV1().Nodes().Update(ctxSecond, node, v1.UpdateOptions{})
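Note: the removed guard ran after node.Spec.Unschedulable had already been set to true, so it always returned nil before the Update call reached the API server and the node was never actually cordoned. This commit comments the guard out rather than reordering it; the mirror-image guard in UnCordon (node_uncordon.go further down) gets the same treatment. A hedged sketch, not the repo's code, of where an "already cordoned" check would normally sit, using standard client-go calls:

// Hedged sketch, not the PR's implementation: guard on the state read from the
// API server *before* mutating it, so the early return cannot mask the Update.
package example

import (
    "context"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

func cordon(ctx context.Context, client kubernetes.Interface, nodeName string) error {
    node, err := client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
    if err != nil {
        return err
    }
    if node.Spec.Unschedulable {
        return nil // already cordoned, nothing to do
    }
    node.Spec.Unschedulable = true
    _, err = client.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{})
    return err
}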
app/horus/core/horuser/node_modular.go (7 changes: 2 additions & 5 deletions)
@@ -27,7 +27,7 @@ import (
 )

 func (h *Horuser) CustomizeModularManager(ctx context.Context) error {
-    go wait.UntilWithContext(ctx, h.CustomizeModular, time.Duration(h.cc.CustomModular.CheckIntervalSecond)*time.Second)
+    go wait.UntilWithContext(ctx, h.CustomizeModular, time.Duration(h.cc.CustomModular.IntervalSecond)*time.Second)
     <-ctx.Done()
     return nil
 }

@@ -87,9 +87,6 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
         return
     }
     err = h.Cordon(nodeName, clusterName, moduleName)
-    if err != nil {
-        return
-    }

     write := db.NodeDataInfo{
         NodeName: nodeName,

@@ -118,7 +115,7 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
         res = fmt.Sprintf("failed:%v", err)
         klog.Errorf("Cordon failed:%v", res)
     }
-    msg := fmt.Sprintf("\n【集群:%v】\n【发现 %s 异常已禁止调度】\n【已禁止调度节点:%v】\n 【处理结果: %v】\n 【今日操作次数:%v】\n",
+    msg := fmt.Sprintf("\n【集群:%v】\n【发现 %s 达到禁止调度条件】\n【禁止调度节点:%v】\n 【处理结果: %v】\n 【今日操作次数:%v】\n",
         clusterName, moduleName, nodeName, res, len(data)+1)
     alert.DingTalkSend(h.cc.CustomModular.DingTalk, msg)
     alert.SlackSend(h.cc.CustomModular.Slack, msg)
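Note: three things change here. The renamed IntervalSecond field now feeds the polling period of CustomizeModular; the early return after h.Cordon is dropped, so a cordon failure no longer aborts the function and the node record plus the alert below are still produced, with res carrying the failure; and the DingTalk/Slack message wording shifts from roughly "detected %s abnormal, scheduling already disabled / already-cordoned node" to "%s met the cordon condition / cordoned node". A minimal sketch of the wait helper the interval feeds (stub check function, not horus's CustomizeModular):

// Minimal sketch (stub check function, not horus's CustomizeModular) of the
// k8s.io/apimachinery wait helper that IntervalSecond now feeds.
package main

import (
    "context"
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
    defer cancel()

    intervalSecond := 1 // stands in for h.cc.CustomModular.IntervalSecond
    check := func(ctx context.Context) {
        // horus would run the abnormalityQL queries against Prometheus here.
        fmt.Println("tick")
    }

    // Runs check immediately, then once per period, until ctx is cancelled.
    wait.UntilWithContext(ctx, check, time.Duration(intervalSecond)*time.Second)
}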
app/horus/core/horuser/node_recovery.go (6 changes: 1 addition & 5 deletions)
@@ -75,15 +75,11 @@ func (h *Horuser) recoveryNodes(n db.NodeDataInfo) {
     }
     klog.Infof("recoveryNodes check success.")
     err = h.UnCordon(n.NodeName, n.ClusterName)
-    if err == nil {
-        klog.Infof("Node %v is already uncordoned.", n.NodeName)
-        return
-    }
     res := "Success"
     if err != nil {
        res = fmt.Sprintf("failed:%v", err)
     }
-    msg := fmt.Sprintf("\n【集群: %v】\n【异常节点恢复调度】\n【已恢复调度节点: %v】\n【处理结果:%v】\n【日期: %v】\n", n.ClusterName, n.NodeName, res, n.CreateTime)
+    msg := fmt.Sprintf("\n【集群: %v】\n【封锁节点恢复调度】\n【已恢复调度节点: %v】\n【处理结果:%v】\n【日期: %v】\n", n.ClusterName, n.NodeName, res, n.CreateTime)
     alert.DingTalkSend(h.cc.NodeRecovery.DingTalk, msg)
     alert.SlackSend(h.cc.CustomModular.Slack, msg)
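Note: the deleted block had its condition inverted. `if err == nil` fired on a successful UnCordon and returned before the recovery alert was built, so a notification only went out when uncordoning failed. With it gone, both outcomes reach the message, and the header changes from 【异常节点恢复调度】 (roughly "abnormal node restored to scheduling") to 【封锁节点恢复调度】 ("cordoned node restored to scheduling"). A toy illustration of the surviving error-to-result mapping (hypothetical helper, not the repo's code):

// Toy illustration (hypothetical helper, not the repo's code): after the fix
// both a nil and a non-nil error from UnCordon produce a result string, and
// therefore an alert, instead of the nil case returning early.
package main

import "fmt"

func recoveryResult(err error) string {
    res := "Success"
    if err != nil {
        res = fmt.Sprintf("failed:%v", err)
    }
    return res
}

func main() {
    fmt.Println(recoveryResult(nil))                      // Success
    fmt.Println(recoveryResult(fmt.Errorf("conn reset"))) // failed:conn reset
}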
app/horus/core/horuser/node_restart.go (4 changes: 2 additions & 2 deletions)
@@ -52,9 +52,9 @@ func (h *Horuser) RestartOrRepair(ctx context.Context) {
 }

 func (h *Horuser) TryRestart(node db.NodeDataInfo) {
-    err := h.Drain(node.ClusterName, node.NodeName)
+    err := h.Drain(node.NodeName, node.ClusterName)
     if err != nil {
-        msg := fmt.Sprintf("\n【安全驱逐节点重启就绪:%v】\n", err)
+        msg := fmt.Sprintf("\n【安全驱逐节点:%v】\n", err)
         alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
         return
     }
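Note: the call is corrected to pass the node name first and the cluster name second, presumably matching Drain's (nodeName, clusterName) parameter order, the same order Cordon and UnCordon use; because both arguments are plain strings, the compiler could not flag the transposition. The alert text is also shortened from roughly "node safely drained, ready for restart: %v" to "safely drain node: %v", which reads better given it is only sent when Drain returns an error. A hedged sketch, not in the repo, of how defined string types would turn such a swap into a compile error:

// Hedged sketch, not the repo's code: defined string types make an
// argument transposition like the one fixed here a compile-time error.
package main

import "fmt"

type NodeName string
type ClusterName string

func drain(node NodeName, cluster ClusterName) error {
    fmt.Printf("draining %s in %s\n", node, cluster)
    return nil
}

func main() {
    _ = drain(NodeName("node-1"), ClusterName("cluster-a"))
    // _ = drain(ClusterName("cluster-a"), NodeName("node-1")) // does not compile
}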
app/horus/core/horuser/node_uncordon.go (9 changes: 4 additions & 5 deletions)
@@ -37,11 +37,10 @@ func (h *Horuser) UnCordon(nodeName, clusterName string) (err error) {
     }

     node.Spec.Unschedulable = false
-    if !node.Spec.Unschedulable {
-        klog.Infof("Node %v is already uncordoned.", nodeName)
-        return nil
-    }
-
+    //if node.Spec.Unschedulable != true {
+    //    klog.Infof("Node %v is already uncordoned.", nodeName)
+    //    return
+    //}
     ctxSecond, cancelSecond := h.GetK8sContext()
     defer cancelSecond()
     node, err = kubeClient.CoreV1().Nodes().Update(ctxSecond, node, v1.UpdateOptions{})
manifests/horus/horus.yaml (29 changes: 14 additions & 15 deletions)
@@ -14,7 +14,6 @@
 # limitations under the License.

 address: 0.0.0.0:38089
-
 kubeTimeSecond: 5

 mysql:

@@ -36,12 +35,12 @@ kubeMultiple:
   cluster: config.1

 promMultiple:
-  cluster: http://192.168.15.128:30484
+  cluster: http://192.168.15.128:30937

 nodeRecovery:
-  enabled: false
+  enabled: true
   dayNumber: 1
-  intervalSecond: 5
+  intervalSecond: 15
   promQueryTimeSecond: 60
   dingTalk:
     webhookUrl: "https://oapi.dingtalk.com/robot/send?access_token=37f8891e60e524013275cc01efafdb5976b81ef7269ce271b769bcd025826c12"

@@ -52,16 +51,16 @@ nodeRecovery:
     webhookUrl: "https://hooks.slack.com/services/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"

 customModular:
-  enabled: false
+  enabled: true
   cordonDailyLimit:
-    filesystem_readonly: 1
+    node_cpu: 1
   abnormalityQL:
-    filesystem_readonly: |-
-      node_filesystem_readonly{mountpoint="/"} != 1
+    node_cpu: |-
+      100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 30
   recoveryQL:
-    filesystem_readonly: |-
-      node_filesystem_readonly{mountpoint="/",node="%s"} == 0
-  intervalSecond: 5
+    node_cpu: |-
+      100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle",node="%s"}[5m])) * 100) < 10
+  intervalSecond: 15
   promQueryTimeSecond: 60
   kubeMultiple:
     cluster: config.1

@@ -74,8 +73,8 @@ customModular:
     webhookUrl: "https://hooks.slack.com/services/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"

 nodeDownTime:
-  enabled: true
-  intervalSecond: 5
+  enabled: false
+  intervalSecond: 15
   promQueryTimeSecond: 60
   abnormalityQL:
     - node_disk_info

@@ -92,8 +91,8 @@

 podAbnormal:
   enabled: false
-  intervalSecond: 5
-  doubleSecond: 10
+  intervalSecond: 15
+  doubleSecond: 60
   labelSelector: "app.kubernetes.io/name=horus"
   fieldSelector: "status.phase!=Running"
   kubeMultiple:
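Note: the manifest changes enable nodeRecovery and customModular, disable nodeDownTime, stretch every intervalSecond from 5 to 15 (and podAbnormal's doubleSecond from 10 to 60), point promMultiple at a different NodePort, and swap customModular's rule set from filesystem_readonly to node_cpu: cordon a node when its non-idle CPU averaged over 5m exceeds 30%, and treat it as recovered when that drops below 10%, with %s in the recoveryQL filled in with the node name. A hedged sketch of how such a recovery query could be evaluated (assumes github.com/prometheus/client_golang; "node-1" is a hypothetical node name):

// Hedged sketch (assumes github.com/prometheus/client_golang; "node-1" is a
// hypothetical node name): filling in the recoveryQL template's %s placeholder
// and evaluating it against the promMultiple endpoint.
package main

import (
    "context"
    "fmt"
    "time"

    "github.com/prometheus/client_golang/api"
    promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
    client, err := api.NewClient(api.Config{Address: "http://192.168.15.128:30937"})
    if err != nil {
        panic(err)
    }

    recoveryQL := `100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle",node="%s"}[5m])) * 100) < 10`
    query := fmt.Sprintf(recoveryQL, "node-1")

    ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) // 60s, loosely mirroring promQueryTimeSecond
    defer cancel()

    result, warnings, err := promv1.NewAPI(client).Query(ctx, query, time.Now())
    if err != nil {
        panic(err)
    }
    if len(warnings) > 0 {
        fmt.Println("warnings:", warnings)
    }
    fmt.Println(result) // a non-empty vector means the node is back under the 10% CPU threshold
}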