hana.go
package main

import (
	"encoding/json"
	"encoding/xml"
	"net/http"
	"os"
	"os/exec"
	"regexp"
	"strings"
	"sync"
	"time"

	//"github.com/MalloZup/hasap-alerts-handler/internal"
	log "github.com/sirupsen/logrus"
)
// CrmMon is the Pacemaker cluster status XML as reported by crm_mon
type CrmMon struct {
	XMLName   xml.Name `xml:"crm_mon"`
	Version   string   `xml:"version,attr"`
	Resources struct {
		Clone []struct {
			ID             string `xml:"id,attr"`
			MultiState     string `xml:"multi_state,attr"`
			Unique         string `xml:"unique,attr"`
			Managed        string `xml:"managed,attr"`
			Failed         string `xml:"failed,attr"`
			FailureIgnored string `xml:"failure_ignored,attr"`
			Resource       []struct {
				ID             string `xml:"id,attr"`
				ResourceAgent  string `xml:"resource_agent,attr"`
				Role           string `xml:"role,attr"`
				Active         string `xml:"active,attr"`
				Orphaned       string `xml:"orphaned,attr"`
				Blocked        string `xml:"blocked,attr"`
				Managed        string `xml:"managed,attr"`
				Failed         string `xml:"failed,attr"`
				FailureIgnored string `xml:"failure_ignored,attr"`
				NodesRunningOn string `xml:"nodes_running_on,attr"`
				Node           struct {
					Name   string `xml:"name,attr"`
					ID     string `xml:"id,attr"`
					Cached string `xml:"cached,attr"`
				} `xml:"node"`
			} `xml:"resource"`
		} `xml:"clone"`
	} `xml:"resources"`
}
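
// A minimal usage sketch of the mapping above (the clone ID shown is a
// made-up example; in a real cluster it comes from the CIB configuration):
//
//	out, err := exec.Command("/usr/sbin/crm_mon", "-X", "--inactive").Output()
//	if err == nil {
//		var mon CrmMon
//		if err := xml.Unmarshal(out, &mon); err == nil {
//			for _, c := range mon.Resources.Clone {
//				log.Infof("clone %s", c.ID) // e.g. "msl_SAPHana_HDB_HDB00"
//			}
//		}
//	}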
// HanaDiskFull type.
// Depending on its evaluation interval, Prometheus can send the same alert
// multiple times within a short window (e.g. 5s, depending on timeouts).
// The mutex ensures that only one handler runs at a time, so a handler is
// not invoked again while it is still performing its operation.
type HanaDiskFull struct {
	mu             sync.Mutex
	alertManagerIP string
}
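
// Hypothetical wiring of this handler (the actual entry point lives
// elsewhere in this repo; the route, port, and IP are assumptions):
//
//	h := &HanaDiskFull{alertManagerIP: "192.168.1.10"}
//	http.HandleFunc("/hana-disk-full", h.handlerHanaDiskFull)
//	log.Fatal(http.ListenAndServe(":8080", nil))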
// handlerHanaDiskFull handles the case where the HANA primary node has a full disk
func (ns *HanaDiskFull) handlerHanaDiskFull(_ http.ResponseWriter, req *http.Request) {
	// see the type description for why this mutex is needed
	ns.mu.Lock()
	defer ns.mu.Unlock()
	// read the JSON body sent by the Prometheus Alertmanager
	decoder := json.NewDecoder(req.Body)
	var alerts PromAlert
	err := decoder.Decode(&alerts)
	if err != nil {
		log.Warnf("error while decoding json from hana-handler http: %s", err)
		return
	}
	log.Infof("HanaDiskFullHandler called by %s", alerts.Receiver)
	for _, a := range alerts.Alerts {
		log.Infof("%s generated by %s", a.Labels.Alertname, a.GeneratorURL)
		// only act on alerts carrying the hana component and a selfhealing=true label
		if strings.ToLower(a.Labels.Component) != "hana" ||
			strings.ToLower(a.Labels.Selfhealing) != "true" {
			continue
		}
		// TODO: check that the alert was fired with alertname HanaFileSystemFull
		// read the crm_mon XML to detect whether HANA is primary on this node
		var cMon *CrmMon
		crmMonXML, err := exec.Command("/usr/sbin/crm_mon", "-X", "--inactive").Output()
		if err != nil {
			log.Warnf("error while executing crm_mon: %s", err)
			return
		}
		err = xml.Unmarshal(crmMonXML, &cMon)
		if err != nil {
			log.Warnf("error while parsing crm_mon XML: %s", err)
			return
		}
		// used to check whether the HANA primary resource runs on the local node
		nodeHostname, err := os.Hostname()
		if err != nil {
			log.Warnf("error: could not get hostname: %s", err)
			return
		}
		// check whether the node where the alert fired currently runs the
		// HANA DB as primary; if so, get the resource name so we can take
		// over the resource
		primaryRes := lookUpHanaNodePrimary(cMon, nodeHostname)
		if primaryRes != "" {
			cmd := exec.Command("/usr/sbin/crm", "resource", "move", primaryRes, "force")
			log.Infoln("[SELFHEALING]: selfhealing HANA primary node. Migrating to other node")
			err := cmd.Run()
			if err != nil {
				// if we don't have an alertManagerIP, don't try to send alerts
				// TODO: think about refactoring this into a kind of logger
				if ns.alertManagerIP != "" {
					a := new(AlertFire)
					a.Status = "firing"
					a.Labels.Alertname = "HanaDiskHandlerFailure"
					a.Labels.Component = "fail to selfhealing hana disk"
					a.Labels.Severity = "critical"
					a.Labels.Instance = nodeHostname
					a.Annotations.Summary = "alert-handler failed to self-heal"
					a.GeneratorURL = "unit-test"
					a.sendAlert("http://" + ns.alertManagerIP + ":9093/api/v1/alerts")
				}
				log.Warnln("[CRITICAL]: moving the HANA resource failed")
			}
			// this is not clean, but OK for a demo: poll crm_mon until the
			// local node is no longer the HANA primary, rereading the
			// cluster state on each iteration
			for i := 1; i <= 10; i++ {
				time.Sleep(5 * time.Second)
				out, err := exec.Command("/usr/sbin/crm_mon", "-X", "--inactive").Output()
				if err != nil {
					log.Warnf("error while executing crm_mon: %s", err)
					continue
				}
				var fresh CrmMon
				if err := xml.Unmarshal(out, &fresh); err != nil {
					log.Warnf("error while parsing crm_mon XML: %s", err)
					continue
				}
				if lookUpHanaNodePrimary(&fresh, nodeHostname) == "" {
					log.Infof("HANA node %s is not primary anymore...", nodeHostname)
					break
				}
			}
		}
	}
}
// lookUpHanaNodePrimary returns the clone ID of the msl_SAPHana_* resource
// if its Master instance is running on the given hostname, "" otherwise.
// TODO: this might not be safe, since node could be an array; to be verified
func lookUpHanaNodePrimary(cmon *CrmMon, hostname string) string {
	for _, n := range cmon.Resources.Clone {
		matched, _ := regexp.MatchString(`msl_SAPHana_.*`, n.ID)
		if matched {
			for _, r := range n.Resource {
				if r.Role == "Master" && r.Node.Name == hostname {
					return n.ID
				}
			}
		}
	}
	return ""
}
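
// A hedged test sketch for the lookup above (would live in hana_test.go;
// the XML snippet, node names, and resource IDs are hand-made examples):
//
//	func TestLookUpHanaNodePrimary(t *testing.T) {
//		in := []byte(`<crm_mon version="2.0.0"><resources>
//		  <clone id="msl_SAPHana_HDB_HDB00" multi_state="true">
//		    <resource id="rsc_SAPHana_HDB_HDB00" role="Master">
//		      <node name="node01" id="1" cached="false"/>
//		    </resource>
//		  </clone>
//		</resources></crm_mon>`)
//		var mon CrmMon
//		if err := xml.Unmarshal(in, &mon); err != nil {
//			t.Fatal(err)
//		}
//		if got := lookUpHanaNodePrimary(&mon, "node01"); got != "msl_SAPHana_HDB_HDB00" {
//			t.Fatalf("unexpected primary resource: %q", got)
//		}
//		if got := lookUpHanaNodePrimary(&mon, "node02"); got != "" {
//			t.Fatalf("expected no primary on node02, got %q", got)
//		}
//	}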