Skip to content

Commit 8912739

Browse files
Merge pull request #174 from MalloZup/sbd-timeouts
Implement SBD watchdog and msgwait metrics
2 parents 7a7dbdc + 428cb9f commit 8912739

File tree

5 files changed

+91
-3
lines changed

5 files changed

+91
-3
lines changed

collector/sbd/sbd.go

+48
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"os"
77
"os/exec"
88
"regexp"
9+
"strconv"
910
"strings"
1011

1112
"github.com/pkg/errors"
@@ -20,6 +21,7 @@ const subsystem = "sbd"
2021
const SBD_STATUS_UNHEALTHY = "unhealthy"
2122
const SBD_STATUS_HEALTHY = "healthy"
2223

24+
// NewCollector creates a new SBD collector.
2325
func NewCollector(sbdPath string, sbdConfigPath string) (*sbdCollector, error) {
2426
err := checkArguments(sbdPath, sbdConfigPath)
2527
if err != nil {
@@ -33,6 +35,7 @@ func NewCollector(sbdPath string, sbdConfigPath string) (*sbdCollector, error) {
3335
}
3436

3537
c.SetDescriptor("devices", "SBD devices; one line per device", []string{"device", "status"})
38+
c.SetDescriptor("timeouts", "SBD timeouts for each device and type", []string{"device", "type"})
3639

3740
return c, nil
3841
}
@@ -68,6 +71,15 @@ func (c *sbdCollector) CollectWithError(ch chan<- prometheus.Metric) error {
6871
ch <- c.MakeGaugeMetric("devices", 1, sbdDev, sbdStatus)
6972
}
7073

74+
sbdWatchdogs, sbdMsgWaits := c.getSbdTimeouts(sbdDevices)
75+
for sbdDev, sbdWatchdog := range sbdWatchdogs {
76+
ch <- c.MakeGaugeMetric("timeouts", sbdWatchdog, sbdDev, "watchdog")
77+
}
78+
79+
for sbdDev, sbdMsgWait := range sbdMsgWaits {
80+
ch <- c.MakeGaugeMetric("timeouts", sbdMsgWait, sbdDev, "msgwait")
81+
}
82+
7183
return nil
7284
}
7385

@@ -132,3 +144,39 @@ func (c *sbdCollector) getSbdDeviceStatuses(sbdDevices []string) map[string]stri
132144

133145
return sbdStatuses
134146
}
147+
148+
// for each sbd device, extract the watchdog and msgwait timeout via regex
149+
func (c *sbdCollector) getSbdTimeouts(sbdDevices []string) (map[string]float64, map[string]float64) {
150+
sbdWatchdogs := make(map[string]float64)
151+
sbdMsgWaits := make(map[string]float64)
152+
for _, sbdDev := range sbdDevices {
153+
sbdDump, _ := exec.Command(c.sbdPath, "-d", sbdDev, "dump").Output()
154+
155+
regexW := regexp.MustCompile(`Timeout \(msgwait\) *: \d+`)
156+
regex := regexp.MustCompile(`Timeout \(watchdog\) *: \d+`)
157+
158+
msgWaitLine := regexW.FindStringSubmatch(string(sbdDump))
159+
watchdogLine := regex.FindStringSubmatch(string(sbdDump))
160+
161+
if watchdogLine == nil || msgWaitLine == nil {
162+
continue
163+
}
164+
165+
// get the timeout from the line
166+
regexNumber := regexp.MustCompile(`\d+`)
167+
watchdogTimeout := regexNumber.FindString(string(watchdogLine[0]))
168+
msgWaitTimeout := regexNumber.FindString(string(msgWaitLine[0]))
169+
170+
// map the timeout to the device
171+
if s, err := strconv.ParseFloat(watchdogTimeout, 64); err == nil {
172+
sbdWatchdogs[sbdDev] = s
173+
}
174+
175+
// map the timeout to the device
176+
if s, err := strconv.ParseFloat(msgWaitTimeout, 64); err == nil {
177+
sbdMsgWaits[sbdDev] = s
178+
}
179+
180+
}
181+
return sbdWatchdogs, sbdMsgWaits
182+
}

collector/sbd/sbd_test.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,13 @@ func TestNewSbdCollectorChecksSbdExecutableBits(t *testing.T) {
214214
}
215215

216216
func TestSBDCollector(t *testing.T) {
217-
collector, _ := NewCollector("../../test/fake_sbd.sh", "../../test/fake_sbdconfig")
217+
collector, _ := NewCollector("../../test/fake_sbd_dump.sh", "../../test/fake_sbdconfig")
218+
assertcustom.Metrics(t, collector, "sbd.metrics")
219+
}
220+
221+
func TestWatchdog(t *testing.T) {
222+
collector, err := NewCollector("../../test/fake_sbd_dump.sh", "../../test/fake_sbdconfig")
223+
224+
assert.Nil(t, err)
218225
assertcustom.Metrics(t, collector, "sbd.metrics")
219226
}

doc/metrics.md

+14-1
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,8 @@ The status of each Corosync ring; `1` means healthy, `0` means faulty.
198198
The SBD subsystem collects device stats by parsing its configuration and the output of `sbd --dump`.
199199

200200
0. [Sample](../test/sbd.metrics)
201-
2. [`ha_cluster_sbd_devices`](#ha_cluster_sbd_devices)
201+
1. [`ha_cluster_sbd_devices`](#ha_cluster_sbd_devices)
202+
2. [`ha_cluster_sbd_timeouts`](#ha_cluster_sbd_timeouts)
202203

203204
### `ha_cluster_sbd_devices`
204205

@@ -214,6 +215,18 @@ Either the value is `1`, or the line is absent altogether.
214215

215216
The total number of lines for this metric will be the cardinality of `device`.
216217

218+
### `ha_cluster_sbd_timeouts`
219+
220+
#### Description
221+
222+
The SBD timeouts for each SBD device.
223+
The value is an integer expressing the timeout.
224+
225+
#### Labels
226+
227+
- `device`: the path of the SBD device
228+
- `type`: either `watchdog` or `msgwait`
229+
217230

218231
## DRBD
219232

test/fake_sbd_dump.sh

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/usr/bin/env bash
2+
3+
cat <<EOF
4+
==Dumping header on disk /dev/vdc
5+
Header version : 2.1
6+
UUID : 1ed3171d-066d-47ca-8f76-aec25d9efed4
7+
Number of slots : 255
8+
Sector size : 512
9+
Timeout (watchdog) : 9
10+
Timeout (allocate) : 2
11+
Timeout (loop) : 1
12+
Timeout (msgwait) : 10
13+
==Header on disk /dev/vdc is dumped
14+
EOF

test/sbd.metrics

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
# HELP ha_cluster_sbd_devices SBD devices; one line per device
22
# TYPE ha_cluster_sbd_devices gauge
33
ha_cluster_sbd_devices{device="/dev/vdc",status="healthy"} 1
4-
ha_cluster_sbd_devices{device="/dev/vdd",status="unhealthy"} 1
4+
ha_cluster_sbd_devices{device="/dev/vdd",status="healthy"} 1
5+
# HELP ha_cluster_sbd_timeouts SBD timeouts for each device and type
6+
# TYPE ha_cluster_sbd_timeouts gauge
7+
ha_cluster_sbd_timeouts{device="/dev/vdc",type="msgwait"} 10
8+
ha_cluster_sbd_timeouts{device="/dev/vdc",type="watchdog"} 9
9+
ha_cluster_sbd_timeouts{device="/dev/vdd",type="msgwait"} 10
10+
ha_cluster_sbd_timeouts{device="/dev/vdd",type="watchdog"} 9

0 commit comments

Comments
 (0)