Skip to content

Commit 37c0e46

Browse files
brablc (Ondrej Brablc)
authored and committed
Introduce disk alerter
Fix access to rootfs. Rename constant. Fix shellcheck findings. Workaround for strange swarm API behavior. Add more logging. Found mystery culprit: two instances running and overwriting temp files.
1 parent 064c5c9 commit 37c0e46

File tree

8 files changed

+124
-47
lines changed

8 files changed

+124
-47
lines changed

config.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ SCRIPT_PATH=$(readlink -f "$0")
55
SCRIPT_DIR=${SCRIPT_PATH%/*}
66
cd "$SCRIPT_DIR"
77

8+
export DISK_USAGE_MAX=${DISK_USAGE_MAX:-85}
89
export LABEL_PORT="swarm-health-alerter.port"
910
export LABEL_SOCK="swarm-health-alerter.sock"
1011
export LOGGER_USE_SYSLOG=0

disk-alerter.sh

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/usr/bin/env bash
2+
3+
source "./config.sh"
4+
source "./logger.sh"
5+
source "./checks.sh"
6+
7+
DATA_PREFIX="$DATA_DIR/disk-alerter"
8+
9+
# Check the usage of every filesystem whose `df -PT` row mentions "rootfs" and
# raise or resolve a disk-usage alert whenever the state changes.
#
# Globals read:
#   SWARM_NAME      - name used in messages and in the alert identity
#   DATA_PREFIX     - path prefix for the per-mount state files
#   DISK_USAGE_MAX  - usage percentage at which an alert is raised (default 85,
#                     the same default config.sh exports)
#   HOSTNAME        - included in the alert message
#   ALERT_SCRIPT    - command run via `/bin/bash -c`; receives the alert JSON
#                     {action, unique_id, message, summary} on stdin
# Files:
#   ${DATA_PREFIX}-<code>.pending - exists while an alert is open and stores
#                                   the unique_id that must be used to resolve it
# Outputs:
#   Logs a warning (log_warn) for alerts that are still pending.
function check_disks() {
    local swarm_name=$SWARM_NAME
    local limit=${DISK_USAGE_MAX:-85}
    local mount usage disk_name
    local unique_name unique_code random_str unique_id
    local pending_file base action message

    # The threshold is deliberately NOT applied in awk: rows below the limit
    # must reach the loop too, otherwise an open alert could never be resolved.
    while read -r mount usage; do
        unique_name="${swarm_name} ${mount}"
        unique_code=$(echo "${unique_name,,}" | sed -e 's/ /_/g' -e 's/[^a-zA-Z0-9_-]/_/g')
        random_str=$(tr -dc 'a-zA-Z0-9' </dev/urandom | head -c 10)
        read -r unique_id _ < <(echo -n "$unique_name $random_str" | md5sum)
        pending_file="${DATA_PREFIX}-${unique_code}.pending"

        # Drop the 7-character "/rootfs" prefix so the message shows the path as
        # seen on the host; the mount "/rootfs" itself is the host's "/".
        disk_name=${mount:7:24}
        disk_name=${disk_name:-/}
        base="${swarm_name} disk ${disk_name} at $HOSTNAME usage $usage%"

        action=""
        message=""
        if ((usage >= limit)); then
            message="$base >= $limit"
            if [[ -f $pending_file ]]; then
                log_warn "Pending alert: $message"
            else
                echo "$unique_id" >"$pending_file"
                action="create"
                message+=" is out of space"
            fi
        elif [[ -f $pending_file ]]; then
            action="resolve"
            message="$base < $limit has space"
            # Resolve with the id stored when the alert was created.
            unique_id=$(cat "$pending_file")
            rm -f "$pending_file"
        fi

        if [[ -n $action ]]; then
            jq -n \
                --arg action "$action" \
                --arg unique_id "$unique_id" \
                --arg message "$message" \
                --arg summary "$(df -h "$mount")" \
                '{
                    "action": $action,
                    "unique_id": $unique_id,
                    "message": $message,
                    "summary": $summary
                }' | /bin/bash -c "$ALERT_SCRIPT"
        fi
    done < <(df -h -PT | awk 'NR>1 && /rootfs/ {print $7, int($6)}')
}
57+
58+
log_info "Disk alerter is entering loop with ${LOOP_SLEEP} sleep on entry ..."

# Poll forever. The sleep comes before the check, so the first check runs one
# LOOP_SLEEP interval after start-up.
while :; do
    sleep "$LOOP_SLEEP"
    check_disks
done

docker-cmd.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env bash
2+
# shellcheck disable=SC2064,SC2181
23

34
source "./config.sh"
45
source "./logger.sh"
@@ -11,19 +12,23 @@ if [ $? != 0 ]; then
1112
services=$(./services.sh 2>&1)
1213
fi
1314

14-
if [[ "$services" != "" ]]; then
15+
if [[ $services != "" ]]; then
1516
log_info "Starting port/sock alerter (initial list of services) ..."
1617
echo "$services"
1718

1819
./port-alerter.sh &
1920
trap "kill $!" EXIT
2021

2122
log_info "Starting swarm alerter (initial state of nodes) ..."
22-
./nodes.sh --verbose
23+
./nodes.sh
2324

2425
./swarm-alerter.sh &
2526
trap "kill $!" EXIT
2627
fi
2728

29+
log_info "Starting disk alerter..."
./disk-alerter.sh &
# Each `trap ... EXIT` replaces the previously installed EXIT trap, so killing
# only "$!" here would leave the alerters started above running after exit.
# Kill every background job of this shell instead.
# shellcheck disable=SC2046 # word splitting of the PID list is intended
trap 'kill $(jobs -p) 2>/dev/null' EXIT
32+
2833
log_info "Starting event alerter ..."
2934
./event-alerter.py

integrations/zenduty.sh

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
#!/usr/bin/env bash
22

3-
SCRIPT_PATH=$(readlink -f $0)
3+
SCRIPT_PATH=$(readlink -f "$0")
44
SCRIPT_DIR=${SCRIPT_PATH%/*}
55
source "$SCRIPT_DIR/../logger.sh"
66

77
DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/../data}
88

9-
input_file=$(mktemp $DATA_DIR/zenduty-input.XXXXXX)
10-
trap "rm -f $input_file" EXIT
9+
input_file=$(mktemp "$DATA_DIR/zenduty-input.XXXXXX")
10+
trap 'rm -f "$input_file"' EXIT
1111

12-
if ! timeout 2s cat >$input_file; then
12+
if ! timeout 2s cat >"$input_file"; then
1313
log_error "Reading from stdin timed out."
1414
exit 1
1515
fi
@@ -19,9 +19,9 @@ if [[ -z $ZENDUTY_API_KEY ]]; then
1919
exit 1
2020
fi
2121

22-
action=$(jq -r .action $input_file)
23-
message=$(jq -r .message $input_file)
24-
entity_id=$(jq -r .unique_id $input_file)
22+
action=$(jq -r .action "$input_file")
23+
message=$(jq -r .message "$input_file")
24+
entity_id=$(jq -r .unique_id "$input_file")
2525

2626
alert_type=""
2727
case $action in
@@ -38,33 +38,33 @@ case $action in
3838
;;
3939
esac
4040

41-
request_file=$DATA_DIR/${entity_id}-zenduty-request.json
42-
response_file=$DATA_DIR/${entity_id}-zenduty-response.json
41+
request_file=$DATA_DIR/zenduty-request-${entity_id}.json
42+
# Named like the request file ("zenduty-request-<id>.json"), without the stray
# leading dash left over from the old "<id>-zenduty-response.json" scheme.
response_file=$DATA_DIR/zenduty-response-${entity_id}.json
4343

4444
jq -r \
4545
--arg alert_type "$alert_type" \
4646
'{
47-
"alert_type": $alert_type,
48-
"entity_id": .unique_id,
49-
"message": .message,
50-
"summary": .summary
51-
}' $input_file >$request_file
47+
"alert_type": $alert_type,
48+
"entity_id": .unique_id,
49+
"message": .message,
50+
"summary": .summary
51+
}' "$input_file" >"$request_file"
5252

5353
test -n "$VERBOSE" && log_info "Request file:"
54-
test -n "$VERBOSE" && (jq . $request_file 2>/dev/null || cat $request_file)
54+
test -n "$VERBOSE" && (jq . "$request_file" 2>/dev/null || cat "$request_file")
5555

5656
url="https://www.zenduty.com/api/events/${ZENDUTY_API_KEY}/"
57-
curl -s -X POST "$url" -H 'Content-Type: application/json' -d @$request_file >$response_file
57+
curl -s -X POST "$url" -H 'Content-Type: application/json' -d @"$request_file" >"$response_file"
5858
return_code=$?
5959

6060
if [ $return_code -ne 0 ]; then
6161
log_error "Curl failed with code $return_code"
6262
fi
6363

6464
test -n "$VERBOSE" && log_info "Response file:"
65-
test -n "$VERBOSE" && (jq . $response_file 2>/dev/null || cat $response_file)
65+
test -n "$VERBOSE" && (jq . "$response_file" 2>/dev/null || cat "$response_file")
6666

6767
if [[ $action == "resolve" ]]; then
68-
rm -f $request_file
69-
rm -f $response_file
68+
rm -f "$request_file"
69+
rm -f "$response_file"
7070
fi

nodes.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
#!/usr/bin/env bash
22

3+
# shellcheck disable=SC2002
4+
35
source "./config.sh"
46
source "./logger.sh"
57
source "./checks.sh"
68

79
if ! ./docker-api.sh /nodes >/tmp/nodes; then
10+
log_error "Call to docker-api.sh failed."
11+
cat /tmp/nodes
812
log_error "$(jq -r .message /tmp/nodes 2>/dev/null || cat /tmp/nodes)"
913
exit 1
1014
fi

port-alerter.sh

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,30 +6,32 @@ source "./checks.sh"
66

77
declare -A REPORTED_SOCKS
88

9+
DATA_PREFIX="$DATA_DIR/port-alerter"
10+
911
function check_services() {
1012
local swarm_name=$SWARM_NAME
11-
while read service_name network_alias check_type check_value; do
12-
unique_name=$(echo "${swarm_name} ${service_name} ${network_alias} ${check_type} ${check_value}")
13+
while read -r service_name network_alias check_type check_value; do
14+
unique_name="${swarm_name} ${service_name} ${network_alias} ${check_type} ${check_value}"
1315
unique_code=$(echo "${unique_name,,}" | sed -e 's/ /_/g' -e 's/[^a-zA-Z0-9_-]/_/g')
1416
random_str=$(tr -dc 'a-zA-Z0-9' </dev/urandom | head -c 10)
15-
read unique_id _ < <(echo -n "$unique_name $random_str" | md5sum)
16-
prefix="$DATA_DIR/${unique_code}"
17+
read -r unique_id _ < <(echo -n "$unique_name $random_str" | md5sum)
18+
prefix="${DATA_PREFIX}-${unique_code}"
1719
pending_file="${prefix}.pending"
1820
log_file="${prefix}.log"
1921

2022
if [[ $check_type == "port" ]]; then
2123
port=$check_value
2224
real_port="$port"
2325
# used for testing
24-
if [[ -f "$DATA_DIR/test-change-port-$port" ]]; then
25-
real_port=$(<"$DATA_DIR/test-change-port-$port")
26+
if [[ -f "${DATA_PREFIX}-test-change-port-$port" ]]; then
27+
real_port=$(<"${DATA_PREFIX}-test-change-port-$port")
2628
fi
2729
WAIT="tcp://$network_alias:$real_port"
2830
WHERE="via mesh"
2931
fi
3032

3133
if [[ $check_type == "sock" ]]; then
32-
IFS=":" read sock_type sock_file <<<"$check_value"
34+
IFS=":" read -r _sock_type sock_file <<<"$check_value"
3335
if [[ ! -S $sock_file ]]; then
3436
if [[ ! -v REPORTED_SOCKS[$sock_file] ]]; then
3537
log_warn "Sock file $sock_file does not exist locally!"
@@ -44,43 +46,42 @@ function check_services() {
4446
action=""
4547
appendix=""
4648
message="${swarm_name} service ${service_name} (${network_alias}:${check_value})"
47-
/usr/local/bin/dockerize -timeout 5s -wait "$WAIT" true 2>$log_file
48-
if [ $? -ne 0 ]; then
49+
if ! /usr/local/bin/dockerize -timeout 5s -wait "$WAIT" true 2>"$log_file"; then
4950
if [[ -f $pending_file ]]; then
5051
log_warn "Pending alert: $message"
5152
else
52-
echo "$unique_id" >$pending_file
53+
echo "$unique_id" >"$pending_file"
5354
action="create"
5455
appendix="not available $WHERE"
5556
fi
5657
else
5758
if [[ -f $pending_file ]]; then
5859
action="resolve"
5960
appendix="is available $WHERE"
60-
unique_id=$(cat $pending_file)
61-
rm -f $pending_file
61+
unique_id=$(cat "$pending_file")
62+
rm -f "$pending_file"
6263
fi
6364
fi
6465
if [[ -n $action ]]; then
6566
jq -n \
6667
--arg action "$action" \
6768
--arg unique_id "$unique_id" \
6869
--arg message "$message $appendix" \
69-
--arg summary "$(cat $log_file)" \
70+
--arg summary "$(cat "$log_file")" \
7071
'{
71-
"action": $action,
72-
"unique_id": $unique_id,
73-
"message": $message,
74-
"summary": $summary
75-
}' | /bin/bash -c "$ALERT_SCRIPT"
72+
"action": $action,
73+
"unique_id": $unique_id,
74+
"message": $message,
75+
"summary": $summary
76+
}' | /bin/bash -c "$ALERT_SCRIPT"
7677
fi
77-
rm -f $log_file
78+
rm -f "$log_file"
7879
done < <(./services.sh)
7980
}
8081

81-
log_info "Entering loop with ${LOOP_SLEEP} sleep on entry ..."
82+
log_info "Port alerter is entering loop with ${LOOP_SLEEP} sleep on entry ..."
8283

8384
while true; do
84-
sleep $LOOP_SLEEP
85+
sleep "$LOOP_SLEEP"
8586
check_services
8687
done

swarm-alerter.sh

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,24 @@ source "./config.sh"
44
source "./logger.sh"
55
source "./checks.sh"
66

7+
DATA_PREFIX="$DATA_DIR/swarm-alerter"
8+
79
function check_nodes() {
810
local active_node_count
9-
active_node_count=$(./nodes.sh | wc -l)
1011
local swarm_name=$SWARM_NAME
11-
local prefix="$DATA_DIR/swarm_failure"
12+
local prefix="${DATA_PREFIX}"
1213
local pending_file="${prefix}.pending"
1314
local log_file="${prefix}.log"
1415
local where="at $HOSTNAME"
1516

16-
./nodes.sh --verbose >"$log_file"
17+
active_node_count=$(./nodes.sh | wc -l)
1718

1819
action=""
1920
appendix=""
2021
message="${swarm_name} swarm active managers count $active_node_count"
2122
if ((SWARM_MANAGER_MIN > active_node_count)); then
23+
./nodes.sh --verbose &>"$log_file"
24+
2225
if [[ -f $pending_file ]]; then
2326
log_warn "Pending alert: $message"
2427
else
@@ -48,10 +51,9 @@ function check_nodes() {
4851
"summary": $summary
4952
}' | /bin/bash -c "$ALERT_SCRIPT"
5053
fi
51-
rm -f "$log_file"
5254
}
5355

54-
log_info "Entering loop with ${LOOP_SLEEP} sleep on entry ..."
56+
log_info "Swarm alerter is entering loop with ${LOOP_SLEEP} sleep on entry ..."
5557

5658
while true; do
5759
sleep "$LOOP_SLEEP"

test.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ PARAMS=(
1414
--env LOOP_SLEEP="${LOOP_SLEEP:-10}"
1515
--env SWARM_NAME="${SWARM_NAME:-Swarm}"
1616
--env ZENDUTY_API_KEY="${ZENDUTY_API_KEY:-N/A}"
17+
--volume /:/rootfs:ro
1718
--volume /var/run/docker.sock:/var/run/docker.sock
1819
--volume .:/app/
1920
)

0 commit comments

Comments
 (0)