Skip to content

Commit 8060525

Browse files
committed
Add controller healthcheck
1 parent dcde09b commit 8060525

File tree

2 files changed

+29
-10
lines changed

2 files changed

+29
-10
lines changed

src/controller_healthcheck.sh

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,36 @@
11
#!/bin/bash
22

3+
ZONE=$(basename $(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/zone 2> /dev/null))
4+
35
while true; do
4-
# kill all running hosts that slurm thinks are "down"
5-
sinfo --format %N -t down -Nh -p all | grep -Fx -f - <(gcloud compute instances list --format="csv(name)") | xargs gcloud compute instances delete
6+
# kill all running hosts that slurm thinks are "down"
7+
# sinfo --format %N -t down -Nh -p all | grep -Fx -f - <(gcloud compute instances list --format="csv(name)") | xargs gcloud compute instances delete
8+
9+
#
10+
# resume all down hosts
11+
scontrol update nodename=$(sinfo -h --format %N -t down -p all) state=resume
612

7-
# resume all down hosts
8-
scontrol update nodename=$(sinfo -h --format %N -t down -p all) state=resume
13+
#
14+
# increase disks' sizes
15+
# for each persistent disk attached to instance,
16+
for DISK_DEV in $(find /dev/disk/by-id -name "google*"); do
17+
GOOGLE_DISK_NAME=$(sed 's|/dev/disk/by-id/google-||' <<< "$DISK_DEV")
18+
# if this persistent disk is mounted to wolF NFS, check if it's low on space
19+
if df $DISK_DEV | grep -q /mnt/nfs; then
20+
DISK_SIZE_GB=$(df -B1G $DISK_DEV | awk 'NR == 2 { print int($3 + $4) }')
21+
FREE_SPACE_GB=$(df -B1G $DISK_DEV | awk 'NR == 2 { print int($4) }')
22+
if [[ $((100*FREE_SPACE_GB/DISK_SIZE_GB)) -lt 10 ]]; then
23+
gcloud_exp_backoff compute disks resize $GOOGLE_DISK_NAME --quiet --zone $ZONE --size $((DISK_SIZE_GB+50))
24+
sudo resize2fs $DISK_DEV
25+
fi
26+
fi
27+
done
928

10-
# increase disk size
11-
# cannot have partitions
12-
# TODO
29+
# release stuck jobs
30+
squeue -t PD -o '%i'$'\t''%R' | awk -F'\t' '$2 == "(launch failed requeued held)" { print $1 }' | xargs scontrol release
1331

14-
# check for resource constraints; pause jobs
15-
# TODO
32+
# check for resource constraints; pause jobs
33+
# TODO
1634

17-
sleep 600
35+
sleep 600
1836
done

src/docker_entrypoint_controller.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@
88

99
mysqld --user root &
1010
sudo -E -u $HOST_USER /sgcpd/src/provision_server.py
11+
/sgcpd/src/container_heartbeat.sh &
1112
sudo -E -u $HOST_USER /bin/bash

0 commit comments

Comments
 (0)