|
1 | 1 | #!/bin/bash |
2 | 2 |
|
| 3 | +ZONE=$(basename $(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/zone 2> /dev/null)) |
| 4 | + |
3 | 5 | while true; do |
4 | | - # kill all running hosts that slurm thinks are "down" |
5 | | - sinfo --format %N -t down -Nh -p all | grep -Fx -f - <(gcloud compute instances list --format="csv(name)") | xargs gcloud compute instances delete |
| 6 | + # kill all running hosts that slurm thinks are "down" |
| 7 | + # sinfo --format %N -t down -Nh -p all | grep -Fx -f - <(gcloud compute instances list --format="csv(name)") | xargs gcloud compute instances delete |
| 8 | + |
| 9 | + # |
| 10 | + # resume all down hosts |
| 11 | + scontrol update nodename=$(sinfo -h --format %N -t down -p all) state=resume |
6 | 12 |
|
7 | | - # resume all down hosts |
8 | | - scontrol update nodename=$(sinfo -h --format %N -t down -p all) state=resume |
| 13 | + # |
| 14 | + # increase disks' sizes |
| 15 | + # for each persistent disk attached to instance, |
| 16 | + for DISK_DEV in $(find /dev/disk/by-id -name "google*"); do |
| 17 | + GOOGLE_DISK_NAME=$(sed 's|/dev/disk/by-id/google-||' <<< "$DISK_DEV") |
| 18 | + # if this persistent disk is mounted to wolF NFS, check if it's low on space |
| 19 | + if df $DISK_DEV | grep -q /mnt/nfs; then |
| 20 | + DISK_SIZE_GB=$(df -B1G $DISK_DEV | awk 'NR == 2 { print int($3 + $4) }') |
| 21 | + FREE_SPACE_GB=$(df -B1G $DISK_DEV | awk 'NR == 2 { print int($4) }') |
| 22 | + if [[ $((100*FREE_SPACE_GB/DISK_SIZE_GB)) -lt 10 ]]; then |
| 23 | + gcloud_exp_backoff compute disks resize $GOOGLE_DISK_NAME --quiet --zone $ZONE --size $((DISK_SIZE_GB+50)) |
| 24 | + sudo resize2fs $DISK_DEV |
| 25 | + fi |
| 26 | + fi |
| 27 | + done |
9 | 28 |
|
10 | | - # increase disk size |
11 | | - # cannot have partitions |
12 | | - # TODO |
| 29 | + # release stuck jobs |
| 30 | + squeue -t PD -o '%i'$'\t''%R' | awk -F'\t' '$2 == "(launch failed requeued held)" { print $1 }' | xargs scontrol release |
13 | 31 |
|
14 | | - # check for resource constraints; pause jobs |
15 | | - # TODO |
| 32 | + # check for resource constraints; pause jobs |
| 33 | + # TODO |
16 | 34 |
|
17 | | - sleep 600 |
| 35 | + sleep 600 |
18 | 36 | done |
0 commit comments