-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsumnerinfo
More file actions
executable file
·111 lines (91 loc) · 4.89 KB
/
sumnerinfo
File metadata and controls
executable file
·111 lines (91 loc) · 4.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/bin/bash
## a few commands to display current state of HPC running Slurm job scheduler
## @sbamin
#### required functions ####
## Absolute difference between two timestamps, in whole hours.
## https://unix.stackexchange.com/a/24636/28675
## https://askubuntu.com/a/1158876/52398
## Arguments: $1, $2 - date strings understood by `date -d`, in either order
## Outputs:   "<N> Hours" on stdout; N is the difference truncated to full hours
## Returns:   0 on success, 1 when an argument is empty or cannot be parsed
diff_dates(){
  local first_epoch second_epoch elapsed
  local -r seconds_per_hour=$((60*60))

  ## `date -d ""` silently resolves to today's midnight, so reject empty input here
  if [[ -z "${1:-}" || -z "${2:-}" ]]; then
    echo "diff_dates: two non-empty date strings are required" >&2
    return 1
  fi

  ## stop on a parse failure instead of doing arithmetic on an empty value
  first_epoch=$(date -d "$1" +%s) || return 1
  second_epoch=$(date -d "$2" +%s) || return 1

  ## argument order does not matter: report the magnitude only
  elapsed=$(( second_epoch - first_epoch ))
  if (( elapsed < 0 )); then
    elapsed=$(( -elapsed ))
  fi

  echo "$(( elapsed / seconds_per_hour )) Hours"
}
## Report banner, stamped with the time of this run (ddMonyy_HHMMSS_ZONE)
temptstamp=$(date +%d%b%y_%H%M%S_%Z)
printf '\n%s\n%s\n\n' \
  "####################### SUMNER INFO ########################" \
  "${temptstamp}"

## Cluster overview from sinfo: a custom-format listing under "CPUs", then the
## summary listing (sinfo -s) under "Nodes". Skipped when sinfo is not executable.
if ! [ -x "$(command -v sinfo)" ]; then
  echo "Command: 'sinfo' is not available on node: $(hostname)"
else
  printf '### CPUs ###\n\n'
  sinfo -o "%10P %10t %10D %20C"
  printf '\n### Nodes ###\n\n'
  sinfo -s
fi
## Rank users by how many of their jobs request a time limit of one day or more.
## stdin : "user,timelimit" lines, as produced by: squeue -ho "%u,%l"
## stdout: up to ten "count,user" lines, largest count first
## A limit of a day or more is recognised by its leading "<days>-" prefix. The
## earlier pattern "^[1,2]{1,2}-" only accepted day counts built from the
## characters 1 and 2, so limits such as 3-00:00:00 or 10-00:00:00 were not counted.
_long_walltime_users(){
  awk -F"," '$2 ~ /^[1-9][0-9]*-/ {print $1}' \
    | sort \
    | uniq -c \
    | sort -k1,1nr \
    | awk '{print $1,$2}' OFS="," \
    | head -n10
}

if [[ -x $(command -v squeue) ]]; then
  ## Read https://slurm.schedmd.com/squeue.html
  ## Top Users: count jobs per unique combination of fields 2, 5 and 4 of the
  ## long listing, keep the ten largest groups, then order them by field 3
  ## (descending) and count.
  ## NOTE(review): fields 2/5/4 are presumably partition, state and user in
  ## squeue's long format; a job name containing whitespace would shift these
  ## columns - confirm.
  TOPUSERS=$(squeue -a -l -r -h --noconvert --states=all | awk '{print $2,$5,$4}' | sort | uniq -c | sort -k1,1nr | head -n10 | sort -k3,3r -k1,1nr | awk '{$1=$1};1' OFS=",")
  ## Via Brian Geuther: sum the %c value of every job per user, top ten totals
  TOPCORES=$(squeue -ho "%u,%c" | awk -F ',' '{arr[$1]+=$2} END {for (i in arr) {print i,arr[i]}}' | sort -k2,2nr | awk '{print $2,$1}' OFS="," | head -n10)
  ## Users with the most jobs whose time limit (%l) is at least one day
  LONGRUNS=$(squeue -ho "%u,%l" | _long_walltime_users)
  ## Same grouping as TOPUSERS, restricted to the invoking user, tab separated
  MYJOBS=$(squeue -u "${USER}" -a -l -r -h --noconvert --states=all | awk '{print $2,$5,$4}' | sort | uniq -c | sort -k1,1nr | head -n10 | sort -k3,3r -k1,1nr | awk '{$1=$1};1' OFS="\t")
  ## print as columns: one titled column per ranking, aligned on tabs
  echo -e "\n############## TOP USERS ###############"
  paste <(echo "By Queue"; echo "$TOPUSERS") <(echo "By CPUs"; echo "$TOPCORES") <(echo "By walltime of min 24 HR"; echo "$LONGRUNS") | column -s $'\t' -t
  echo -e "\n############### MY JOBS ################"
  echo "$MYJOBS"
else
  echo "Command: 'squeue' is not available on node: $(hostname)"
fi
## Print the FAIR SHARE section: a header with a pointer to the Slurm fair-share
## slides, then the invoking user's own share as reported by sshare.
## The fallback message now names sshare, the command that is actually probed
## and run; it used to refer to 'mdiag -f | grep $USER'.
show_fair_share(){
  echo -e "\n############## FAIR SHARE ##############\nIf FairShare < 1, know how fair share works before panicking!\nhttps://slurm.schedmd.com/SLUG19/Priority_and_Fair_Trees.pdf\n"
  if [[ -x $(command -v sshare) ]]; then
    ## PS: Bad idea to run sshare in workflow as it may slow down scheduler
    ## https://slurm.schedmd.com/sshare.html
    sshare -u "$USER" -U
  else
    echo "Command: 'sshare -u $USER -U' is not available, likely on compute nodes"
  fi
}
show_fair_share
## Pick the earliest start time among reservations whose name begins with "root".
## stdin : rows from `sinfo -T -h`; field 1 is matched as the reservation name and
##         field 3 is printed as its start time
## stdout: one start time, or nothing when no row matches
## NOTE(review): the sort assumes start times are ISO-like (YYYY-MM-DDTHH:MM:SS),
## so that text order equals time order - confirm against your sinfo output.
_earliest_root_resv_start(){
  awk '$1 ~ /^root/ {print $3}' | LC_ALL=C sort | head -n1
}

## the sinfo probe fails if the command is not found on the node, mostly on compute nodes
if [[ -x $(command -v sinfo) ]]; then
  ## Query sinfo once and keep a single start time. Several matching reservations
  ## used to reach `date -d` as separate unquoted words, which made it fail.
  SYSRES_INFO="$(sinfo -T -h | _earliest_root_resv_start)"
  SYSRES_INFO="${SYSRES_INFO:-"NULL"}"
  if [[ "$SYSRES_INFO" != "NULL" ]]; then
    if SYSRES_START="$(date -d "$SYSRES_INFO" '+%Y-%m-%d %H:%M:%S')"; then
      TODAY="$(date '+%Y-%m-%d %H:%M:%S')"
      ## Approx. time left until reservation kicks in
      if [[ -x $(command -v datediff) ]]; then
        ## datediff is from https://github.com/hroptatyr/dateutils/releases
        MAXWT=$(datediff --from-zone="${TZ:-UTC}" -f "%d:%H:%M:%S" "${TODAY}" "${SYSRES_START}")
      else
        ## fallback defined in this script: whole hours between the two timestamps
        MAXWT=$(diff_dates "${TODAY}" "${SYSRES_START}")
      fi
      echo -e "\n##### System Reservation Kicks In ######\n"
      printf "WHEN: %s\tMAX WALLTIME: \033[33;5;7mApprox. %s\033[0m\n" "$SYSRES_START" "$MAXWT"
      echo -e "PS: Jobs will not run if walltime exceeds MAX WALLTIME\n"
    else
      ## do not print a walltime computed from an unparsable start time
      echo "Can't parse reservation start time: ${SYSRES_INFO}" >&2
    fi
    sinfo -T
  fi
else
  echo -e "\nCan't check system reservation date.\nsinfo command is not available, likely on compute nodes."
fi
### Disk space ##
## ToDo: Prefer pasting from ssh banner message/motd file.
## Locations to report on; replace TIER1PATH with your common lab space
TIER1PATH="/projects/fake_space"
TIER1USER="/projects/${USER}"
FASTPATH="/fastscratch"

## For each location, take the last `df -h` row containing the filter word and
## extract the number that sits directly in front of a percent sign.
TIER1SPACE=$(/bin/df -h "${TIER1PATH}" | grep projects | tail -n1 | grep -Po "([0-9]{1,3})(?=\%)")
## the shared tier1 figure falls back to 50 when nothing could be extracted
: "${TIER1SPACE:=50}"
TIER1USER_SPACE=$(/bin/df -h "${TIER1USER}" | grep projects | tail -n1 | grep -Po "([0-9]{1,3})(?=\%)")
FASTSPACE=$(/bin/df -h "${FASTPATH}" | grep ctgs0 | tail -n1 | grep -Po "([0-9]{1,3})(?=\%)")

## Choose the section title: the highlighted warning once any figure crosses its
## threshold, the plain quota title otherwise.
## NOTE(review): TIER2QUOTA is not assigned anywhere in this script; unless it is
## supplied by the environment, its comparison never triggers - confirm intent.
if [[ "${TIER1USER_SPACE}" -gt 98 ]] || [[ "${TIER1SPACE}" -gt 98 ]] || [[ "${FASTSPACE}" -gt 99 ]] || [[ "${TIER2QUOTA}" -gt 99 ]]; then
  printf "\n######### \033[33;5;7m❗❗ WARN: DISK SPACE USED❗❗ \033[0m ##########\n"
else
  printf "\n############## DISK QUOTA ##############\n"
fi
## the usage lines are the same under either title
printf "\ntier1 /projects/%s: %s %%\ntier1 at %s: %s %%\n\n/fastscratch: %s %%\n" "${USER}" "${TIER1USER_SPACE}" "${TIER1PATH}" "${TIER1SPACE}" "${FASTSPACE}"

## closing banner of the report
printf '\n%s\n\n' "########################### END ############################"