Skip to content

Add slabinfo checks #70

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ nobase_dist_conf_DATA = scripts/lbnl_cmd.nhc scripts/common.nhc \
scripts/lbnl_fs.nhc scripts/lbnl_hw.nhc \
scripts/lbnl_job.nhc scripts/lbnl_moab.nhc \
scripts/lbnl_net.nhc scripts/lbnl_nv.nhc \
scripts/lbnl_ps.nhc
scripts/lbnl_ps.nhc \
scripts/osc_slabinfo.nhc

MAINTAINERCLEANFILES = Makefile.in aclocal.m4 configure install-sh missing
DISTCLEANFILES =
Expand Down
160 changes: 160 additions & 0 deletions scripts/osc_slabinfo.nhc
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# OSC NHC - slabinfo checks
#
# Trey Dockendorf <[email protected]>
# 21 September 2018
#

# Cached slabinfo data.  The arrays are parallel: index N of every array
# describes the same slab cache.  They start out empty and are filled in by
# nhc_slabinfo_gather_data().
SLAB_NAME=() SLAB_ACTIVE_OBJS=() SLAB_NUM_OBJS=() SLAB_ACTIVE_SLABS=() SLAB_NUM_SLABS=()
export SLAB_NAME SLAB_ACTIVE_OBJS SLAB_NUM_OBJS \
    SLAB_ACTIVE_SLABS SLAB_NUM_SLABS

# Parse a slabinfo file into the SLAB_* cache arrays, one element per slab
# cache line.
#
# Arguments:
#   $1 - (optional) file to parse; defaults to /proc/slabinfo.
# Globals written:
#   SLAB_NAME, SLAB_ACTIVE_OBJS, SLAB_NUM_OBJS, SLAB_ACTIVE_SLABS,
#   SLAB_NUM_SLABS (always reset first, then filled in file order).
# Returns:
#   0 when the file was parsed, 1 when it could not be read (the arrays are
#   left empty in that case).
function nhc_slabinfo_gather_data() {
    local SLABINFO_FILE="${1:-/proc/slabinfo}"
    local LINE_CNT=0
    local -a FIELD

    SLAB_NAME=() SLAB_ACTIVE_OBJS=() SLAB_NUM_OBJS=() SLAB_ACTIVE_SLABS=() SLAB_NUM_SLABS=()

    # Bail out cleanly instead of letting the redirection below print a raw
    # shell error when the file is missing or not readable by this user.
    if [[ ! -r "$SLABINFO_FILE" ]]; then
        dbg "Unable to read slabinfo from $SLABINFO_FILE"
        return 1
    fi

    # -r keeps backslashes literal; the "|| [[ ... ]]" clause picks up a final
    # line that lacks a trailing newline.
    while read -r -a FIELD || [[ ${#FIELD[@]} -gt 0 ]]; do
        # Skip blank lines and the "# name <active_objs> ..." column header.
        if [[ -z "${FIELD[0]}" || "${FIELD[0]}" == "#"* ]]; then
            continue
        fi
        # Skip the "slabinfo - version: X.Y" banner line.
        if [[ "${FIELD[0]}" == "slabinfo" ]]; then
            continue
        fi
        # Whitespace-split columns: 0=name 1=active_objs 2=num_objs,
        # 13=active_slabs 14=num_slabs (the ":" separators and the
        # "tunables"/"slabdata" keywords count as fields).
        SLAB_NAME[$LINE_CNT]="${FIELD[0]}"
        SLAB_ACTIVE_OBJS[$LINE_CNT]="${FIELD[1]}"
        SLAB_NUM_OBJS[$LINE_CNT]="${FIELD[2]}"
        SLAB_ACTIVE_SLABS[$LINE_CNT]="${FIELD[13]}"
        SLAB_NUM_SLABS[$LINE_CNT]="${FIELD[14]}"
        dbg "Got slabinfo ${SLAB_NAME[$LINE_CNT]} ${SLAB_ACTIVE_OBJS[$LINE_CNT]} ${SLAB_NUM_OBJS[$LINE_CNT]} ${SLAB_ACTIVE_SLABS[$LINE_CNT]} ${SLAB_NUM_SLABS[$LINE_CNT]}"
        LINE_CNT=$((LINE_CNT + 1))
    done < "$SLABINFO_FILE"

    export SLAB_NAME SLAB_ACTIVE_OBJS SLAB_NUM_OBJS SLAB_ACTIVE_SLABS SLAB_NUM_SLABS
    return 0
}

# Do the actual work of comparing slabinfo counters against a threshold.
# Used by check_slabinfo_{active_objs,num_objs,active_slabs,num_slabs}.
#
# Arguments:
#   $1  - name of the calling check; used as the prefix of every message
#   $2  - counter to examine:
#           1=active_objs  2=num_objs  3=active_slabs  4=num_slabs
#   ... - [-0] [-a] [-l] [-s] [-m <match>] [-e <action>] <threshold>
# Options:
#   -0          non-fatal: log the problem and return 0 instead of die()ing
#   -a          all: keep scanning after the first offender
#   -l          pass each offender's message to log()
#   -s          pass each offender's message to syslog()
#   -m <match>  match string handed to mcheck() with each slab name
#               (default "*")
#   -e <action> shell command started in the background for each offender
#
# A slab cache is flagged when its counter is greater than or equal to
# <threshold>.
#
# Returns: 0 when nothing is flagged (or -0 was given); 1 on a syntax error
# or on the first flagged cache; with -a, the number of flagged caches.
function nhc_slabinfo_check() {
    local NONFATAL=0 ALL=0 LOG=0 SYSLOG=0 MATCH="" ACTION=""
    local CHECK="$1" COL="$2" THRESHOLD
    local THIS_NAME MSG OPTION LABEL DATA_ARRAY REF VALUE i

    # Populate the cache on first use; later checks reuse it.
    if [[ ${#SLAB_NAME[@]} -eq 0 ]]; then
        nhc_slabinfo_gather_data
    fi

    shift 2
    OPTIND=1
    while getopts ":0alsm:e:" OPTION ; do
        case "$OPTION" in
            0) NONFATAL=1 ;;
            a) ALL=1 ;;
            l) LOG=1 ;;
            s) SYSLOG=1 ;;
            m) MATCH="$OPTARG" ;;
            e) ACTION="$OPTARG" ;;
            :) die 1 "$CHECK: Option -$OPTARG requires an argument." ; return 1 ;;
            \?) die 1 "$CHECK: Invalid option: -$OPTARG" ; return 1 ;;
        esac
    done
    shift $((OPTIND-1))

    THRESHOLD="$1"
    if [[ -z "$THRESHOLD" || "${THRESHOLD//[^0-9]}" != "$THRESHOLD" ]]; then
        die 1 "$CHECK: Syntax error: threshold must be an integer."
        # Stop here like every other die() call site; otherwise the loop
        # below would run with a bogus threshold.
        return 1
    fi
    # Force base 10 so that a threshold such as "08" is not parsed as an
    # (invalid) octal number by the arithmetic comparison below.
    THRESHOLD=$((10#$THRESHOLD))

    # Map the column id to the array holding it and the label used in messages.
    case "$COL" in
        1) DATA_ARRAY="SLAB_ACTIVE_OBJS"  ; LABEL="active_objs" ;;
        2) DATA_ARRAY="SLAB_NUM_OBJS"     ; LABEL="num_objs" ;;
        3) DATA_ARRAY="SLAB_ACTIVE_SLABS" ; LABEL="active_slabs" ;;
        4) DATA_ARRAY="SLAB_NUM_SLABS"    ; LABEL="num_slabs" ;;
        *) die 1 "$CHECK: Internal error: unknown slabinfo column \"$COL\"." ; return 1 ;;
    esac

    if [[ -z "$MATCH" ]]; then
        MATCH="*"
    fi
    dbg "Looking for slabinfo matching \"$MATCH\""
    for ((i=0; i < ${#SLAB_NAME[@]}; i++)); do
        THIS_NAME="${SLAB_NAME[$i]}"
        dbg "CHECKING \"$THIS_NAME\" vs. \"$MATCH\""
        if ! mcheck "$THIS_NAME" "$MATCH"; then
            continue
        fi
        dbg "Matching slabinfo found: $THIS_NAME: active_objs=${SLAB_ACTIVE_OBJS[$i]} num_objs=${SLAB_NUM_OBJS[$i]} active_slabs=${SLAB_ACTIVE_SLABS[$i]} num_slabs=${SLAB_NUM_SLABS[$i]}"

        # Indirect expansion: read element $i of the array selected above.
        REF="${DATA_ARRAY}[$i]"
        VALUE="${!REF}"
        if [[ "${VALUE:-0}" -lt "$THRESHOLD" ]]; then
            continue
        fi
        MSG="$CHECK: slabinfo \"$THIS_NAME\" using $VALUE $LABEL (limit $THRESHOLD)"

        # We have a winner.  Or loser, as the case may be.
        if [[ "$LOG" == "1" ]]; then
            log "$MSG"
        fi
        if [[ "$SYSLOG" == "1" ]]; then
            syslog "$MSG"
        fi
        if [[ "$ACTION" != "" ]]; then
            ${SHELL:-/bin/bash} -c "$ACTION" &
        fi
        if [[ $ALL -ge 1 ]]; then
            if [[ -n "$MSG" ]]; then
                log "$MSG ($ALL)"
            fi
            ((ALL++))
            continue
        elif [[ $NONFATAL == 1 ]]; then
            if [[ -n "$MSG" ]]; then
                log "$MSG (non-fatal)"
            fi
            return 0
        fi
        die 1 "$MSG"
        return 1
    done

    # -a (all) does not necessarily imply -0 (non-fatal).  A value of 1 for $ALL
    # means -a was passed in but no errors were found.  2 or above is an error.
    if [[ $ALL -gt 1 ]]; then
        # We had at least 1 flagged slab cache.  Fail unless we're also non-fatal.
        if [[ $NONFATAL == 1 ]]; then
            if [[ -n "$MSG" ]]; then
                log "$MSG (non-fatal)"
            fi
            return 0
        fi
        ((ALL--))
        die "$ALL" "$MSG (last of $ALL)"
        return "$ALL"
    fi
    return 0
}

# Flags slab caches whose active_objs count has reached <threshold>.
# Usage: check_slabinfo_active_objs [-0] [-a] [-l] [-s] [-m <match>] [-e <action>] <threshold>
# Thin wrapper: delegates to nhc_slabinfo_check with column id 1 (active_objs).
function check_slabinfo_active_objs() {
    nhc_slabinfo_check "${FUNCNAME[0]}" 1 "$@"
}

# Flags slab caches whose num_objs count has reached <threshold>.
# Usage: check_slabinfo_num_objs [-0] [-a] [-l] [-s] [-m <match>] [-e <action>] <threshold>
# Thin wrapper: delegates to nhc_slabinfo_check with column id 2 (num_objs).
function check_slabinfo_num_objs() {
    nhc_slabinfo_check "${FUNCNAME[0]}" 2 "$@"
}

# Flags slab caches whose active_slabs count has reached <threshold>.
# Usage: check_slabinfo_active_slabs [-0] [-a] [-l] [-s] [-m <match>] [-e <action>] <threshold>
# Thin wrapper: delegates to nhc_slabinfo_check with column id 3 (active_slabs).
function check_slabinfo_active_slabs() {
    nhc_slabinfo_check "${FUNCNAME[0]}" 3 "$@"
}

# Flags slab caches whose num_slabs count has reached <threshold>.
# Usage: check_slabinfo_num_slabs [-0] [-a] [-l] [-s] [-m <match>] [-e <action>] <threshold>
# Thin wrapper: delegates to nhc_slabinfo_check with column id 4 (num_slabs).
function check_slabinfo_num_slabs() {
    nhc_slabinfo_check "${FUNCNAME[0]}" 4 "$@"
}