268 changes: 268 additions & 0 deletions joshua_dashboard.sh
@@ -0,0 +1,268 @@
#!/bin/bash
# joshua_dashboard.sh - Health dashboard for Joshua clusters
#
# Usage: joshua_dashboard.sh --context <context> [--watch]
#
# Script was generated by Claude Opus 4.5.
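#
# Requires kubectl and jq on the machine running the dashboard.
#
# Example invocations (the context and namespace names are placeholders):
#   joshua_dashboard.sh --context my-cluster --namespace joshua
#   joshua_dashboard.sh -c my-cluster --watch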

set -o pipefail

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
DIM='\033[2m'
NC='\033[0m'

CONTEXT="${JOSHUA_CONTEXT:-}"
WATCH_MODE=false
NAMESPACE="${JOSHUA_NAMESPACE:-default}"

while [[ $# -gt 0 ]]; do
case "$1" in
--context|-c) CONTEXT="$2"; shift 2 ;;
--watch|-w) WATCH_MODE=true; shift ;;
--namespace|-n) NAMESPACE="$2"; shift 2 ;;
--help|-h)
echo "Usage: $0 --context <k8s-context> [--watch] [--namespace <ns>]"
exit 0 ;;
*) echo "Unknown option: $1" >&2; exit 1 ;;
esac
done

if [ -z "$CONTEXT" ]; then
echo "Error: --context is required (or set JOSHUA_CONTEXT)"
exit 1
fi

# kubectl wrapper pinned to the selected context and namespace.
kc() { kubectl --context "$CONTEXT" -n "$NAMESPACE" "$@"; }

# color_val <value> <warn> <err>: print the value red at/above err,
# yellow at/above warn, green otherwise. Non-numeric values fall
# through to green; the test errors are silenced with 2>/dev/null.
color_val() {
local val="$1" warn="$2" err="$3"
if [ "$val" -ge "$err" ] 2>/dev/null; then echo -e "${RED}$val${NC}"
elif [ "$val" -ge "$warn" ] 2>/dev/null; then echo -e "${YELLOW}$val${NC}"
else echo -e "${GREEN}$val${NC}"; fi
}
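
# Illustrative use with the queue thresholds applied below (1000 warn,
# 10000 err): a depth of 500 prints green, 1500 yellow, 20000 red, e.g.
#   color_val 1500 1000 10000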

show_dashboard() {
[ "$WATCH_MODE" = true ] && clear

echo -e "${BOLD}${CYAN}JOSHUA DASHBOARD${NC} ${DIM}$(date '+%H:%M:%S')${NC} ${DIM}ns:${NC}$NAMESPACE"
echo -e "${DIM}${CONTEXT}${NC}"
echo ""

# Scalers
local scaler1 scaler2
scaler1=$(kc get pods -l app=agent-scaler -o jsonpath='{.items[0].status.phase}' 2>/dev/null)
scaler2=$(kc get pods -l app=agent-scaler-rhel9 -o jsonpath='{.items[0].status.phase}' 2>/dev/null)

echo -e "${BOLD}Scalers${NC}"
printf " centos7: %s rhel9: %s\n" \
"$([ "$scaler1" = "Running" ] && echo -e "${GREEN}Running${NC}" || echo -e "${RED}${scaler1:-NOT FOUND}${NC}")" \
"$([ "$scaler2" = "Running" ] && echo -e "${GREEN}Running${NC}" || echo -e "${YELLOW}${scaler2:-NOT FOUND}${NC}")"

# Queue depth & FDB check - query both scalers
local scaler1_pod scaler2_pod queue1 queue2 fdb1_status fdb2_status queue_depth
scaler1_pod=$(kc get pods -l "app=agent-scaler" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
scaler2_pod=$(kc get pods -l "app=agent-scaler-rhel9" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)

if [ -n "$scaler1_pod" ]; then
queue1=$(kc exec -i "$scaler1_pod" -- python3 /tools/ensemble_count.py -C /etc/foundationdb/fdb.cluster 2>/dev/null)
[[ "$queue1" =~ ^[0-9]+$ ]] && fdb1_status="OK" || fdb1_status="ERROR"
else
fdb1_status="N/A"
fi

if [ -n "$scaler2_pod" ]; then
queue2=$(kc exec -i "$scaler2_pod" -- python3 /tools/ensemble_count.py -C /etc/foundationdb/fdb.cluster 2>/dev/null)
[[ "$queue2" =~ ^[0-9]+$ ]] && fdb2_status="OK" || fdb2_status="ERROR"
else
fdb2_status="N/A"
fi

# Prefer the centos7 queue depth; fall back to the rhel9 reading
[[ "$queue1" =~ ^[0-9]+$ ]] && queue_depth="$queue1" || queue_depth="$queue2"

echo ""
echo -e "${BOLD}Queue${NC}"
if [ -n "$queue_depth" ] && [[ "$queue_depth" =~ ^[0-9]+$ ]]; then
printf " Pending tests: %s\n" "$(color_val "$queue_depth" 1000 10000)"
else
echo -e " Pending tests: ${RED}QUERY FAILED${NC}"
fi
printf " FDB: centos7=%s rhel9=%s\n" \
"$([ "$fdb1_status" = "OK" ] && echo -e "${GREEN}OK${NC}" || echo -e "${RED}$fdb1_status${NC}")" \
"$([ "$fdb2_status" = "OK" ] && echo -e "${GREEN}OK${NC}" || echo -e "${RED}$fdb2_status${NC}")"

# Agent counts - compact
local c_run c_pend c_fail c_done r_run r_pend r_fail r_done
c_run=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')
c_pend=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ')
c_fail=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Failed --no-headers 2>/dev/null | wc -l | tr -d ' ')
c_done=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')
r_run=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')
r_pend=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ')
r_fail=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Failed --no-headers 2>/dev/null | wc -l | tr -d ' ')
r_done=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')

echo ""
echo -e "${BOLD}Agents${NC}"
printf " centos7: ${GREEN}%s${NC} run, %s pend, %s fail, %s done\n" "$c_run" \
"$([ "$c_pend" -gt 0 ] && echo -e "${YELLOW}$c_pend${NC}" || echo "$c_pend")" \
"$([ "$c_fail" -gt 0 ] && echo -e "${RED}$c_fail${NC}" || echo "$c_fail")" \
"$([ "$c_done" -gt 100 ] && echo -e "${YELLOW}$c_done${NC}" || echo "$c_done")"
printf " rhel9: ${GREEN}%s${NC} run, %s pend, %s fail, %s done\n" "$r_run" \
"$([ "$r_pend" -gt 0 ] && echo -e "${YELLOW}$r_pend${NC}" || echo "$r_pend")" \
"$([ "$r_fail" -gt 0 ] && echo -e "${RED}$r_fail${NC}" || echo "$r_fail")" \
"$([ "$r_done" -gt 100 ] && echo -e "${YELLOW}$r_done${NC}" || echo "$r_done")"

# Pod age distribution, bucketed from the creation timestamps of active agent jobs
local current_epoch pods_lt_1h=0 pods_1h_6h=0 pods_6h_24h=0 pods_gt_24h=0
current_epoch=$(date +%s)

while read -r _ pod_timestamp; do
[[ -z "$pod_timestamp" ]] && continue
# GNU date first; fall back to BSD date, parsing in UTC since the format string treats the trailing Z as a literal
pod_epoch=$(date -d "$pod_timestamp" +%s 2>/dev/null || TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$pod_timestamp" +%s 2>/dev/null)
[[ -z "$pod_epoch" ]] && continue
pod_age=$((current_epoch - pod_epoch))
if [[ "$pod_age" -lt 3600 ]]; then ((pods_lt_1h++))
elif [[ "$pod_age" -lt 21600 ]]; then ((pods_1h_6h++))
elif [[ "$pod_age" -lt 86400 ]]; then ((pods_6h_24h++))
else ((pods_gt_24h++)); fi
done < <(kc get jobs -o json 2>/dev/null | jq -r '.items[] | select(.status.active > 0) | .metadata.name + " " + .metadata.creationTimestamp' | grep -E '^joshua-(rhel9-)?agent-' || true)

echo ""
echo -e "${BOLD}Pod Ages${NC}"
printf " <1h: %s 1-6h: %s 6-24h: %s >24h: %s\n" \
"$pods_lt_1h" "$pods_1h_6h" \
"$([ "$pods_6h_24h" -gt 0 ] && echo -e "${YELLOW}$pods_6h_24h${NC}" || echo "$pods_6h_24h")" \
"$([ "$pods_gt_24h" -gt 0 ] && echo -e "${RED}$pods_gt_24h${NC}" || echo "$pods_gt_24h")"

# Problems section - only show if there are issues
local problems=""

# High restarts
local high_restarts
high_restarts=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-")) | select(.status.containerStatuses[]?.restartCount > 3) | .metadata.name + "(" + (.status.containerStatuses[0].restartCount|tostring) + ")"' | head -3 | tr '\n' ' ')
[ -n "$high_restarts" ] && problems+=" ${YELLOW}High restarts:${NC} $high_restarts\n"

# OOMKilled
local oom
oom=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-")) | select(.status.containerStatuses[]?.lastState.terminated.reason == "OOMKilled") | .metadata.name' | head -3 | tr '\n' ' ')
[ -n "$oom" ] && problems+=" ${RED}OOMKilled:${NC} $oom\n"

# CrashLoop
local crash
crash=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-")) | select(.status.containerStatuses[]?.state.waiting.reason == "CrashLoopBackOff") | .metadata.name' | head -3 | tr '\n' ' ')
[ -n "$crash" ] && problems+=" ${RED}CrashLoop:${NC} $crash\n"

# Failed jobs (agent jobs with a Failed=True condition)
local failed_recent=0 failed_old=0
while read -r _ job_timestamp; do
[[ -z "$job_timestamp" ]] && continue
job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null || TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$job_timestamp" +%s 2>/dev/null)
[[ -z "$job_epoch" ]] && continue
if [[ $((current_epoch - job_epoch)) -lt 86400 ]]; then ((failed_recent++)); else ((failed_old++)); fi
done < <(kc get jobs -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' | grep -E '^joshua-(rhel9-)?agent-' || true)

[ "$failed_recent" -gt 0 ] && problems+=" ${YELLOW}Failed <1d:${NC} $failed_recent (kept for debug)\n"
[ "$failed_old" -gt 0 ] && problems+=" ${RED}Failed >1d:${NC} $failed_old (should cleanup)\n"

# Completed pods not cleaned up
local total_done=$((c_done + r_done))
[ "$total_done" -gt 100 ] && problems+=" ${YELLOW}Completed pods:${NC} $total_done (cleanup not running?)\n"

# Pending pods
if [ $((c_pend + r_pend)) -gt 0 ]; then
local pend_reasons
pend_reasons=$(kc get pods --field-selector=status.phase=Pending -o json 2>/dev/null | \
jq -r '.items[] | select(.metadata.name | test("^joshua-")) | (.status.conditions[]? | select(.type=="PodScheduled" and .status=="False") | .reason) // "Unknown"' | \
sort | uniq -c | sort -rn | head -3 | awk '{print $2"("$1")"}' | tr '\n' ' ')
[ -n "$pend_reasons" ] && problems+=" ${YELLOW}Pending:${NC} $pend_reasons\n"
fi

if [ -n "$problems" ]; then
echo ""
echo -e "${BOLD}Problems${NC}"
echo -e "$problems"
fi

# Active ensembles - query both scalers
local script_dir joshua_py ensembles1 ensembles2
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
joshua_py=$(find "$script_dir" -name "joshua.py" -path "*/joshua/joshua.py" -print -quit 2>/dev/null)

echo ""
echo -e "${BOLD}Ensembles${NC}"

if [ -z "$joshua_py" ]; then
echo -e " ${RED}joshua.py not found in $script_dir${NC}"
elif [ -z "$scaler1_pod" ] && [ -z "$scaler2_pod" ]; then
echo -e " ${RED}No scaler pods available${NC}"
else
# Try the centos7 scaler first. The sed patch stubs out the lxml import
# and flattens the package-relative joshua_model import so joshua.py can
# run standalone from /tmp inside the pod (PYTHONPATH=/tools supplies joshua_model).
if [ -n "$scaler1_pod" ]; then
sed -e 's/import lxml.etree as le/le = None/' \
-e 's/from \. import joshua_model/import joshua_model/' \
"$joshua_py" | kc exec -i "$scaler1_pod" -- tee /tmp/joshua.py > /dev/null 2>&1
ensembles1=$(kc exec -i "$scaler1_pod" -- env PYTHONPATH=/tools python3 /tmp/joshua.py -C /etc/foundationdb/fdb.cluster list 2>&1 | head -15)
fi

# Then the rhel9 scaler, with the same patch
if [ -n "$scaler2_pod" ]; then
sed -e 's/import lxml.etree as le/le = None/' \
-e 's/from \. import joshua_model/import joshua_model/' \
"$joshua_py" | kc exec -i "$scaler2_pod" -- tee /tmp/joshua.py > /dev/null 2>&1
ensembles2=$(kc exec -i "$scaler2_pod" -- env PYTHONPATH=/tools python3 /tmp/joshua.py -C /etc/foundationdb/fdb.cluster list 2>&1 | head -15)
fi

# Show ensembles from both scalers
if [ -n "$scaler1_pod" ]; then
echo -e " ${DIM}centos7:${NC}"
if echo "$ensembles1" | grep -qE '^[[:space:]]+[0-9]{8}-'; then
echo "$ensembles1" | grep -E '^[[:space:]]+[0-9]{8}-' | while read -r line; do echo " $line"; done
else
echo -e " ${DIM}(none)${NC}"
fi
fi

if [ -n "$scaler2_pod" ]; then
echo -e " ${DIM}rhel9:${NC}"
if echo "$ensembles2" | grep -qE '^[[:space:]]+[0-9]{8}-'; then
echo "$ensembles2" | grep -E '^[[:space:]]+[0-9]{8}-' | while read -r line; do echo " $line"; done
else
echo -e " ${DIM}(none)${NC}"
fi
fi
fi

# Node distribution - compact
local node_dist
node_dist=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-(rhel9-)?agent-")) | select(.status.phase == "Running") | .spec.nodeName' | sort | uniq -c | sort -rn | head -5)

if [ -n "$node_dist" ]; then
echo ""
echo -e "${BOLD}Nodes${NC} ${DIM}(top 5)${NC}"
echo "$node_dist" | while read -r count node; do
if [ "$count" -gt 50 ]; then
printf " ${YELLOW}%4s${NC} %s\n" "$count" "$node"
else
printf " %4s %s\n" "$count" "$node"
fi
done
fi

if [ "$WATCH_MODE" = true ]; then
echo ""
echo -e "${DIM}Refresh in 30s (Ctrl+C to exit)${NC}"
fi
}

if [ "$WATCH_MODE" = true ]; then
while true; do show_dashboard; sleep 30; done
else
show_dashboard
fi