diff --git a/joshua_dashboard.sh b/joshua_dashboard.sh
new file mode 100755
index 0000000..5c279c0
--- /dev/null
+++ b/joshua_dashboard.sh
@@ -0,0 +1,268 @@
+#!/bin/bash
+# joshua_dashboard.sh - Health dashboard for Joshua clusters
+#
+# Usage: joshua_dashboard.sh --context <context> [--watch]
+#
+# Script was generated by Claude Opus 4.5.
+
+set -o pipefail
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+BOLD='\033[1m'
+DIM='\033[2m'
+NC='\033[0m'
+
+CONTEXT="${JOSHUA_CONTEXT:-}"
+WATCH_MODE=false
+NAMESPACE="${JOSHUA_NAMESPACE:-default}"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --context|-c) CONTEXT="$2"; shift 2 ;;
+        --watch|-w) WATCH_MODE=true; shift ;;
+        --namespace|-n) NAMESPACE="$2"; shift 2 ;;
+        --help|-h)
+            echo "Usage: $0 --context <context> [--watch] [--namespace <namespace>]"
+            exit 0 ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+if [ -z "$CONTEXT" ]; then
+    echo "Error: --context is required (or set JOSHUA_CONTEXT)"
+    exit 1
+fi
+
+# All kubectl calls go through the selected context and namespace
+kc() { kubectl --context "$CONTEXT" -n "$NAMESPACE" "$@"; }
+
+# color_val <value> <warn-threshold> <error-threshold>: green below warn,
+# yellow at/above warn, red at/above error
+color_val() {
+    local val="$1" warn="$2" err="$3"
+    if [ "$val" -ge "$err" ] 2>/dev/null; then echo -e "${RED}$val${NC}"
+    elif [ "$val" -ge "$warn" ] 2>/dev/null; then echo -e "${YELLOW}$val${NC}"
+    else echo -e "${GREEN}$val${NC}"; fi
+}
+
+show_dashboard() {
+    [ "$WATCH_MODE" = true ] && clear
+
+    echo -e "${BOLD}${CYAN}JOSHUA DASHBOARD${NC} ${DIM}$(date '+%H:%M:%S')${NC} ${DIM}ns:${NC}$NAMESPACE"
+    echo -e "${DIM}${CONTEXT}${NC}"
+    echo ""
+
+    # Scalers
+    local scaler1 scaler2
+    scaler1=$(kc get pods -l app=agent-scaler -o jsonpath='{.items[0].status.phase}' 2>/dev/null)
+    scaler2=$(kc get pods -l app=agent-scaler-rhel9 -o jsonpath='{.items[0].status.phase}' 2>/dev/null)
+
+    echo -e "${BOLD}Scalers${NC}"
+    printf " centos7: %s rhel9: %s\n" \
+        "$([ "$scaler1" = "Running" ] && echo -e "${GREEN}Running${NC}" || echo -e "${RED}${scaler1:-NOT FOUND}${NC}")" \
+        "$([ "$scaler2" = "Running" ] && echo -e "${GREEN}Running${NC}" || echo -e "${YELLOW}${scaler2:-NOT FOUND}${NC}")"
+
+    # Queue depth & FDB check - query both scalers
+    local scaler1_pod scaler2_pod queue1 queue2 fdb1_status fdb2_status queue_depth
+    scaler1_pod=$(kc get pods -l "app=agent-scaler" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
+    scaler2_pod=$(kc get pods -l "app=agent-scaler-rhel9" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
+
+    if [ -n "$scaler1_pod" ]; then
+        queue1=$(kc exec -i "$scaler1_pod" -- python3 /tools/ensemble_count.py -C /etc/foundationdb/fdb.cluster 2>/dev/null)
+        [[ "$queue1" =~ ^[0-9]+$ ]] && fdb1_status="OK" || fdb1_status="ERROR"
+    else
+        fdb1_status="N/A"
+    fi
+
+    if [ -n "$scaler2_pod" ]; then
+        queue2=$(kc exec -i "$scaler2_pod" -- python3 /tools/ensemble_count.py -C /etc/foundationdb/fdb.cluster 2>/dev/null)
+        [[ "$queue2" =~ ^[0-9]+$ ]] && fdb2_status="OK" || fdb2_status="ERROR"
+    else
+        fdb2_status="N/A"
+    fi
+
+    # Use whichever queue depth we got
+    [[ "$queue1" =~ ^[0-9]+$ ]] && queue_depth="$queue1" || queue_depth="$queue2"
+
+    echo ""
+    echo -e "${BOLD}Queue${NC}"
+    if [ -n "$queue_depth" ] && [[ "$queue_depth" =~ ^[0-9]+$ ]]; then
+        printf " Pending tests: %s\n" "$(color_val "$queue_depth" 1000 10000)"
+    else
+        echo -e " Pending tests: ${RED}QUERY FAILED${NC}"
+    fi
+    printf " FDB: centos7=%s rhel9=%s\n" \
+        "$([ "$fdb1_status" = "OK" ] && echo -e "${GREEN}OK${NC}" || echo -e "${RED}$fdb1_status${NC}")" \
+        "$([ "$fdb2_status" = "OK" ] && echo -e "${GREEN}OK${NC}" || echo -e "${RED}$fdb2_status${NC}")"
+
+    # Agent counts - compact
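+    # One `kubectl get pods --field-selector=status.phase=...` call per phase and per
+    # agent flavor (eight list calls in total); simple, at the cost of extra API requests.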
+    local c_run c_pend c_fail c_done r_run r_pend r_fail r_done
+    c_run=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')
+    c_pend=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ')
+    c_fail=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Failed --no-headers 2>/dev/null | wc -l | tr -d ' ')
+    c_done=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')
+    r_run=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')
+    r_pend=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ')
+    r_fail=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Failed --no-headers 2>/dev/null | wc -l | tr -d ' ')
+    r_done=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')
+
+    echo ""
+    echo -e "${BOLD}Agents${NC}"
+    printf " centos7: ${GREEN}%s${NC} run, %s pend, %s fail, %s done\n" "$c_run" \
+        "$([ "$c_pend" -gt 0 ] && echo -e "${YELLOW}$c_pend${NC}" || echo "$c_pend")" \
+        "$([ "$c_fail" -gt 0 ] && echo -e "${RED}$c_fail${NC}" || echo "$c_fail")" \
+        "$([ "$c_done" -gt 100 ] && echo -e "${YELLOW}$c_done${NC}" || echo "$c_done")"
+    printf " rhel9: ${GREEN}%s${NC} run, %s pend, %s fail, %s done\n" "$r_run" \
+        "$([ "$r_pend" -gt 0 ] && echo -e "${YELLOW}$r_pend${NC}" || echo "$r_pend")" \
+        "$([ "$r_fail" -gt 0 ] && echo -e "${RED}$r_fail${NC}" || echo "$r_fail")" \
+        "$([ "$r_done" -gt 100 ] && echo -e "${YELLOW}$r_done${NC}" || echo "$r_done")"
+
+    # Pod age distribution (derived from the creation time of active agent Jobs)
+    local current_epoch pods_lt_1h=0 pods_1h_6h=0 pods_6h_24h=0 pods_gt_24h=0
+    current_epoch=$(date +%s)
+
+    while read -r _ pod_timestamp; do
+        [[ -z "$pod_timestamp" ]] && continue
+        # GNU date first, BSD date as a fallback
+        pod_epoch=$(date -d "$pod_timestamp" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$pod_timestamp" +%s 2>/dev/null)
+        [[ -z "$pod_epoch" ]] && continue
+        pod_age=$((current_epoch - pod_epoch))
+        if [[ "$pod_age" -lt 3600 ]]; then ((pods_lt_1h++))
+        elif [[ "$pod_age" -lt 21600 ]]; then ((pods_1h_6h++))
+        elif [[ "$pod_age" -lt 86400 ]]; then ((pods_6h_24h++))
+        else ((pods_gt_24h++)); fi
+    done < <(kc get jobs -o json 2>/dev/null | jq -r '.items[] | select(.status.active > 0) | .metadata.name + " " + .metadata.creationTimestamp' | grep -E '^joshua-(rhel9-)?agent-' || true)
+
+    echo ""
+    echo -e "${BOLD}Pod Ages${NC}"
+    printf " <1h: %s 1-6h: %s 6-24h: %s >24h: %s\n" \
+        "$pods_lt_1h" "$pods_1h_6h" \
+        "$([ "$pods_6h_24h" -gt 0 ] && echo -e "${YELLOW}$pods_6h_24h${NC}" || echo "$pods_6h_24h")" \
+        "$([ "$pods_gt_24h" -gt 0 ] && echo -e "${RED}$pods_gt_24h${NC}" || echo "$pods_gt_24h")"
+
+    # Problems section - only show if there are issues
+    local problems=""
+
+    # High restarts
+    local high_restarts
+    high_restarts=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-")) | select(.status.containerStatuses[]?.restartCount > 3) | .metadata.name + "(" + (.status.containerStatuses[0].restartCount|tostring) + ")"' | head -3 | tr '\n' ' ')
+    [ -n "$high_restarts" ] && problems+=" ${YELLOW}High restarts:${NC} $high_restarts\n"
+
+    # OOMKilled
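+    # lastState.terminated describes the previous container exit, so this also
+    # flags agents that were OOM-killed and have since restarted.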
+    local oom
+    oom=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-")) | select(.status.containerStatuses[]?.lastState.terminated.reason == "OOMKilled") | .metadata.name' | head -3 | tr '\n' ' ')
+    [ -n "$oom" ] && problems+=" ${RED}OOMKilled:${NC} $oom\n"
+
+    # CrashLoop
+    local crash
+    crash=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-")) | select(.status.containerStatuses[]?.state.waiting.reason == "CrashLoopBackOff") | .metadata.name' | head -3 | tr '\n' ' ')
+    [ -n "$crash" ] && problems+=" ${RED}CrashLoop:${NC} $crash\n"
+
+    # Failed pods
+    local failed_recent=0 failed_old=0
+    while read -r _ job_timestamp; do
+        [[ -z "$job_timestamp" ]] && continue
+        job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$job_timestamp" +%s 2>/dev/null)
+        [[ -z "$job_epoch" ]] && continue
+        if [[ $((current_epoch - job_epoch)) -lt 86400 ]]; then ((failed_recent++)); else ((failed_old++)); fi
+    done < <(kc get jobs -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' | grep -E '^joshua-(rhel9-)?agent-' || true)
+
+    [ "$failed_recent" -gt 0 ] && problems+=" ${YELLOW}Failed <1d:${NC} $failed_recent (kept for debug)\n"
+    [ "$failed_old" -gt 0 ] && problems+=" ${RED}Failed >1d:${NC} $failed_old (should cleanup)\n"
+
+    # Completed pods not cleaned up
+    local total_done=$((c_done + r_done))
+    [ "$total_done" -gt 100 ] && problems+=" ${YELLOW}Completed pods:${NC} $total_done (cleanup not running?)\n"
+
+    # Pending pods
+    if [ $((c_pend + r_pend)) -gt 0 ]; then
+        local pend_reasons
+        pend_reasons=$(kc get pods --field-selector=status.phase=Pending -o json 2>/dev/null | \
+            jq -r '.items[] | select(.metadata.name | test("^joshua-")) | (.status.conditions[]? | select(.type=="PodScheduled" and .status=="False") | .reason) // "Unknown"' | \
+            sort | uniq -c | head -3 | awk '{print $2"("$1")"}' | tr '\n' ' ')
+        [ -n "$pend_reasons" ] && problems+=" ${YELLOW}Pending:${NC} $pend_reasons\n"
+    fi
+
+    if [ -n "$problems" ]; then
+        echo ""
+        echo -e "${BOLD}Problems${NC}"
+        echo -e "$problems"
+    fi
+
+    # Active ensembles - query both scalers
+    local script_dir joshua_py ensembles1 ensembles2
+    script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+    joshua_py=$(find "$script_dir" -name "joshua.py" -path "*/joshua/joshua.py" -print -quit 2>/dev/null)
+
+    echo ""
+    echo -e "${BOLD}Ensembles${NC}"
+
+    if [ -z "$joshua_py" ]; then
+        echo -e " ${RED}joshua.py not found in $script_dir${NC}"
+    elif [ -z "$scaler1_pod" ] && [ -z "$scaler2_pod" ]; then
+        echo -e " ${RED}No scaler pods available${NC}"
+    else
+        # Try centos7 scaler first
+        if [ -n "$scaler1_pod" ]; then
+            # Rewrite joshua.py so it can run standalone inside the pod: stub out the
+            # lxml import and turn the package-relative import into an absolute one.
+            sed -e 's/import lxml.etree as le/le = None/' \
+                -e 's/from \. import joshua_model/import joshua_model/' \
+                "$joshua_py" | kc exec -i "$scaler1_pod" -- tee /tmp/joshua.py > /dev/null 2>&1
+            ensembles1=$(kc exec -i "$scaler1_pod" -- env PYTHONPATH=/tools python3 /tmp/joshua.py -C /etc/foundationdb/fdb.cluster list 2>&1 | head -15)
+        fi
+
+        # Try rhel9 scaler
+        if [ -n "$scaler2_pod" ]; then
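+            # Same standalone rewrite of joshua.py, pushed into the rhel9 scaler pod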
+            sed -e 's/import lxml.etree as le/le = None/' \
+                -e 's/from \. import joshua_model/import joshua_model/' \
+                "$joshua_py" | kc exec -i "$scaler2_pod" -- tee /tmp/joshua.py > /dev/null 2>&1
+            ensembles2=$(kc exec -i "$scaler2_pod" -- env PYTHONPATH=/tools python3 /tmp/joshua.py -C /etc/foundationdb/fdb.cluster list 2>&1 | head -15)
+        fi
+
+        # Show ensembles from both scalers
+        if [ -n "$scaler1_pod" ]; then
+            echo -e " ${DIM}centos7:${NC}"
+            if echo "$ensembles1" | grep -qE '^[[:space:]]+[0-9]{8}-'; then
+                echo "$ensembles1" | grep -E '^[[:space:]]+[0-9]{8}-' | while read -r line; do echo " $line"; done
+            else
+                echo -e " ${DIM}(none)${NC}"
+            fi
+        fi
+
+        if [ -n "$scaler2_pod" ]; then
+            echo -e " ${DIM}rhel9:${NC}"
+            if echo "$ensembles2" | grep -qE '^[[:space:]]+[0-9]{8}-'; then
+                echo "$ensembles2" | grep -E '^[[:space:]]+[0-9]{8}-' | while read -r line; do echo " $line"; done
+            else
+                echo -e " ${DIM}(none)${NC}"
+            fi
+        fi
+    fi
+
+    # Node distribution - compact
+    local node_dist
+    node_dist=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-(rhel9-)?agent-")) | select(.status.phase == "Running") | .spec.nodeName' | sort | uniq -c | sort -rn | head -5)
+
+    if [ -n "$node_dist" ]; then
+        echo ""
+        echo -e "${BOLD}Nodes${NC} ${DIM}(top 5)${NC}"
+        echo "$node_dist" | while read -r count node; do
+            if [ "$count" -gt 50 ]; then
+                printf " ${YELLOW}%4s${NC} %s\n" "$count" "$node"
+            else
+                printf " %4s %s\n" "$count" "$node"
+            fi
+        done
+    fi
+
+    if [ "$WATCH_MODE" = true ]; then
+        echo ""
+        echo -e "${DIM}Refresh in 30s (Ctrl+C to exit)${NC}"
+    fi
+}
+
+if [ "$WATCH_MODE" = true ]; then
+    while true; do show_dashboard; sleep 30; done
+else
+    show_dashboard
+fi
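Example invocation (the context and namespace names here are placeholders):

    ./joshua_dashboard.sh --context my-test-cluster --namespace joshua --watch

With --watch the dashboard clears the screen and refreshes every 30 seconds; without it, the report is printed once and the script exits.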