268 changes: 268 additions & 0 deletions joshua_dashboard.sh
@@ -0,0 +1,268 @@
#!/bin/bash
# joshua_dashboard.sh - Health dashboard for Joshua clusters
#
# Usage: joshua_dashboard.sh --context <context> [--watch]
#
# Script was generated by Claude Opus 4.5.
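#
# Requires kubectl and jq on the machine running the dashboard.
#
# Example invocations (the context and namespace names are placeholders):
#   joshua_dashboard.sh --context my-cluster --namespace joshua
#   joshua_dashboard.sh -c my-cluster --watch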

set -o pipefail

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
DIM='\033[2m'
NC='\033[0m'

CONTEXT="${JOSHUA_CONTEXT:-}"
WATCH_MODE=false
NAMESPACE="${JOSHUA_NAMESPACE:-default}"

while [[ $# -gt 0 ]]; do
case "$1" in
--context|-c) CONTEXT="$2"; shift 2 ;;
--watch|-w) WATCH_MODE=true; shift ;;
--namespace|-n) NAMESPACE="$2"; shift 2 ;;
--help|-h)
echo "Usage: $0 --context <k8s-context> [--watch] [--namespace <ns>]"
exit 0 ;;
*) echo "Unknown option: $1" >&2; exit 1 ;;
esac
done

if [ -z "$CONTEXT" ]; then
echo "Error: --context is required (or set JOSHUA_CONTEXT)"
exit 1
fi

# kubectl wrapper pinned to the selected context and namespace.
kc() { kubectl --context "$CONTEXT" -n "$NAMESPACE" "$@"; }

# color_val <value> <warn> <err>: print the value red at/above err,
# yellow at/above warn, green otherwise. Non-numeric values fall
# through to green; the test errors are silenced with 2>/dev/null.
color_val() {
local val="$1" warn="$2" err="$3"
if [ "$val" -ge "$err" ] 2>/dev/null; then echo -e "${RED}$val${NC}"
elif [ "$val" -ge "$warn" ] 2>/dev/null; then echo -e "${YELLOW}$val${NC}"
else echo -e "${GREEN}$val${NC}"; fi
}
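
# Illustrative use with the queue thresholds applied below (1000 warn,
# 10000 err): a depth of 500 prints green, 1500 yellow, 20000 red, e.g.
#   color_val 1500 1000 10000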

show_dashboard() {
[ "$WATCH_MODE" = true ] && clear

echo -e "${BOLD}${CYAN}JOSHUA DASHBOARD${NC} ${DIM}$(date '+%H:%M:%S')${NC} ${DIM}ns:${NC}$NAMESPACE"
echo -e "${DIM}${CONTEXT}${NC}"
echo ""

# Scalers
local scaler1 scaler2
scaler1=$(kc get pods -l app=agent-scaler -o jsonpath='{.items[0].status.phase}' 2>/dev/null)
scaler2=$(kc get pods -l app=agent-scaler-rhel9 -o jsonpath='{.items[0].status.phase}' 2>/dev/null)

echo -e "${BOLD}Scalers${NC}"
printf " centos7: %s rhel9: %s\n" \
"$([ "$scaler1" = "Running" ] && echo -e "${GREEN}Running${NC}" || echo -e "${RED}${scaler1:-NOT FOUND}${NC}")" \
"$([ "$scaler2" = "Running" ] && echo -e "${GREEN}Running${NC}" || echo -e "${YELLOW}${scaler2:-NOT FOUND}${NC}")"

# Queue depth & FDB check - query both scalers
local scaler1_pod scaler2_pod queue1 queue2 fdb1_status fdb2_status queue_depth
scaler1_pod=$(kc get pods -l "app=agent-scaler" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
scaler2_pod=$(kc get pods -l "app=agent-scaler-rhel9" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)

if [ -n "$scaler1_pod" ]; then
queue1=$(kc exec -i "$scaler1_pod" -- python3 /tools/ensemble_count.py -C /etc/foundationdb/fdb.cluster 2>/dev/null)
[[ "$queue1" =~ ^[0-9]+$ ]] && fdb1_status="OK" || fdb1_status="ERROR"
else
fdb1_status="N/A"
fi

if [ -n "$scaler2_pod" ]; then
queue2=$(kc exec -i "$scaler2_pod" -- python3 /tools/ensemble_count.py -C /etc/foundationdb/fdb.cluster 2>/dev/null)
[[ "$queue2" =~ ^[0-9]+$ ]] && fdb2_status="OK" || fdb2_status="ERROR"
else
fdb2_status="N/A"
fi

# Prefer the centos7 queue depth; fall back to the rhel9 reading
[[ "$queue1" =~ ^[0-9]+$ ]] && queue_depth="$queue1" || queue_depth="$queue2"

echo ""
echo -e "${BOLD}Queue${NC}"
if [ -n "$queue_depth" ] && [[ "$queue_depth" =~ ^[0-9]+$ ]]; then
printf " Pending tests: %s\n" "$(color_val "$queue_depth" 1000 10000)"
else
echo -e " Pending tests: ${RED}QUERY FAILED${NC}"
fi
printf " FDB: centos7=%s rhel9=%s\n" \
"$([ "$fdb1_status" = "OK" ] && echo -e "${GREEN}OK${NC}" || echo -e "${RED}$fdb1_status${NC}")" \
"$([ "$fdb2_status" = "OK" ] && echo -e "${GREEN}OK${NC}" || echo -e "${RED}$fdb2_status${NC}")"

# Agent counts - compact
local c_run c_pend c_fail c_done r_run r_pend r_fail r_done
c_run=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')
c_pend=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ')
c_fail=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Failed --no-headers 2>/dev/null | wc -l | tr -d ' ')
c_done=$(kc get pods -l app=joshua-agent --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')
r_run=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')
r_pend=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ')
r_fail=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Failed --no-headers 2>/dev/null | wc -l | tr -d ' ')
r_done=$(kc get pods -l app=joshua-rhel9-agent --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')

echo ""
echo -e "${BOLD}Agents${NC}"
printf " centos7: ${GREEN}%s${NC} run, %s pend, %s fail, %s done\n" "$c_run" \
"$([ "$c_pend" -gt 0 ] && echo -e "${YELLOW}$c_pend${NC}" || echo "$c_pend")" \
"$([ "$c_fail" -gt 0 ] && echo -e "${RED}$c_fail${NC}" || echo "$c_fail")" \
"$([ "$c_done" -gt 100 ] && echo -e "${YELLOW}$c_done${NC}" || echo "$c_done")"
printf " rhel9: ${GREEN}%s${NC} run, %s pend, %s fail, %s done\n" "$r_run" \
"$([ "$r_pend" -gt 0 ] && echo -e "${YELLOW}$r_pend${NC}" || echo "$r_pend")" \
"$([ "$r_fail" -gt 0 ] && echo -e "${RED}$r_fail${NC}" || echo "$r_fail")" \
"$([ "$r_done" -gt 100 ] && echo -e "${YELLOW}$r_done${NC}" || echo "$r_done")"

# Pod age distribution, bucketed from the creation timestamps of active agent jobs
local current_epoch pods_lt_1h=0 pods_1h_6h=0 pods_6h_24h=0 pods_gt_24h=0
current_epoch=$(date +%s)

while read -r _ pod_timestamp; do
[[ -z "$pod_timestamp" ]] && continue
# GNU date first; fall back to BSD date, parsing in UTC since the format string treats the trailing Z as a literal
pod_epoch=$(date -d "$pod_timestamp" +%s 2>/dev/null || TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$pod_timestamp" +%s 2>/dev/null)
[[ -z "$pod_epoch" ]] && continue
pod_age=$((current_epoch - pod_epoch))
if [[ "$pod_age" -lt 3600 ]]; then ((pods_lt_1h++))
elif [[ "$pod_age" -lt 21600 ]]; then ((pods_1h_6h++))
elif [[ "$pod_age" -lt 86400 ]]; then ((pods_6h_24h++))
else ((pods_gt_24h++)); fi
done < <(kc get jobs -o json 2>/dev/null | jq -r '.items[] | select(.status.active > 0) | .metadata.name + " " + .metadata.creationTimestamp' | grep -E '^joshua-(rhel9-)?agent-' || true)

echo ""
echo -e "${BOLD}Pod Ages${NC}"
printf " <1h: %s 1-6h: %s 6-24h: %s >24h: %s\n" \
"$pods_lt_1h" "$pods_1h_6h" \
"$([ "$pods_6h_24h" -gt 0 ] && echo -e "${YELLOW}$pods_6h_24h${NC}" || echo "$pods_6h_24h")" \
"$([ "$pods_gt_24h" -gt 0 ] && echo -e "${RED}$pods_gt_24h${NC}" || echo "$pods_gt_24h")"

# Problems section - only show if there are issues
local problems=""

# High restarts
local high_restarts
high_restarts=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-")) | select(.status.containerStatuses[]?.restartCount > 3) | .metadata.name + "(" + (.status.containerStatuses[0].restartCount|tostring) + ")"' | head -3 | tr '\n' ' ')
[ -n "$high_restarts" ] && problems+=" ${YELLOW}High restarts:${NC} $high_restarts\n"

# OOMKilled
local oom
oom=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-")) | select(.status.containerStatuses[]?.lastState.terminated.reason == "OOMKilled") | .metadata.name' | head -3 | tr '\n' ' ')
[ -n "$oom" ] && problems+=" ${RED}OOMKilled:${NC} $oom\n"

# CrashLoop
local crash
crash=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-")) | select(.status.containerStatuses[]?.state.waiting.reason == "CrashLoopBackOff") | .metadata.name' | head -3 | tr '\n' ' ')
[ -n "$crash" ] && problems+=" ${RED}CrashLoop:${NC} $crash\n"

# Failed jobs (agent jobs with a Failed=True condition)
local failed_recent=0 failed_old=0
while read -r _ job_timestamp; do
[[ -z "$job_timestamp" ]] && continue
job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null || TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$job_timestamp" +%s 2>/dev/null)
[[ -z "$job_epoch" ]] && continue
if [[ $((current_epoch - job_epoch)) -lt 86400 ]]; then ((failed_recent++)); else ((failed_old++)); fi
done < <(kc get jobs -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' | grep -E '^joshua-(rhel9-)?agent-' || true)

[ "$failed_recent" -gt 0 ] && problems+=" ${YELLOW}Failed <1d:${NC} $failed_recent (kept for debug)\n"
[ "$failed_old" -gt 0 ] && problems+=" ${RED}Failed >1d:${NC} $failed_old (should cleanup)\n"

# Completed pods not cleaned up
local total_done=$((c_done + r_done))
[ "$total_done" -gt 100 ] && problems+=" ${YELLOW}Completed pods:${NC} $total_done (cleanup not running?)\n"

# Pending pods
if [ $((c_pend + r_pend)) -gt 0 ]; then
local pend_reasons
pend_reasons=$(kc get pods --field-selector=status.phase=Pending -o json 2>/dev/null | \
jq -r '.items[] | select(.metadata.name | test("^joshua-")) | (.status.conditions[]? | select(.type=="PodScheduled" and .status=="False") | .reason) // "Unknown"' | \
sort | uniq -c | sort -rn | head -3 | awk '{print $2"("$1")"}' | tr '\n' ' ')
[ -n "$pend_reasons" ] && problems+=" ${YELLOW}Pending:${NC} $pend_reasons\n"
fi

if [ -n "$problems" ]; then
echo ""
echo -e "${BOLD}Problems${NC}"
echo -e "$problems"
fi

# Active ensembles - query both scalers
local script_dir joshua_py ensembles1 ensembles2
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
joshua_py=$(find "$script_dir" -name "joshua.py" -path "*/joshua/joshua.py" -print -quit 2>/dev/null)

echo ""
echo -e "${BOLD}Ensembles${NC}"

if [ -z "$joshua_py" ]; then
echo -e " ${RED}joshua.py not found in $script_dir${NC}"
elif [ -z "$scaler1_pod" ] && [ -z "$scaler2_pod" ]; then
echo -e " ${RED}No scaler pods available${NC}"
else
# Try the centos7 scaler first. The sed patch stubs out the lxml import
# and flattens the package-relative joshua_model import so joshua.py can
# run standalone from /tmp inside the pod (PYTHONPATH=/tools supplies joshua_model).
if [ -n "$scaler1_pod" ]; then
sed -e 's/import lxml.etree as le/le = None/' \
-e 's/from \. import joshua_model/import joshua_model/' \
"$joshua_py" | kc exec -i "$scaler1_pod" -- tee /tmp/joshua.py > /dev/null 2>&1
ensembles1=$(kc exec -i "$scaler1_pod" -- env PYTHONPATH=/tools python3 /tmp/joshua.py -C /etc/foundationdb/fdb.cluster list 2>&1 | head -15)
fi

# Then the rhel9 scaler, with the same patch
if [ -n "$scaler2_pod" ]; then
sed -e 's/import lxml.etree as le/le = None/' \
-e 's/from \. import joshua_model/import joshua_model/' \
"$joshua_py" | kc exec -i "$scaler2_pod" -- tee /tmp/joshua.py > /dev/null 2>&1
ensembles2=$(kc exec -i "$scaler2_pod" -- env PYTHONPATH=/tools python3 /tmp/joshua.py -C /etc/foundationdb/fdb.cluster list 2>&1 | head -15)
fi

# Show ensembles from both scalers
if [ -n "$scaler1_pod" ]; then
echo -e " ${DIM}centos7:${NC}"
if echo "$ensembles1" | grep -qE '^[[:space:]]+[0-9]{8}-'; then
echo "$ensembles1" | grep -E '^[[:space:]]+[0-9]{8}-' | while read -r line; do echo " $line"; done
else
echo -e " ${DIM}(none)${NC}"
fi
fi

if [ -n "$scaler2_pod" ]; then
echo -e " ${DIM}rhel9:${NC}"
if echo "$ensembles2" | grep -qE '^[[:space:]]+[0-9]{8}-'; then
echo "$ensembles2" | grep -E '^[[:space:]]+[0-9]{8}-' | while read -r line; do echo " $line"; done
else
echo -e " ${DIM}(none)${NC}"
fi
fi
fi

# Node distribution - compact
local node_dist
node_dist=$(kc get pods -o json 2>/dev/null | jq -r '.items[] | select(.metadata.name | test("^joshua-(rhel9-)?agent-")) | select(.status.phase == "Running") | .spec.nodeName' | sort | uniq -c | sort -rn | head -5)

if [ -n "$node_dist" ]; then
echo ""
echo -e "${BOLD}Nodes${NC} ${DIM}(top 5)${NC}"
echo "$node_dist" | while read -r count node; do
if [ "$count" -gt 50 ]; then
printf " ${YELLOW}%4s${NC} %s\n" "$count" "$node"
else
printf " %4s %s\n" "$count" "$node"
fi
done
fi

if [ "$WATCH_MODE" = true ]; then
echo ""
echo -e "${DIM}Refresh in 30s (Ctrl+C to exit)${NC}"
fi
}

if [ "$WATCH_MODE" = true ]; then
while true; do show_dashboard; sleep 30; done
else
show_dashboard
fi