-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvalidate.sh
More file actions
executable file
·159 lines (139 loc) · 5.25 KB
/
validate.sh
File metadata and controls
executable file
·159 lines (139 loc) · 5.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env bash
# validate.sh — run all scientific HPC application cluster tests.
#
# Usage:
# ./validate.sh # test all apps (grayscott first, then others)
# ./validate.sh grayscott # test a single app
# ./validate.sh grayscott lammps # test specific apps
#
# Each app is tested in two phases:
# 1. Single-node: 4 MPI ranks on one container (--oversubscribe)
# 2. Cluster: head + 2 workers on separate containers, MPI over SSH
#
# Exit code: 0 if all tests pass, 1 if any fail.
# Strict-ish mode: unset variables are errors and a pipeline fails if any stage
# fails. `-e` is not enabled; failing steps are counted through fail() and the
# run carries on.
set -uo pipefail

# Running totals, updated by pass() and fail().
FAIL=0
PASS=0

# ANSI colour codes, kept as literal backslash sequences and expanded only at
# print time.
NC='\033[0m'
BOLD='\033[1m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Absolute path of the directory that contains this script.
REPO=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# pass — report a passing check and bump the PASS counter.
# Globals:   GREEN, NC (read); PASS (incremented)
# Arguments: $* — description of the check
# Outputs:   "[PASS] <description>" on stdout
# Returns:   always 0
pass() {
  # Only the colour codes are expanded with %b; the description goes through %s
  # so any backslashes it contains are printed literally instead of being
  # interpreted as escape sequences.
  printf '%b[PASS]%b %s\n' "${GREEN:-}" "${NC:-}" "$*"
  PASS=$(( ${PASS:-0} + 1 ))
}
# fail — report a failing check and bump the FAIL counter.
# Globals:   RED, NC (read); FAIL (incremented)
# Arguments: $* — description of the check
# Outputs:   "[FAIL] <description>" on stdout
# Returns:   always 0
fail() {
  # Only the colour codes are expanded with %b; the description goes through %s
  # so any backslashes it contains (it may carry raw command-line text) are
  # printed literally instead of being interpreted as escape sequences.
  printf '%b[FAIL]%b %s\n' "${RED:-}" "${NC:-}" "$*"
  FAIL=$(( ${FAIL:-0} + 1 ))
}
# banner — print a section heading: a blank line, then the title framed by a
# rule above and below.
# Globals:   BOLD, NC (read)
# Arguments: $1 — title text
# Outputs:   four lines on stdout
banner() {
  local rule='════════════════════════════════════════'

  printf '\n'
  printf '%b\n' "${BOLD}${rule}${NC}"
  printf '%b\n' "${BOLD} $1${NC}"
  printf '%b\n' "${BOLD}${rule}${NC}"
}
# ── Build the shared base image ────────────────────────────────────────────
# Builds the sci-hpc-base image from $REPO/base. If the build fails the script
# stops here with exit status 1; nothing else is attempted.
banner "Building sci-hpc-base"
docker build -t sci-hpc-base "$REPO/base" 2>&1 || {
  echo -e "${RED}Base image build failed — cannot continue.${NC}"
  exit 1
}
echo -e "${GREEN}sci-hpc-base built.${NC}"
# ── Generic MPI-app test function ──────────────────────────────────────────
# Builds the compose project in $REPO/<app> and runs two checks against it:
#   1. single-node: `docker compose run --rm validate`
#   2. cluster:     `docker compose up ... head`, using head's exit code
# Each check is recorded with pass()/fail(); a failed build records one failure
# and skips both checks.
#
# Args: <app> <build_timeout_s> <run_timeout_s>
#   app              directory name under $REPO holding the compose project
#   build_timeout_s  limit for the image build (default 3600)
#   run_timeout_s    limit for each of the two checks (default 300)
# Globals: REPO (read); PASS / FAIL (via pass / fail)
# Side effect: the working directory is $REPO when the function finishes,
#              except when the app directory could not be entered (it is then
#              left unchanged).
test_mpi_app() {
  local app=$1
  local build_timeout=${2:-3600}
  local run_timeout=${3:-300}

  banner "Testing $app"

  # Guard the cd: if it failed silently, every docker compose command below
  # (including `down -v`) would act on whatever project lives in the current
  # directory instead of the requested app.
  if ! cd "$REPO/$app"; then
    fail "$app: cannot enter $REPO/$app"
    return
  fi

  # Build
  echo "→ Building sci-${app}..."
  if ! timeout "$build_timeout" docker compose build 2>&1; then
    fail "$app: image build"
    cd "$REPO" || return
    return
  fi
  echo -e "${GREEN}Build OK${NC}"

  # Single-node
  echo "→ Single-node test (4 MPI ranks, 1 container)..."
  # Best-effort cleanup of leftovers from an earlier run; errors are ignored.
  docker compose down -v 2>/dev/null || true
  if timeout "$run_timeout" docker compose run --rm validate 2>&1; then
    pass "$app: single-node"
  else
    fail "$app: single-node"
  fi

  # Cluster (multi-node)
  echo "→ Cluster test (head + worker1 + worker2)..."
  docker compose down -v 2>/dev/null || true
  if timeout "$run_timeout" docker compose up \
      --abort-on-container-exit \
      --exit-code-from head \
      head 2>&1; then
    pass "$app: cluster (multi-node)"
  else
    fail "$app: cluster (multi-node)"
  fi

  # Tear the project down (containers and volumes), then go back to the repo root.
  docker compose down -v 2>/dev/null || true
  cd "$REPO" || return
}
# ── AI Training test (torchrun rendezvous, not MPI+SSH) ────────────────────
# Builds the compose project in $REPO/ai_training and runs two checks:
#   1. single-node: `docker compose run --rm validate`
#   2. cluster:     `docker compose up ... node0 node1 node2`, using node0's
#                   exit code
# Each check is recorded with pass()/fail(); a failed build records one failure
# and skips both checks.
#
# Args: [run_timeout_s] [build_timeout_s]
#   run_timeout_s    limit for each of the two checks (default 300)
#   build_timeout_s  limit for the image build (default 1800)
# Globals: REPO (read); PASS / FAIL (via pass / fail)
# Side effect: the working directory is $REPO when the function finishes,
#              except when the ai_training directory could not be entered (it
#              is then left unchanged).
test_ai_training() {
  local run_timeout=${1:-300}
  local build_timeout=${2:-1800}

  banner "Testing ai_training"

  # Guard the cd: if it failed silently, every docker compose command below
  # (including `down -v`) would act on whatever project lives in the current
  # directory instead of ai_training.
  if ! cd "$REPO/ai_training"; then
    fail "ai_training: cannot enter $REPO/ai_training"
    return
  fi

  echo "→ Building sci-ai-training..."
  if ! timeout "$build_timeout" docker compose build 2>&1; then
    fail "ai_training: image build"
    cd "$REPO" || return
    return
  fi
  echo -e "${GREEN}Build OK${NC}"

  # Single-node
  echo "→ Single-node test..."
  # Best-effort cleanup of leftovers from an earlier run; errors are ignored.
  docker compose down -v 2>/dev/null || true
  if timeout "$run_timeout" docker compose run --rm validate 2>&1; then
    pass "ai_training: single-node"
  else
    fail "ai_training: single-node"
  fi

  # Multi-node (all 3 nodes start together via torchrun)
  echo "→ Cluster test (3 torchrun nodes)..."
  docker compose down -v 2>/dev/null || true
  if timeout "$run_timeout" docker compose up \
      --abort-on-container-exit \
      --exit-code-from node0 \
      node0 node1 node2 2>&1; then
    pass "ai_training: cluster (multi-node)"
  else
    fail "ai_training: cluster (multi-node)"
  fi

  # Tear the project down (containers and volumes), then go back to the repo root.
  docker compose down -v 2>/dev/null || true
  cd "$REPO" || return
}
# ── Select apps to test ────────────────────────────────────────────────────
# Start from the default list (fast builds first); any command-line arguments
# replace it wholesale.
APPS=(grayscott ai_training lammps vpic nyx warpx)
if (( $# > 0 )); then
  APPS=("$@")
fi

# Dispatch every requested app with its build/run timeouts (seconds). A name
# with no matching arm is reported and counted as a failure.
for app in "${APPS[@]}"; do
  case "$app" in
    grayscott)
      test_mpi_app "$app" 600 120
      ;;
    lammps | vpic)
      test_mpi_app "$app" 3600 300
      ;;
    nyx | warpx)
      test_mpi_app "$app" 7200 300
      ;;
    ai_training)
      test_ai_training 300
      ;;
    *)
      echo "Unknown app: $app"
      fail "unknown: $app"
      ;;
  esac
done
# ── Summary ────────────────────────────────────────────────────────────────
# Print the totals and turn them into the script's exit status:
# 0 when FAIL is zero, 1 otherwise.
banner "Results"
printf '%b\n' " ${GREEN}Passed${NC}: $PASS"
printf '%b\n' " ${RED}Failed${NC}: $FAIL"
printf '\n'

if ! [ "$FAIL" -eq 0 ]; then
  printf '%b\n' "${RED}${BOLD}$FAIL test(s) failed.${NC}"
  exit 1
fi

printf '%b\n' "${GREEN}${BOLD}All tests passed!${NC}"
exit 0