-
Notifications
You must be signed in to change notification settings - Fork 1
212 lines (196 loc) · 9.22 KB
/
gpu-ci.yml
File metadata and controls
212 lines (196 loc) · 9.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# GPU continuous-integration workflow; triggered by every push.
name: "GPU CI"

on:
  push:

jobs:
  gpu-tests:
    # Runs are grouped per workflow and ref. A newer run does not cancel the run
    # that is already in progress, and different refs use different groups, so
    # branches are not serialized against each other.
    concurrency:
      group: "gpu-ci-${{ github.workflow }}-${{ github.ref }}"
      cancel-in-progress: false
    runs-on: self-hosted
    # Upper bound for the whole job, in minutes.
    timeout-minutes: 240
    steps:
      - uses: actions/checkout@v4
# Extract the HYPRE version from CMakeLists.txt so the cache key stays in sync with it.
# Python is used for portability (grep -P isn't guaranteed on all distros).
- name: Extract HYPRE version
  id: hypre_version
  # An explicit `shell: bash` makes the runner invoke `bash -eo pipefail`.
  # With the implicit default (`bash -e`, no pipefail) the status of the
  # pipeline below is tee's, so a failed extraction would be silently ignored
  # and later steps would see an empty `version` output.
  shell: bash
  run: |
    python3 - <<'PY' | tee -a "$GITHUB_OUTPUT"
    import pathlib
    import re
    import sys

    text = pathlib.Path("CMakeLists.txt").read_text()
    # First `GIT_TAG v<dotted digits>` occurrence in the file.
    m = re.search(r"GIT_TAG\s+v(\d+(?:\.\d+)*)", text)
    if not m:
        raise SystemExit("ERROR: Could not extract HYPRE version from CMakeLists.txt")
    # stdout is appended to $GITHUB_OUTPUT by tee, so only the key=value line
    # goes there; the human-readable message goes to stderr.
    print(f"version={m.group(1)}")
    print(f"Detected HYPRE version: {m.group(1)}", file=sys.stderr)
    PY
# Pre-create both dependency directories (avoids warnings on a cold start,
# when nothing has been built or restored yet).
- name: Ensure cache directories exist
  run: |
    mkdir -p build_cpu_hypre/_deps build_gpu_hypre/_deps
# HYPRE is slow to compile and rarely changes, so its build tree is restored from cache.
# The key combines the HYPRE version with a hash of the build configuration files;
# restore-keys allows falling back to any cache for the same HYPRE version.
# Only the *_hypre directories are cached - ci.sh uses these when HYPRE is enabled.
- id: hypre_cache
  name: Restore HYPRE cache
  uses: actions/cache/restore@v4
  with:
    key: "hypre-v${{ steps.hypre_version.outputs.version }}-h200-cc90-${{ hashFiles('CMakeLists.txt', 'cmake/**', '.github/scripts/**') }}"
    restore-keys: |
      hypre-v${{ steps.hypre_version.outputs.version }}-h200-cc90-
    path: |
      build_cpu_hypre/_deps
      build_gpu_hypre/_deps
# The hypre-subbuild directories hold a CMakeCache.txt with absolute paths, which
# break when the cache is restored on a different runner, so they are dropped
# right after the restore. hypre-build is left untouched.
- name: Clean HYPRE subbuild cache (path-sensitive)
  run: |
    echo "Cleaning hypre-subbuild directories (contain absolute paths)..."
    for deps_dir in build_cpu_hypre/_deps build_gpu_hypre/_deps; do
      subbuild="${deps_dir}/hypre-subbuild"
      # Nothing to remove for this build tree.
      if [ ! -d "${subbuild}" ]; then
        continue
      fi
      echo "  Removing ${subbuild}"
      rm -rf "${subbuild}"
    done
    echo "Done - hypre-build (compiled libs) preserved, hypre-subbuild (CMake config) cleared"
- name: Submit GPU correctness suite to Slurm (H200, 1 GPU)
  run: |
    # All per-run files share this prefix; the run id keeps them unique per workflow run.
    scripts_dir="./.github/scripts"
    job_prefix="${GITHUB_WORKSPACE}/gpu_ci_correctness_${GITHUB_RUN_ID}"
    # NOTE(review): the argument order is positional; presumably template, work
    # directory, stdout file, stderr file, generated sbatch file - confirm
    # against submit_and_monitor_slurm.sh.
    "${scripts_dir}/submit_and_monitor_slurm.sh" \
      "${scripts_dir}/gpu_ci_correctness.sbatch.template" \
      "${GITHUB_WORKSPACE}" \
      "${job_prefix}.out" \
      "${job_prefix}.err" \
      "${job_prefix}.sbatch"
# Print the tail of the build log whether or not earlier steps succeeded.
- name: Show Build Output
  if: always()
  run: |
    build_log="${GITHUB_WORKSPACE}/gpu_ci_build.log"
    echo "==================================================================="
    echo " GPU CI Build Log (last 300 lines)"
    echo "==================================================================="
    echo ""
    if [ ! -f "${build_log}" ]; then
      echo "Build log not found (build may have failed early)"
      echo "Check the Slurm output for details."
    else
      tail -n 300 "${build_log}"
    fi
# Print the tail of the test log whether or not earlier steps succeeded.
- name: Show Test Output
  if: always()
  run: |
    test_log="${GITHUB_WORKSPACE}/gpu_ci_test.log"
    echo "==================================================================="
    echo " GPU CI Test Log (last 300 lines)"
    echo "==================================================================="
    echo ""
    if [ ! -f "${test_log}" ]; then
      echo "Test log not found (tests may not have run)"
      echo "Check the build log and Slurm output for details."
    else
      tail -n 300 "${test_log}"
    fi
- name: Submit GPU performance suite to Slurm (H200, 1 GPU)
  run: |
    # All per-run files share this prefix; the run id keeps them unique per workflow run.
    scripts_dir="./.github/scripts"
    job_prefix="${GITHUB_WORKSPACE}/gpu_ci_perf_${GITHUB_RUN_ID}"
    # NOTE(review): the argument order is positional; presumably template, work
    # directory, stdout file, stderr file, generated sbatch file - confirm
    # against submit_and_monitor_slurm.sh.
    "${scripts_dir}/submit_and_monitor_slurm.sh" \
      "${scripts_dir}/gpu_ci_perf.sbatch.template" \
      "${GITHUB_WORKSPACE}" \
      "${job_prefix}.out" \
      "${job_prefix}.err" \
      "${job_prefix}.sbatch"
# Only runs when an earlier step failed: dump the tail of each Slurm stdout/stderr file.
- name: Show Slurm Output (on failure)
  if: failure()
  run: |
    echo "==================================================================="
    echo " Slurm Job Output (last 200 lines each)"
    echo "==================================================================="
    # Visits correctness .out/.err first, then perf .out/.err.
    for suite in correctness perf; do
      for stream in out err; do
        f="${GITHUB_WORKSPACE}/gpu_ci_${suite}_${GITHUB_RUN_ID}.${stream}"
        # Files that were never produced are skipped silently.
        [ -f "$f" ] || continue
        echo ""
        echo "===== $f ====="
        tail -n 200 "$f"
      done
    done
# Only runs when an earlier step failed: print the commit, branch and the state
# of the build and cache directories.
- name: Debug info on failure
  if: failure()
  run: |
    # Long-list a directory, or print the given note when it cannot be listed.
    list_or_note() {
      ls -la "$1" 2>/dev/null || echo "$2"
    }

    echo "=== Debug Information ==="
    printf 'Commit: %s\n' "$(git rev-parse HEAD)"
    printf 'Branch: %s\n' "$(git rev-parse --abbrev-ref HEAD)"
    echo ""
    echo "=== Build directories ==="
    list_or_note build_gpu_hypre "No build_gpu_hypre directory"
    list_or_note build_cpu_hypre "No build_cpu_hypre directory"
    echo ""
    echo "=== HYPRE cache status ==="
    list_or_note build_gpu_hypre/_deps "No GPU HYPRE cache"
    list_or_note build_cpu_hypre/_deps "No CPU HYPRE cache"
    echo ""
    echo "=== Test binaries ==="
    ls -lh build_gpu_hypre/test_* 2>/dev/null || echo "No test binaries found"
# Decide whether the HYPRE cache may be saved. The cache is independent of test
# results - it is only unusable if the HYPRE build did not finish - so this runs
# even when earlier steps failed, but not after an exact cache hit.
# Writes `save_cache=true|false` to the step outputs; the save step reads it.
- name: Save HYPRE cache
  id: check_hypre
  continue-on-error: true
  if: always() && steps.hypre_cache.outputs.cache-hit != 'true'
  run: |
    # Only save if HYPRE build artifacts exist (proof that build completed).
    # Require at least one libHYPRE to be found (prevents empty cache on cold start).
    HYPRE_OK=true
    FOUND_LIB=false
    for dir in build_cpu_hypre build_gpu_hypre; do
      # A build tree without hypre-build was simply not built; it is not an error.
      if [ -d "${dir}/_deps/hypre-build" ]; then
        # Check for libHYPRE (the key artifact) using ls's own exit status.
        # Piping ls into `head` would always report success here, because
        # without pipefail a pipeline returns the status of its last command.
        if ls "${dir}"/_deps/hypre-build/src/libHYPRE* > /dev/null 2>&1; then
          FOUND_LIB=true
        else
          echo "WARNING: ${dir} HYPRE build incomplete (no libHYPRE found)"
          HYPRE_OK=false
        fi
      fi
    done
    # Must find at least one lib AND no incomplete builds
    if [ "$HYPRE_OK" = "true" ] && [ "$FOUND_LIB" = "true" ]; then
      echo "HYPRE builds complete - proceeding with cache save"
      echo "save_cache=true" >> "$GITHUB_OUTPUT"
    else
      echo "HYPRE builds incomplete or missing - skipping cache save"
      echo "save_cache=false" >> "$GITHUB_OUTPUT"
    fi
# Save the HYPRE dependency directories when there was no exact cache hit and the
# artifact check step (id: check_hypre) reported save_cache == 'true'.
# A failure to save must not fail the job, hence continue-on-error.
- name: Actually save HYPRE cache
  continue-on-error: true
  if: always() && steps.hypre_cache.outputs.cache-hit != 'true' && steps.check_hypre.outputs.save_cache == 'true'
  uses: actions/cache/save@v4
  with:
    path: |
      build_cpu_hypre/_deps
      build_gpu_hypre/_deps
    # Reuse the exact key the restore step looked up instead of rebuilding it.
    # Re-evaluating hashFiles() here, after the build has run, could yield a
    # different hash if anything under the hashed paths changed, and the
    # saved entry would then never match a later restore.
    key: ${{ steps.hypre_cache.outputs.cache-primary-key }}
# Always runs last: removes per-run Slurm files and logs after a successful job,
# keeps them after a failure, and trims the build trees down to their _deps
# directories.
- name: Cleanup
  if: always()
  run: |
    # On failure, preserve logs for debugging; on success, clean everything
    if [ "${{ job.status }}" = "success" ]; then
      # Per-suite Slurm files: generated sbatch script, stdout, stderr
      for suite in correctness perf; do
        for ext in sbatch out err; do
          rm -f "${GITHUB_WORKSPACE}/gpu_ci_${suite}_${GITHUB_RUN_ID}.${ext}" || true
        done
      done
      # Build and test logs
      rm -f "${GITHUB_WORKSPACE}/gpu_ci_build.log" || true
      rm -f "${GITHUB_WORKSPACE}/gpu_ci_test.log" || true
    else
      echo "=== Preserving logs for debugging (job status: ${{ job.status }}) ==="
      ls -la "${GITHUB_WORKSPACE}"/gpu_ci_*.{sbatch,out,err,log} 2>/dev/null || true
    fi
    # Clean build artifacts but keep _deps for cache (only *_hypre dirs are cached)
    for build_dir in build_gpu_hypre build_cpu_hypre; do
      [ -d "${GITHUB_WORKSPACE}/${build_dir}" ] || continue
      # Delete every top-level entry except _deps
      find "${GITHUB_WORKSPACE}/${build_dir}" -mindepth 1 -maxdepth 1 ! -name '_deps' -exec rm -rf {} +
    done
    # Fully remove non-hypre build dirs (not cached)
    rm -rf "${GITHUB_WORKSPACE}/build_gpu" "${GITHUB_WORKSPACE}/build_cpu" || true