Skip to content

Commit f6045fa

Browse files
[None][chore] Fix Gitlab CI termination issues (NVIDIA#10576)

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Co-authored-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
1 parent f6c4dd8 commit f6045fa

9 files changed

Lines changed: 278 additions & 82 deletions

File tree

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
#!/bin/bash
# cleanup_jobs.sh - Cancel all SLURM jobs tracked in jobs.txt
#
# This script is designed to run in GitLab CI after_script to ensure
# all SLURM jobs are cancelled when the pipeline is interrupted, cancelled,
# or times out.
#
# Usage:
#   bash cleanup_jobs.sh
#
# Environment variables:
#   OUTPUT_PATH: Directory containing jobs.txt and pytest.pid

# -e: abort on unhandled errors; -u: error on unset vars;
# pipefail: a pipeline fails if any stage fails (the original only set -e,
# so a failing `sort` in `sort | grep` went unnoticed).
set -euo pipefail

OUTPUT_PATH="${OUTPUT_PATH:-/tmp}"
JOBS_FILE="${OUTPUT_PATH}/jobs.txt"
PID_FILE="${OUTPUT_PATH}/pytest.pid"

echo "=========================================="
echo "SLURM Job Cleanup Script"
echo "=========================================="
echo "Output path: $OUTPUT_PATH"
echo ""

# Show pytest PID if available (for debugging)
if [ -f "$PID_FILE" ]; then
    # Read the PID directly; no need for `cat | tr` (useless use of cat)
    PYTEST_PID=$(tr -d '\n' < "$PID_FILE")
    echo "Pytest PID: $PYTEST_PID"

    # kill -0 sends no signal; it only probes whether the process exists
    if kill -0 "$PYTEST_PID" 2>/dev/null; then
        echo "Status: Still running"
    else
        echo "Status: Already terminated"
    fi
    echo ""
else
    echo "No pytest.pid found (test may not have started)"
    echo ""
fi

# Check if jobs.txt exists
if [ ! -f "$JOBS_FILE" ]; then
    echo "[WARN] No jobs.txt found"
    echo "       Nothing to cancel"
    echo "=========================================="
    exit 0
fi

echo "[INFO] Reading jobs from: $JOBS_FILE"

# Read, deduplicate, and filter empty lines.
# `|| true` keeps set -e from aborting when every line is empty
# (grep -v exits non-zero when it emits nothing).
JOBS=$(sort -u "$JOBS_FILE" | grep -v '^$' || true)

if [ -z "$JOBS" ]; then
    echo "[WARN] jobs.txt is empty"
    echo "       Nothing to cancel"
    echo "=========================================="
    exit 0
fi

# Here-string instead of `echo | wc -l`; avoids an extra pipeline stage
JOB_COUNT=$(wc -l <<< "$JOBS")
echo "Found $JOB_COUNT job(s) to cancel"
echo ""

# Cancel each job
CANCELLED=0
ALREADY_DONE=0
FAILED=0

echo "Cancelling jobs..."
while IFS= read -r job_id; do
    if [ -n "$job_id" ]; then
        printf "  %-12s ... " "$job_id"

        # Try to cancel the job
        if scancel "$job_id" 2>/dev/null; then
            echo "[OK] Cancelled"
            CANCELLED=$((CANCELLED + 1))
        else
            # scancel failed: distinguish "still queued/running" from
            # "already finished". squeue -j already filters to this job, so
            # test for non-empty output instead of the original unanchored
            # `grep -q "$job_id"`, which substring-matched other job IDs
            # (e.g. job 123 matched a line for job 1234).
            if [ -n "$(squeue -j "$job_id" -h 2>/dev/null)" ]; then
                echo "[FAIL] Failed to cancel"
                FAILED=$((FAILED + 1))
            else
                echo "[SKIP] Already finished"
                ALREADY_DONE=$((ALREADY_DONE + 1))
            fi
        fi
    fi
done <<< "$JOBS"

echo ""
echo "=========================================="
echo "[DONE] Cleanup completed"
echo "       Total:        $JOB_COUNT"
echo "       Cancelled:    $CANCELLED"
echo "       Already done: $ALREADY_DONE"
echo "       Failed:       $FAILED"
echo "=========================================="

# Exit with error if any cancellation actually failed
if [ "$FAILED" -gt 0 ]; then
    exit 1
fi

exit 0

tests/integration/defs/perf/disagg/conftest.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ def __init__(self, batch_size=5):
151151

152152
self.submitted_batches = set() # Track which batch numbers have been submitted
153153
self.job_mapping = {} # Map test_id -> SLURM job_id
154+
self.submit_errors = {} # Map test_id -> error message (validation/submission failures)
154155
self.all_configs = [] # Ordered list of all test configs
155156

156157
logger.info(f"\n{'=' * 70}")
@@ -214,6 +215,8 @@ def _submit_batch(self, batch_num):
214215
batch_num: Batch number to submit (0-indexed)
215216
"""
216217
from execution.executor import JobManager
218+
from utils.config_validator import ConfigValidator
219+
from utils.job_tracker import JobTracker
217220

218221
# Calculate batch range
219222
if self.batch_size:
@@ -230,33 +233,56 @@ def _submit_batch(self, batch_num):
230233
logger.info(f"Range: [{start_idx}:{end_idx}] ({len(batch_configs)} jobs)")
231234
logger.info(f"{'=' * 70}\n")
232235

233-
# Submit all jobs in this batch
236+
# Pre-validate all configs before submission
237+
logger.info("Pre-validating configurations...")
238+
valid_configs = []
239+
for config in batch_configs:
240+
try:
241+
ConfigValidator.validate_test_config(config)
242+
valid_configs.append(config)
243+
except Exception as e:
244+
# Validation failed - mark as None and record error
245+
self.job_mapping[config.test_id] = None
246+
self.submit_errors[config.test_id] = f"Validation failed: {str(e)}"
247+
logger.error(f" [FAILED] Validation failed: {config.test_id}")
248+
logger.error(f" Error: {str(e)[:100]}")
249+
250+
logger.info(
251+
f"Validation complete: {len(valid_configs)}/{len(batch_configs)} configs valid\n"
252+
)
253+
254+
# Submit only valid configs
234255
success_count = 0
235-
for i, config in enumerate(batch_configs, 1):
256+
for i, config in enumerate(valid_configs, 1):
236257
try:
237258
success, job_id = JobManager.submit_test_job(config)
238259
if success and job_id:
239260
self.job_mapping[config.test_id] = job_id
261+
JobTracker.record_job(job_id) # Record job ID for cleanup
240262
success_count += 1
241-
# Truncate test_id for display
242-
display_id = (
243-
config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id
263+
logger.success(
264+
f" [{i:3d}/{len(valid_configs)}] Job {job_id} <- {config.test_id}"
244265
)
245-
logger.success(f" [{i:3d}/{len(batch_configs)}] Job {job_id} <- {display_id}")
246266
else:
267+
# Submission failed - mark as None and record error
247268
self.job_mapping[config.test_id] = None
248-
logger.error(f" [{i:3d}/{len(batch_configs)}] Failed: {config.test_id[:50]}")
269+
self.submit_errors[config.test_id] = f"Job submission failed: {job_id}"
270+
logger.error(f" [{i:3d}/{len(valid_configs)}] Failed: {config.test_id}")
249271
except Exception as e:
272+
# Submission exception - mark as None and record error
250273
self.job_mapping[config.test_id] = None
251-
logger.error(f" [{i:3d}/{len(batch_configs)}] Error: {e}")
274+
self.submit_errors[config.test_id] = f"Submission exception: {str(e)}"
275+
logger.error(f" [{i:3d}/{len(valid_configs)}] Error: {e}")
252276

253277
# Mark batch as submitted
254278
self.submitted_batches.add(batch_num)
255279

256280
logger.info(f"\n{'=' * 70}")
257281
logger.success(
258-
f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded"
282+
f"Batch {batch_num} Complete: {success_count}/{len(valid_configs)} submitted successfully"
259283
)
284+
if len(valid_configs) < len(batch_configs):
285+
logger.warning(f"Skipped {len(batch_configs) - len(valid_configs)} invalid config(s)")
260286
logger.info(f"{'=' * 70}\n")
261287

262288

tests/integration/defs/perf/disagg/execution/executor.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -271,21 +271,26 @@ def submit_test_job(test_config) -> tuple:
271271

272272
@staticmethod
273273
def backup_logs(
274-
job_id: str,
274+
job_id: Optional[str],
275275
test_config,
276276
result_dir: str,
277277
is_passed: bool,
278278
) -> Optional[str]:
279279
"""Backup logs and config files to test_id directory.
280280
281281
Args:
282-
job_id: SLURM job ID
282+
job_id: SLURM job ID (None if submission failed)
283283
test_config: TestConfig object
284284
result_dir: Result directory path (already named as test_id)
285285
is_passed: Whether the job passed
286286
Returns:
287287
Final directory path if successful, None otherwise
288288
"""
289+
if job_id is None:
290+
logger.warning(f"Job submission failed for {test_config.test_id}")
291+
else:
292+
logger.info(f"Backing up logs for job {job_id} ({test_config.test_id})")
293+
289294
if not os.path.exists(result_dir):
290295
logger.warning(f"Result directory does not exist yet: {result_dir}")
291296
return None

tests/integration/defs/perf/disagg/reporting/accuracy_validator.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,13 @@ def __post_init__(self) -> None:
9292
# Dataset default parameters for hypothesis testing
9393
# Extracted from accuracy_core.py AccuracyTask subclasses
9494
DATASET_DEFAULTS = {
95+
"aime25": {
96+
"alpha": 0.05,
97+
"beta": 0.2,
98+
"sigma": 50,
99+
"num_samples": 30, # AIME 2025 full sample size
100+
"higher_is_better": True,
101+
},
95102
"gsm8k": {
96103
"alpha": 0.05,
97104
"beta": 0.2,
@@ -127,6 +134,14 @@ def __post_init__(self) -> None:
127134
"num_samples": 198,
128135
"higher_is_better": True,
129136
},
137+
# Alias for gpqa_diamond (same task, different naming convention)
138+
"gpqa_diamond_cot_zeroshot": {
139+
"alpha": 0.05,
140+
"beta": 0.2,
141+
"sigma": 50,
142+
"num_samples": 198,
143+
"higher_is_better": True,
144+
},
130145
"json_mode_eval": {
131146
"alpha": 0.05,
132147
"beta": 0.2,

tests/integration/defs/perf/disagg/session_collect.sh

Lines changed: 8 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -22,44 +22,18 @@ cd "$WORK_DIR"
2222
python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
2323
echo "System information collection completed"
2424

25-
# Step 2: Handle different installation modes
26-
echo ""
27-
echo "Step 2: Installing TensorRT-LLM..."
25+
# Step 2: Collect TensorRT-LLM version information (only for none mode)
2826
if [ "$INSTALL_MODE" = "none" ]; then
29-
echo "Using built-in TensorRT-LLM, skipping installation"
30-
31-
elif [ "$INSTALL_MODE" = "wheel" ]; then
32-
echo "Installing TensorRT-LLM wheel..."
33-
echo "Wheel path pattern: $WHEEL_PATH"
34-
35-
# Expand wildcard and install
36-
for wheel_file in $WHEEL_PATH; do
37-
if [ -f "$wheel_file" ]; then
38-
echo "Found wheel: $wheel_file"
39-
pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..."
40-
break
41-
fi
42-
done
43-
echo "Wheel installation completed"
44-
45-
elif [ "$INSTALL_MODE" = "source" ]; then
46-
echo "Installing TensorRT-LLM from source..."
47-
cd "$REPO_DIR"
48-
pip3 install -e . 2>&1 || echo "Source install failed, continuing..."
49-
echo "Source installation completed"
50-
27+
echo ""
28+
echo "Step 2: Collecting TensorRT-LLM version information..."
29+
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
30+
python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
31+
echo "TensorRT-LLM version written to: $VERSION_FILE"
5132
else
52-
echo "ERROR: Invalid install mode: $INSTALL_MODE"
53-
exit 1
33+
echo ""
34+
echo "Step 2: Skipping TensorRT-LLM version collection (install_mode=$INSTALL_MODE)"
5435
fi
5536

56-
# Step 3: Collect TensorRT-LLM version information
57-
echo ""
58-
echo "Step 3: Collecting TensorRT-LLM version information..."
59-
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
60-
python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
61-
echo "TensorRT-LLM version written to: $VERSION_FILE"
62-
6337
echo ""
6438
echo "=========================================="
6539
echo "Session Collect Job Completed"

tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,12 @@ worker_config:
7777
stream_interval: 20
7878
num_postprocess_workers: 4
7979
ctx:
80-
max_batch_size: 8
80+
max_batch_size: 1
8181
max_num_tokens: 131104
8282
max_seq_len: 131104
8383
tensor_parallel_size: 1
8484
moe_expert_parallel_size: 1
85-
enable_attention_dp: true
85+
enable_attention_dp: false
8686
pipeline_parallel_size: 8
8787
print_iter_log: true
8888
cuda_graph_config: null

0 commit comments

Comments (0)