Skip to content

Commit f6045fa

Browse files
[None][chore] Fix Gitlab CI termination issues (NVIDIA#10576)

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Co-authored-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
1 parent f6c4dd8 commit f6045fa

9 files changed

Lines changed: 278 additions & 82 deletions

File tree

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
#!/bin/bash
# cleanup_jobs.sh - Cancel all SLURM jobs tracked in jobs.txt
#
# This script is designed to run in GitLab CI after_script to ensure
# all SLURM jobs are cancelled when the pipeline is interrupted, cancelled,
# or times out.
#
# Usage:
#   bash cleanup_jobs.sh
#
# Environment variables:
#   OUTPUT_PATH: Directory containing jobs.txt and pytest.pid

# -e: abort on unhandled errors; -u: error on unset vars;
# pipefail: a pipeline fails if any stage fails (the original only set -e,
# so a failing `sort` in `sort | grep` went unnoticed).
set -euo pipefail

OUTPUT_PATH="${OUTPUT_PATH:-/tmp}"
JOBS_FILE="${OUTPUT_PATH}/jobs.txt"
PID_FILE="${OUTPUT_PATH}/pytest.pid"

echo "=========================================="
echo "SLURM Job Cleanup Script"
echo "=========================================="
echo "Output path: $OUTPUT_PATH"
echo ""

# Show pytest PID if available (for debugging)
if [ -f "$PID_FILE" ]; then
    # Read the PID directly; no need for `cat | tr` (useless use of cat)
    PYTEST_PID=$(tr -d '\n' < "$PID_FILE")
    echo "Pytest PID: $PYTEST_PID"

    # kill -0 sends no signal; it only probes whether the process exists
    if kill -0 "$PYTEST_PID" 2>/dev/null; then
        echo "Status: Still running"
    else
        echo "Status: Already terminated"
    fi
    echo ""
else
    echo "No pytest.pid found (test may not have started)"
    echo ""
fi

# Check if jobs.txt exists
if [ ! -f "$JOBS_FILE" ]; then
    echo "[WARN] No jobs.txt found"
    echo "       Nothing to cancel"
    echo "=========================================="
    exit 0
fi

echo "[INFO] Reading jobs from: $JOBS_FILE"

# Read, deduplicate, and filter empty lines.
# `|| true` keeps set -e from aborting when every line is empty
# (grep -v exits non-zero when it emits nothing).
JOBS=$(sort -u "$JOBS_FILE" | grep -v '^$' || true)

if [ -z "$JOBS" ]; then
    echo "[WARN] jobs.txt is empty"
    echo "       Nothing to cancel"
    echo "=========================================="
    exit 0
fi

# Here-string instead of `echo | wc -l`; avoids an extra pipeline stage
JOB_COUNT=$(wc -l <<< "$JOBS")
echo "Found $JOB_COUNT job(s) to cancel"
echo ""

# Cancel each job
CANCELLED=0
ALREADY_DONE=0
FAILED=0

echo "Cancelling jobs..."
while IFS= read -r job_id; do
    if [ -n "$job_id" ]; then
        printf "  %-12s ... " "$job_id"

        # Try to cancel the job
        if scancel "$job_id" 2>/dev/null; then
            echo "[OK] Cancelled"
            CANCELLED=$((CANCELLED + 1))
        else
            # scancel failed: distinguish "still queued/running" from
            # "already finished". squeue -j already filters to this job, so
            # test for non-empty output instead of the original unanchored
            # `grep -q "$job_id"`, which substring-matched other job IDs
            # (e.g. job 123 matched a line for job 1234).
            if [ -n "$(squeue -j "$job_id" -h 2>/dev/null)" ]; then
                echo "[FAIL] Failed to cancel"
                FAILED=$((FAILED + 1))
            else
                echo "[SKIP] Already finished"
                ALREADY_DONE=$((ALREADY_DONE + 1))
            fi
        fi
    fi
done <<< "$JOBS"

echo ""
echo "=========================================="
echo "[DONE] Cleanup completed"
echo "       Total:        $JOB_COUNT"
echo "       Cancelled:    $CANCELLED"
echo "       Already done: $ALREADY_DONE"
echo "       Failed:       $FAILED"
echo "=========================================="

# Exit with error if any cancellation actually failed
if [ "$FAILED" -gt 0 ]; then
    exit 1
fi

exit 0

tests/integration/defs/perf/disagg/conftest.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ def __init__(self, batch_size=5):
151151

152152
self.submitted_batches = set() # Track which batch numbers have been submitted
153153
self.job_mapping = {} # Map test_id -> SLURM job_id
154+
self.submit_errors = {} # Map test_id -> error message (validation/submission failures)
154155
self.all_configs = [] # Ordered list of all test configs
155156

156157
logger.info(f"\n{'=' * 70}")
@@ -214,6 +215,8 @@ def _submit_batch(self, batch_num):
214215
batch_num: Batch number to submit (0-indexed)
215216
"""
216217
from execution.executor import JobManager
218+
from utils.config_validator import ConfigValidator
219+
from utils.job_tracker import JobTracker
217220

218221
# Calculate batch range
219222
if self.batch_size:
@@ -230,33 +233,56 @@ def _submit_batch(self, batch_num):
230233
logger.info(f"Range: [{start_idx}:{end_idx}] ({len(batch_configs)} jobs)")
231234
logger.info(f"{'=' * 70}\n")
232235

233-
# Submit all jobs in this batch
236+
# Pre-validate all configs before submission
237+
logger.info("Pre-validating configurations...")
238+
valid_configs = []
239+
for config in batch_configs:
240+
try:
241+
ConfigValidator.validate_test_config(config)
242+
valid_configs.append(config)
243+
except Exception as e:
244+
# Validation failed - mark as None and record error
245+
self.job_mapping[config.test_id] = None
246+
self.submit_errors[config.test_id] = f"Validation failed: {str(e)}"
247+
logger.error(f" [FAILED] Validation failed: {config.test_id}")
248+
logger.error(f" Error: {str(e)[:100]}")
249+
250+
logger.info(
251+
f"Validation complete: {len(valid_configs)}/{len(batch_configs)} configs valid\n"
252+
)
253+
254+
# Submit only valid configs
234255
success_count = 0
235-
for i, config in enumerate(batch_configs, 1):
256+
for i, config in enumerate(valid_configs, 1):
236257
try:
237258
success, job_id = JobManager.submit_test_job(config)
238259
if success and job_id:
239260
self.job_mapping[config.test_id] = job_id
261+
JobTracker.record_job(job_id) # Record job ID for cleanup
240262
success_count += 1
241-
# Truncate test_id for display
242-
display_id = (
243-
config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id
263+
logger.success(
264+
f" [{i:3d}/{len(valid_configs)}] Job {job_id} <- {config.test_id}"
244265
)
245-
logger.success(f" [{i:3d}/{len(batch_configs)}] Job {job_id} <- {display_id}")
246266
else:
267+
# Submission failed - mark as None and record error
247268
self.job_mapping[config.test_id] = None
248-
logger.error(f" [{i:3d}/{len(batch_configs)}] Failed: {config.test_id[:50]}")
269+
self.submit_errors[config.test_id] = f"Job submission failed: {job_id}"
270+
logger.error(f" [{i:3d}/{len(valid_configs)}] Failed: {config.test_id}")
249271
except Exception as e:
272+
# Submission exception - mark as None and record error
250273
self.job_mapping[config.test_id] = None
251-
logger.error(f" [{i:3d}/{len(batch_configs)}] Error: {e}")
274+
self.submit_errors[config.test_id] = f"Submission exception: {str(e)}"
275+
logger.error(f" [{i:3d}/{len(valid_configs)}] Error: {e}")
252276

253277
# Mark batch as submitted
254278
self.submitted_batches.add(batch_num)
255279

256280
logger.info(f"\n{'=' * 70}")
257281
logger.success(
258-
f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded"
282+
f"Batch {batch_num} Complete: {success_count}/{len(valid_configs)} submitted successfully"
259283
)
284+
if len(valid_configs) < len(batch_configs):
285+
logger.warning(f"Skipped {len(batch_configs) - len(valid_configs)} invalid config(s)")
260286
logger.info(f"{'=' * 70}\n")
261287

262288

tests/integration/defs/perf/disagg/execution/executor.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -271,21 +271,26 @@ def submit_test_job(test_config) -> tuple:
271271

272272
@staticmethod
273273
def backup_logs(
274-
job_id: str,
274+
job_id: Optional[str],
275275
test_config,
276276
result_dir: str,
277277
is_passed: bool,
278278
) -> Optional[str]:
279279
"""Backup logs and config files to test_id directory.
280280
281281
Args:
282-
job_id: SLURM job ID
282+
job_id: SLURM job ID (None if submission failed)
283283
test_config: TestConfig object
284284
result_dir: Result directory path (already named as test_id)
285285
is_passed: Whether the job passed
286286
Returns:
287287
Final directory path if successful, None otherwise
288288
"""
289+
if job_id is None:
290+
logger.warning(f"Job submission failed for {test_config.test_id}")
291+
else:
292+
logger.info(f"Backing up logs for job {job_id} ({test_config.test_id})")
293+
289294
if not os.path.exists(result_dir):
290295
logger.warning(f"Result directory does not exist yet: {result_dir}")
291296
return None

tests/integration/defs/perf/disagg/reporting/accuracy_validator.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,13 @@ def __post_init__(self) -> None:
9292
# Dataset default parameters for hypothesis testing
9393
# Extracted from accuracy_core.py AccuracyTask subclasses
9494
DATASET_DEFAULTS = {
95+
"aime25": {
96+
"alpha": 0.05,
97+
"beta": 0.2,
98+
"sigma": 50,
99+
"num_samples": 30, # AIME 2025 full sample size
100+
"higher_is_better": True,
101+
},
95102
"gsm8k": {
96103
"alpha": 0.05,
97104
"beta": 0.2,
@@ -127,6 +134,14 @@ def __post_init__(self) -> None:
127134
"num_samples": 198,
128135
"higher_is_better": True,
129136
},
137+
# Alias for gpqa_diamond (same task, different naming convention)
138+
"gpqa_diamond_cot_zeroshot": {
139+
"alpha": 0.05,
140+
"beta": 0.2,
141+
"sigma": 50,
142+
"num_samples": 198,
143+
"higher_is_better": True,
144+
},
130145
"json_mode_eval": {
131146
"alpha": 0.05,
132147
"beta": 0.2,

tests/integration/defs/perf/disagg/session_collect.sh

Lines changed: 8 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -22,44 +22,18 @@ cd "$WORK_DIR"
2222
python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
2323
echo "System information collection completed"
2424

25-
# Step 2: Handle different installation modes
26-
echo ""
27-
echo "Step 2: Installing TensorRT-LLM..."
25+
# Step 2: Collect TensorRT-LLM version information (only for none mode)
2826
if [ "$INSTALL_MODE" = "none" ]; then
29-
echo "Using built-in TensorRT-LLM, skipping installation"
30-
31-
elif [ "$INSTALL_MODE" = "wheel" ]; then
32-
echo "Installing TensorRT-LLM wheel..."
33-
echo "Wheel path pattern: $WHEEL_PATH"
34-
35-
# Expand wildcard and install
36-
for wheel_file in $WHEEL_PATH; do
37-
if [ -f "$wheel_file" ]; then
38-
echo "Found wheel: $wheel_file"
39-
pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..."
40-
break
41-
fi
42-
done
43-
echo "Wheel installation completed"
44-
45-
elif [ "$INSTALL_MODE" = "source" ]; then
46-
echo "Installing TensorRT-LLM from source..."
47-
cd "$REPO_DIR"
48-
pip3 install -e . 2>&1 || echo "Source install failed, continuing..."
49-
echo "Source installation completed"
50-
27+
echo ""
28+
echo "Step 2: Collecting TensorRT-LLM version information..."
29+
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
30+
python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
31+
echo "TensorRT-LLM version written to: $VERSION_FILE"
5132
else
52-
echo "ERROR: Invalid install mode: $INSTALL_MODE"
53-
exit 1
33+
echo ""
34+
echo "Step 2: Skipping TensorRT-LLM version collection (install_mode=$INSTALL_MODE)"
5435
fi
5536

56-
# Step 3: Collect TensorRT-LLM version information
57-
echo ""
58-
echo "Step 3: Collecting TensorRT-LLM version information..."
59-
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
60-
python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
61-
echo "TensorRT-LLM version written to: $VERSION_FILE"
62-
6337
echo ""
6438
echo "=========================================="
6539
echo "Session Collect Job Completed"

tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,12 @@ worker_config:
7777
stream_interval: 20
7878
num_postprocess_workers: 4
7979
ctx:
80-
max_batch_size: 8
80+
max_batch_size: 1
8181
max_num_tokens: 131104
8282
max_seq_len: 131104
8383
tensor_parallel_size: 1
8484
moe_expert_parallel_size: 1
85-
enable_attention_dp: true
85+
enable_attention_dp: false
8686
pipeline_parallel_size: 8
8787
print_iter_log: true
8888
cuda_graph_config: null

0 commit comments

Comments (0)