
Commit 34d975b

Fix eval tests not capturing server launch failures (#18886)
1 parent e02a9be commit 34d975b

File tree

4 files changed: +137 -95 lines changed


python/sglang/srt/model_loader/ci_weight_validation.py

Lines changed: 91 additions & 57 deletions
@@ -1730,30 +1730,44 @@ def _validate_weights_after_download(
     return True


-def _get_lock_file_path(model_name_or_path: str) -> str:
+def _get_lock_file_path(
+    model_name_or_path: str, cache_dir: Optional[str] = None
+) -> str:
     """
     Generate a unique lock file path for download coordination.

-    Uses file-based locking (fcntl.flock) to ensure only one process downloads
-    while others wait. This works regardless of how processes are spawned
-    (mp.Process, torchrun, etc.).
+    In CI environments where multiple containers share an NFS-mounted HF cache,
+    the lock file is placed on the shared cache directory so ALL containers
+    coordinate on the same lock. This prevents cross-container .incomplete
+    file race conditions.
+
+    Falls back to /dev/shm (container-local) for non-CI or when the cache
+    dir is not accessible.

     Args:
         model_name_or_path: Model identifier
+        cache_dir: HF cache directory (None to use default)

     Returns:
         Path to the lock file
     """
-    # Create a unique hash based on model name only (not cache_dir)
-    # This ensures all processes coordinate on the same lock regardless of
-    # cache_dir configuration differences between processes
     key_hash = hashlib.sha256(model_name_or_path.encode()).hexdigest()[:16]

-    # Use /dev/shm (shared memory filesystem) for lock files because:
-    # 1. It's always local to the machine (not NFS)
-    # 2. It properly supports file locking
-    # 3. It's shared across all processes on the same machine
-    # Fall back to /tmp if /dev/shm doesn't exist
+    # In CI, place lock on the shared HF cache directory so that ALL containers
+    # sharing the same NFS-mounted cache coordinate downloads.
+    # /dev/shm is container-local and doesn't prevent cross-container races.
+    try:
+        import huggingface_hub.constants
+
+        effective_cache_dir = cache_dir or huggingface_hub.constants.HF_HUB_CACHE
+        if os.path.isdir(effective_cache_dir):
+            lock_dir = os.path.join(effective_cache_dir, ".sglang_locks")
+            os.makedirs(lock_dir, exist_ok=True)
+            return os.path.join(lock_dir, f"download_{key_hash}.lock")
+    except Exception:
+        pass
+
+    # Fallback to container-local lock
     if os.path.isdir("/dev/shm"):
         return f"/dev/shm/sglang_download_lock_{key_hash}"
     return f"/tmp/sglang_download_lock_{key_hash}"
@@ -1826,13 +1840,10 @@ def ci_download_with_validation_and_retry(
     This function handles the download of model weights in CI environments,
     with automatic validation and retry logic for handling corrupted downloads.

-    Uses file-based locking (fcntl.flock) to prevent HuggingFace hub race
-    conditions where multiple processes try to download simultaneously,
-    causing .incomplete file conflicts. Only one process downloads at a time;
-    others wait for the lock then use the cached result.
-
-    This approach works regardless of how processes are spawned (mp.Process,
-    torchrun, etc.) since it doesn't rely on environment variables.
+    Uses filelock.FileLock on the shared HF cache directory to coordinate
+    downloads across all processes AND all containers sharing the same
+    NFS-mounted cache. Only one process downloads at a time; others wait
+    for the lock then use the cached result.

     Args:
         model_name_or_path: The model name or path
@@ -1848,8 +1859,7 @@ def ci_download_with_validation_and_retry(
     Raises:
         RuntimeError: If download fails after max_retries attempts
     """
-    import fcntl
-
+    import filelock
     import huggingface_hub.constants
     from huggingface_hub import snapshot_download
     from tqdm.auto import tqdm
@@ -1859,36 +1869,69 @@ def __init__(self, *args, **kwargs):
             kwargs["disable"] = True
             super().__init__(*args, **kwargs)

-    # Use file-based locking to serialize downloads across all processes
-    # This prevents HF hub race conditions with .incomplete files
-    lock_file_path = _get_lock_file_path(model_name_or_path)
+    # Use filelock on the shared HF cache directory to coordinate downloads
+    # across all processes AND all containers sharing the same NFS mount.
+    # This prevents cross-container .incomplete file race conditions.
+    lock_file_path = _get_lock_file_path(model_name_or_path, cache_dir)

-    # Log lock file path for debugging
     logger.info(
         "[CI Download] Process %d using lock file: %s",
         os.getpid(),
         lock_file_path,
     )

-    # Create lock file if it doesn't exist
-    lock_file = open(lock_file_path, "w")
+    # filelock.FileLock handles creation, acquisition, and release cleanly.
+    # timeout=-1 means wait indefinitely (another container may be downloading
+    # a large model for 30+ minutes).
+    lock = filelock.FileLock(lock_file_path, timeout=-1, mode=0o666)

-    try:
-        # Acquire exclusive lock - blocks until lock is available
-        # This ensures only one process downloads at a time
-        logger.info(
-            "[CI Download] Process %d waiting to acquire lock for %s",
-            os.getpid(),
-            model_name_or_path,
-        )
-        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
+    logger.info(
+        "[CI Download] Process %d waiting to acquire lock for %s",
+        os.getpid(),
+        model_name_or_path,
+    )
+
+    with lock:
         logger.info(
             "[CI Download] Process %d ACQUIRED lock for %s",
             os.getpid(),
             model_name_or_path,
         )

-        # Now we have exclusive access - perform download with retry logic
+        # Re-check if another container already downloaded the model while
+        # we were waiting for the lock. This avoids redundant downloads.
+        try:
+            from sglang.srt.model_loader.weight_utils import (
+                _find_local_hf_snapshot_dir_unlocked,
+            )
+
+            cached_path = _find_local_hf_snapshot_dir_unlocked(
+                model_name_or_path, cache_dir, allow_patterns, revision
+            )
+            if cached_path is not None:
+                logger.info(
+                    "[CI Download] Process %d found cached model after "
+                    "acquiring lock (downloaded by another container): %s",
+                    os.getpid(),
+                    cached_path,
+                )
+                return cached_path
+        except Exception as e:
+            logger.debug(
+                "[CI Download] Re-check for cached model failed (non-fatal): %s", e
+            )
+
+        # Clean up stale .incomplete files from previous failed downloads
+        # before starting. Only do this once before the first attempt.
+        cleaned = _cleanup_incomplete_blobs(model_name_or_path, cache_dir)
+        if cleaned > 0:
+            logger.info(
+                "[CI Download] Pre-download cleanup: removed %d stale "
+                ".incomplete file(s) for %s",
+                cleaned,
+                model_name_or_path,
+            )
+
         hf_folder = None
         for attempt in range(max_retries):
             try:
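Note: the switch from a manual fcntl.flock/unlock pair to filelock.FileLock is what lets the with-block above release the lock even when the body raises. A rough, self-contained sketch of the same blocking-acquire pattern; the lock and marker paths and the sleep are stand-ins for the real cache and snapshot_download call.

import os
import time

from filelock import FileLock  # third-party "filelock" package


def download_once(lock_path: str, marker_path: str) -> None:
    """Whichever process acquires the lock first does the expensive work;
    the others block, then find the result already present and skip it."""
    lock = FileLock(lock_path, timeout=-1)  # timeout=-1: wait indefinitely
    with lock:  # released automatically, even if the body raises
        if os.path.exists(marker_path):
            print(f"[{os.getpid()}] already downloaded, skipping")
            return
        print(f"[{os.getpid()}] downloading ...")
        time.sleep(2)  # stand-in for snapshot_download()
        open(marker_path, "w").close()


if __name__ == "__main__":
    # Launch this from several processes (or containers sharing the same
    # mount) at once; exactly one of them performs the "download".
    download_once("/tmp/demo_model.lock", "/tmp/demo_model.done")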
@@ -1907,12 +1950,11 @@ def __init__(self, *args, **kwargs):
                     max_workers=1,
                 )
             except (FileNotFoundError, OSError) as e:
-                # Cross-container race condition: another container on the same
-                # host moved/deleted the .incomplete file while we were using it.
-                # This happens when multiple CI containers share an NFS-mounted
-                # HF cache but have separate /dev/shm lock namespaces.
+                # Race condition: .incomplete file was moved/deleted by another
+                # process. With NFS-level locking this should be rare, but can
+                # still happen if lock acquisition fails on some NFS setups.
                 logger.warning(
-                    "[CI Download] Process %d hit download race condition "
+                    "[CI Download] Process %d hit download error "
                     "(attempt %d/%d) for %s: %s: %s",
                     os.getpid(),
                     attempt + 1,
@@ -1921,19 +1963,21 @@ def __init__(self, *args, **kwargs):
                     type(e).__name__,
                     e,
                 )
-                _cleanup_incomplete_blobs(model_name_or_path, cache_dir)
                 if attempt < max_retries - 1:
-                    backoff = 2**attempt
+                    # Backoff: 10s, 20s, 40s. Clean only the stale
+                    # .incomplete files (not active ones from other processes).
+                    backoff = 10 * (2**attempt)
                     logger.info(
-                        "[CI Download] Retrying in %ds...",
+                        "[CI Download] Cleaning up .incomplete files and "
+                        "retrying in %ds...",
                         backoff,
                     )
+                    _cleanup_incomplete_blobs(model_name_or_path, cache_dir)
                     time.sleep(backoff)
                     continue
                 raise RuntimeError(
                     f"Download failed for {model_name_or_path} after "
-                    f"{max_retries} attempts due to cross-container race "
-                    f"condition (.incomplete file conflicts). "
+                    f"{max_retries} attempts due to download errors. "
                     f"Last error: {type(e).__name__}: {e}"
                 ) from e

@@ -1963,16 +2007,6 @@ def __init__(self, *args, **kwargs):
         # Should never reach here, but return hf_folder just in case
         return hf_folder

-    finally:
-        # Always release the lock
-        fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
-        lock_file.close()
-        logger.info(
-            "[CI Download] Process %d RELEASED lock for %s",
-            os.getpid(),
-            model_name_or_path,
-        )
-

 def ci_validate_and_clean_hf_cache(model_path: str) -> None:
     """

python/sglang/test/nightly_utils.py

Lines changed: 16 additions & 12 deletions
@@ -259,18 +259,21 @@ def run_benchmark_for_model(
         avg_spec_accept_length = None
         model_description = f"{model_path}" + (f" ({variant})" if variant else "")

-        # Launch server
-        process = popen_launch_server(
-            model=model_path,
-            base_url=self.base_url,
-            other_args=other_args or [],
-            timeout=(
-                timeout if timeout is not None else DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
-            ),
-            env=env,
-        )
-
+        process = None
         try:
+            # Launch server
+            process = popen_launch_server(
+                model=model_path,
+                base_url=self.base_url,
+                other_args=other_args or [],
+                timeout=(
+                    timeout
+                    if timeout is not None
+                    else DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+                ),
+                env=env,
+            )
+
             # Generate filenames
             profile_path_prefix, json_output_file = self.generate_profile_filename(
                 model_path, variant
@@ -311,7 +314,8 @@ def run_benchmark_for_model(

         finally:
             # Always clean up server process
-            kill_process_tree(process.pid)
+            if process is not None:
+                kill_process_tree(process.pid)

     def _get_spec_accept_length(self) -> Optional[float]:
         """Query the server for avg_spec_accept_length metric.

test/registered/eval/test_text_models_gsm8k_eval.py

Lines changed: 15 additions & 14 deletions
@@ -11,7 +11,6 @@
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     ModelLaunchSettings,
     check_evaluation_test_results,
@@ -20,6 +19,10 @@
     write_results_to_json,
 )

+# Nightly eval tests run large models (up to 70B+ params) that may need
+# downloading on cache miss. Use a longer timeout than the default 600s.
+NIGHTLY_EVAL_SERVER_TIMEOUT = 1800
+
 register_cuda_ci(est_time=3600, suite="nightly-eval-text-2-gpu", nightly=True)

 MODEL_SCORE_THRESHOLDS = {
@@ -72,19 +75,19 @@ def test_mgsm_en_all_models(self):
         for model_setup in self.models:
             with self.subTest(model=model_setup.model_path):
                 other_args = list(model_setup.extra_args)
-                error_message = None
+                process = None

                 if model_setup.model_path == "meta-llama/Llama-3.1-70B-Instruct":
                     other_args.extend(["--mem-fraction-static", "0.9"])

-                process = popen_launch_server(
-                    model=model_setup.model_path,
-                    other_args=other_args,
-                    base_url=self.base_url,
-                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-                )
-
                 try:
+                    process = popen_launch_server(
+                        model=model_setup.model_path,
+                        other_args=other_args,
+                        base_url=self.base_url,
+                        timeout=NIGHTLY_EVAL_SERVER_TIMEOUT,
+                    )
+
                     args = SimpleNamespace(
                         base_url=self.base_url,
                         model=model_setup.model_path,
@@ -103,20 +106,18 @@ def test_mgsm_en_all_models(self):
                     )
                     is_first = False

-                    # 0.0 for empty latency, None for no error
                     all_results.append(
-                        (model_setup.model_path, metrics["score"], 0.0, error_message)
+                        (model_setup.model_path, metrics["score"], 0.0, None)
                     )
                 except Exception as e:
-                    # Capture error message for the summary table
                     error_message = str(e)
-                    # Still append result with error info (use None for N/A metrics to match else clause)
                     all_results.append(
                         (model_setup.model_path, None, None, error_message)
                     )
                     print(f"Error evaluating {model_setup.model_path}: {error_message}")
                 finally:
-                    kill_process_tree(process.pid)
+                    if process is not None:
+                        kill_process_tree(process.pid)

             try:
                 with open("results.json", "r") as f: