
Commit 34d975b

Fix eval tests not capturing server launch failures (#18886)
1 parent e02a9be commit 34d975b

File tree

4 files changed: +137 -95 lines changed


python/sglang/srt/model_loader/ci_weight_validation.py

Lines changed: 91 additions & 57 deletions
@@ -1730,30 +1730,44 @@ def _validate_weights_after_download(
     return True


-def _get_lock_file_path(model_name_or_path: str) -> str:
+def _get_lock_file_path(
+    model_name_or_path: str, cache_dir: Optional[str] = None
+) -> str:
     """
     Generate a unique lock file path for download coordination.

-    Uses file-based locking (fcntl.flock) to ensure only one process downloads
-    while others wait. This works regardless of how processes are spawned
-    (mp.Process, torchrun, etc.).
+    In CI environments where multiple containers share an NFS-mounted HF cache,
+    the lock file is placed on the shared cache directory so ALL containers
+    coordinate on the same lock. This prevents cross-container .incomplete
+    file race conditions.
+
+    Falls back to /dev/shm (container-local) for non-CI or when the cache
+    dir is not accessible.

     Args:
         model_name_or_path: Model identifier
+        cache_dir: HF cache directory (None to use default)

     Returns:
         Path to the lock file
     """
-    # Create a unique hash based on model name only (not cache_dir)
-    # This ensures all processes coordinate on the same lock regardless of
-    # cache_dir configuration differences between processes
     key_hash = hashlib.sha256(model_name_or_path.encode()).hexdigest()[:16]

-    # Use /dev/shm (shared memory filesystem) for lock files because:
-    # 1. It's always local to the machine (not NFS)
-    # 2. It properly supports file locking
-    # 3. It's shared across all processes on the same machine
-    # Fall back to /tmp if /dev/shm doesn't exist
+    # In CI, place lock on the shared HF cache directory so that ALL containers
+    # sharing the same NFS-mounted cache coordinate downloads.
+    # /dev/shm is container-local and doesn't prevent cross-container races.
+    try:
+        import huggingface_hub.constants
+
+        effective_cache_dir = cache_dir or huggingface_hub.constants.HF_HUB_CACHE
+        if os.path.isdir(effective_cache_dir):
+            lock_dir = os.path.join(effective_cache_dir, ".sglang_locks")
+            os.makedirs(lock_dir, exist_ok=True)
+            return os.path.join(lock_dir, f"download_{key_hash}.lock")
+    except Exception:
+        pass
+
+    # Fallback to container-local lock
     if os.path.isdir("/dev/shm"):
         return f"/dev/shm/sglang_download_lock_{key_hash}"
     return f"/tmp/sglang_download_lock_{key_hash}"
@@ -1826,13 +1840,10 @@ def ci_download_with_validation_and_retry(
     This function handles the download of model weights in CI environments,
     with automatic validation and retry logic for handling corrupted downloads.

-    Uses file-based locking (fcntl.flock) to prevent HuggingFace hub race
-    conditions where multiple processes try to download simultaneously,
-    causing .incomplete file conflicts. Only one process downloads at a time;
-    others wait for the lock then use the cached result.
-
-    This approach works regardless of how processes are spawned (mp.Process,
-    torchrun, etc.) since it doesn't rely on environment variables.
+    Uses filelock.FileLock on the shared HF cache directory to coordinate
+    downloads across all processes AND all containers sharing the same
+    NFS-mounted cache. Only one process downloads at a time; others wait
+    for the lock then use the cached result.

     Args:
         model_name_or_path: The model name or path
@@ -1848,8 +1859,7 @@ def ci_download_with_validation_and_retry(
     Raises:
         RuntimeError: If download fails after max_retries attempts
     """
-    import fcntl
-
+    import filelock
     import huggingface_hub.constants
     from huggingface_hub import snapshot_download
     from tqdm.auto import tqdm
@@ -1859,36 +1869,69 @@ def __init__(self, *args, **kwargs):
             kwargs["disable"] = True
             super().__init__(*args, **kwargs)

-    # Use file-based locking to serialize downloads across all processes
-    # This prevents HF hub race conditions with .incomplete files
-    lock_file_path = _get_lock_file_path(model_name_or_path)
+    # Use filelock on the shared HF cache directory to coordinate downloads
+    # across all processes AND all containers sharing the same NFS mount.
+    # This prevents cross-container .incomplete file race conditions.
+    lock_file_path = _get_lock_file_path(model_name_or_path, cache_dir)

-    # Log lock file path for debugging
     logger.info(
         "[CI Download] Process %d using lock file: %s",
         os.getpid(),
         lock_file_path,
     )

-    # Create lock file if it doesn't exist
-    lock_file = open(lock_file_path, "w")
+    # filelock.FileLock handles creation, acquisition, and release cleanly.
+    # timeout=-1 means wait indefinitely (another container may be downloading
+    # a large model for 30+ minutes).
+    lock = filelock.FileLock(lock_file_path, timeout=-1, mode=0o666)

-    try:
-        # Acquire exclusive lock - blocks until lock is available
-        # This ensures only one process downloads at a time
-        logger.info(
-            "[CI Download] Process %d waiting to acquire lock for %s",
-            os.getpid(),
-            model_name_or_path,
-        )
-        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
+    logger.info(
+        "[CI Download] Process %d waiting to acquire lock for %s",
+        os.getpid(),
+        model_name_or_path,
+    )
+
+    with lock:
         logger.info(
             "[CI Download] Process %d ACQUIRED lock for %s",
             os.getpid(),
             model_name_or_path,
         )

-        # Now we have exclusive access - perform download with retry logic
+        # Re-check if another container already downloaded the model while
+        # we were waiting for the lock. This avoids redundant downloads.
+        try:
+            from sglang.srt.model_loader.weight_utils import (
+                _find_local_hf_snapshot_dir_unlocked,
+            )
+
+            cached_path = _find_local_hf_snapshot_dir_unlocked(
+                model_name_or_path, cache_dir, allow_patterns, revision
+            )
+            if cached_path is not None:
+                logger.info(
+                    "[CI Download] Process %d found cached model after "
+                    "acquiring lock (downloaded by another container): %s",
+                    os.getpid(),
+                    cached_path,
+                )
+                return cached_path
+        except Exception as e:
+            logger.debug(
+                "[CI Download] Re-check for cached model failed (non-fatal): %s", e
+            )
+
+        # Clean up stale .incomplete files from previous failed downloads
+        # before starting. Only do this once before the first attempt.
+        cleaned = _cleanup_incomplete_blobs(model_name_or_path, cache_dir)
+        if cleaned > 0:
+            logger.info(
+                "[CI Download] Pre-download cleanup: removed %d stale "
+                ".incomplete file(s) for %s",
+                cleaned,
+                model_name_or_path,
+            )
+
         hf_folder = None
         for attempt in range(max_retries):
             try:
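Note: the switch from a manual fcntl.flock/unlock pair to filelock.FileLock is what lets the with-block above release the lock even when the body raises. A rough, self-contained sketch of the same blocking-acquire pattern; the lock and marker paths and the sleep are stand-ins for the real cache and snapshot_download call.

import os
import time

from filelock import FileLock  # third-party "filelock" package


def download_once(lock_path: str, marker_path: str) -> None:
    """Whichever process acquires the lock first does the expensive work;
    the others block, then find the result already present and skip it."""
    lock = FileLock(lock_path, timeout=-1)  # timeout=-1: wait indefinitely
    with lock:  # released automatically, even if the body raises
        if os.path.exists(marker_path):
            print(f"[{os.getpid()}] already downloaded, skipping")
            return
        print(f"[{os.getpid()}] downloading ...")
        time.sleep(2)  # stand-in for snapshot_download()
        open(marker_path, "w").close()


if __name__ == "__main__":
    # Launch this from several processes (or containers sharing the same
    # mount) at once; exactly one of them performs the "download".
    download_once("/tmp/demo_model.lock", "/tmp/demo_model.done")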
@@ -1907,12 +1950,11 @@ def __init__(self, *args, **kwargs):
                     max_workers=1,
                 )
             except (FileNotFoundError, OSError) as e:
-                # Cross-container race condition: another container on the same
-                # host moved/deleted the .incomplete file while we were using it.
-                # This happens when multiple CI containers share an NFS-mounted
-                # HF cache but have separate /dev/shm lock namespaces.
+                # Race condition: .incomplete file was moved/deleted by another
+                # process. With NFS-level locking this should be rare, but can
+                # still happen if lock acquisition fails on some NFS setups.
                 logger.warning(
-                    "[CI Download] Process %d hit download race condition "
+                    "[CI Download] Process %d hit download error "
                     "(attempt %d/%d) for %s: %s: %s",
                     os.getpid(),
                     attempt + 1,
@@ -1921,19 +1963,21 @@ def __init__(self, *args, **kwargs):
                     type(e).__name__,
                     e,
                 )
-                _cleanup_incomplete_blobs(model_name_or_path, cache_dir)
                 if attempt < max_retries - 1:
-                    backoff = 2**attempt
+                    # Backoff: 10s, 20s, 40s. Clean only the stale
+                    # .incomplete files (not active ones from other processes).
+                    backoff = 10 * (2**attempt)
                     logger.info(
-                        "[CI Download] Retrying in %ds...",
+                        "[CI Download] Cleaning up .incomplete files and "
+                        "retrying in %ds...",
                         backoff,
                     )
+                    _cleanup_incomplete_blobs(model_name_or_path, cache_dir)
                     time.sleep(backoff)
                     continue
                 raise RuntimeError(
                     f"Download failed for {model_name_or_path} after "
-                    f"{max_retries} attempts due to cross-container race "
-                    f"condition (.incomplete file conflicts). "
+                    f"{max_retries} attempts due to download errors. "
                     f"Last error: {type(e).__name__}: {e}"
                 ) from e

@@ -1963,16 +2007,6 @@ def __init__(self, *args, **kwargs):
         # Should never reach here, but return hf_folder just in case
         return hf_folder

-    finally:
-        # Always release the lock
-        fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
-        lock_file.close()
-        logger.info(
-            "[CI Download] Process %d RELEASED lock for %s",
-            os.getpid(),
-            model_name_or_path,
-        )
-

 def ci_validate_and_clean_hf_cache(model_path: str) -> None:
     """

python/sglang/test/nightly_utils.py

Lines changed: 16 additions & 12 deletions
@@ -259,18 +259,21 @@ def run_benchmark_for_model(
         avg_spec_accept_length = None
         model_description = f"{model_path}" + (f" ({variant})" if variant else "")

-        # Launch server
-        process = popen_launch_server(
-            model=model_path,
-            base_url=self.base_url,
-            other_args=other_args or [],
-            timeout=(
-                timeout if timeout is not None else DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
-            ),
-            env=env,
-        )
-
+        process = None
         try:
+            # Launch server
+            process = popen_launch_server(
+                model=model_path,
+                base_url=self.base_url,
+                other_args=other_args or [],
+                timeout=(
+                    timeout
+                    if timeout is not None
+                    else DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+                ),
+                env=env,
+            )
+
             # Generate filenames
             profile_path_prefix, json_output_file = self.generate_profile_filename(
                 model_path, variant
@@ -311,7 +314,8 @@ def run_benchmark_for_model(

         finally:
             # Always clean up server process
-            kill_process_tree(process.pid)
+            if process is not None:
+                kill_process_tree(process.pid)

     def _get_spec_accept_length(self) -> Optional[float]:
         """Query the server for avg_spec_accept_length metric.

test/registered/eval/test_text_models_gsm8k_eval.py

Lines changed: 15 additions & 14 deletions
@@ -11,7 +11,6 @@
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     ModelLaunchSettings,
     check_evaluation_test_results,
@@ -20,6 +19,10 @@
     write_results_to_json,
 )

+# Nightly eval tests run large models (up to 70B+ params) that may need
+# downloading on cache miss. Use a longer timeout than the default 600s.
+NIGHTLY_EVAL_SERVER_TIMEOUT = 1800
+
 register_cuda_ci(est_time=3600, suite="nightly-eval-text-2-gpu", nightly=True)

 MODEL_SCORE_THRESHOLDS = {
@@ -72,19 +75,19 @@ def test_mgsm_en_all_models(self):
         for model_setup in self.models:
             with self.subTest(model=model_setup.model_path):
                 other_args = list(model_setup.extra_args)
-                error_message = None
+                process = None

                 if model_setup.model_path == "meta-llama/Llama-3.1-70B-Instruct":
                     other_args.extend(["--mem-fraction-static", "0.9"])

-                process = popen_launch_server(
-                    model=model_setup.model_path,
-                    other_args=other_args,
-                    base_url=self.base_url,
-                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-                )
-
                 try:
+                    process = popen_launch_server(
+                        model=model_setup.model_path,
+                        other_args=other_args,
+                        base_url=self.base_url,
+                        timeout=NIGHTLY_EVAL_SERVER_TIMEOUT,
+                    )
+
                     args = SimpleNamespace(
                         base_url=self.base_url,
                         model=model_setup.model_path,
@@ -103,20 +106,18 @@ def test_mgsm_en_all_models(self):
                     )
                     is_first = False

-                    # 0.0 for empty latency, None for no error
                     all_results.append(
-                        (model_setup.model_path, metrics["score"], 0.0, error_message)
+                        (model_setup.model_path, metrics["score"], 0.0, None)
                     )
                 except Exception as e:
-                    # Capture error message for the summary table
                     error_message = str(e)
-                    # Still append result with error info (use None for N/A metrics to match else clause)
                     all_results.append(
                         (model_setup.model_path, None, None, error_message)
                     )
                     print(f"Error evaluating {model_setup.model_path}: {error_message}")
                 finally:
-                    kill_process_tree(process.pid)
+                    if process is not None:
+                        kill_process_tree(process.pid)

             try:
                 with open("results.json", "r") as f: