@@ -1730,30 +1730,44 @@ def _validate_weights_after_download(
17301730 return True
17311731
17321732
1733- def _get_lock_file_path (model_name_or_path : str ) -> str :
1733+ def _get_lock_file_path (
1734+ model_name_or_path : str , cache_dir : Optional [str ] = None
1735+ ) -> str :
17341736 """
17351737 Generate a unique lock file path for download coordination.
17361738
1737- Uses file-based locking (fcntl.flock) to ensure only one process downloads
1738- while others wait. This works regardless of how processes are spawned
1739- (mp.Process, torchrun, etc.).
1739+ In CI environments where multiple containers share an NFS-mounted HF cache,
1740+ the lock file is placed on the shared cache directory so ALL containers
1741+ coordinate on the same lock. This prevents cross-container .incomplete
1742+ file race conditions.
1743+
1744+ Falls back to /dev/shm (container-local) for non-CI or when the cache
1745+ dir is not accessible.
17401746
17411747 Args:
17421748 model_name_or_path: Model identifier
1749+ cache_dir: HF cache directory (None to use default)
17431750
17441751 Returns:
17451752 Path to the lock file
17461753 """
1747- # Create a unique hash based on model name only (not cache_dir)
1748- # This ensures all processes coordinate on the same lock regardless of
1749- # cache_dir configuration differences between processes
17501754 key_hash = hashlib .sha256 (model_name_or_path .encode ()).hexdigest ()[:16 ]
17511755
1752- # Use /dev/shm (shared memory filesystem) for lock files because:
1753- # 1. It's always local to the machine (not NFS)
1754- # 2. It properly supports file locking
1755- # 3. It's shared across all processes on the same machine
1756- # Fall back to /tmp if /dev/shm doesn't exist
1756+ # In CI, place lock on the shared HF cache directory so that ALL containers
1757+ # sharing the same NFS-mounted cache coordinate downloads.
1758+ # /dev/shm is container-local and doesn't prevent cross-container races.
1759+ try :
1760+ import huggingface_hub .constants
1761+
1762+ effective_cache_dir = cache_dir or huggingface_hub .constants .HF_HUB_CACHE
1763+ if os .path .isdir (effective_cache_dir ):
1764+ lock_dir = os .path .join (effective_cache_dir , ".sglang_locks" )
1765+ os .makedirs (lock_dir , exist_ok = True )
1766+ return os .path .join (lock_dir , f"download_{ key_hash } .lock" )
1767+ except Exception :
1768+ pass
1769+
1770+ # Fallback to container-local lock
17571771 if os .path .isdir ("/dev/shm" ):
17581772 return f"/dev/shm/sglang_download_lock_{ key_hash } "
17591773 return f"/tmp/sglang_download_lock_{ key_hash } "
@@ -1826,13 +1840,10 @@ def ci_download_with_validation_and_retry(
18261840 This function handles the download of model weights in CI environments,
18271841 with automatic validation and retry logic for handling corrupted downloads.
18281842
1829- Uses file-based locking (fcntl.flock) to prevent HuggingFace hub race
1830- conditions where multiple processes try to download simultaneously,
1831- causing .incomplete file conflicts. Only one process downloads at a time;
1832- others wait for the lock then use the cached result.
1833-
1834- This approach works regardless of how processes are spawned (mp.Process,
1835- torchrun, etc.) since it doesn't rely on environment variables.
1843+ Uses filelock.FileLock on the shared HF cache directory to coordinate
1844+ downloads across all processes AND all containers sharing the same
1845+ NFS-mounted cache. Only one process downloads at a time; others wait
1846+ for the lock then use the cached result.
18361847
18371848 Args:
18381849 model_name_or_path: The model name or path
@@ -1848,8 +1859,7 @@ def ci_download_with_validation_and_retry(
18481859 Raises:
18491860 RuntimeError: If download fails after max_retries attempts
18501861 """
1851- import fcntl
1852-
1862+ import filelock
18531863 import huggingface_hub .constants
18541864 from huggingface_hub import snapshot_download
18551865 from tqdm .auto import tqdm
@@ -1859,36 +1869,69 @@ def __init__(self, *args, **kwargs):
18591869 kwargs ["disable" ] = True
18601870 super ().__init__ (* args , ** kwargs )
18611871
1862- # Use file-based locking to serialize downloads across all processes
1863- # This prevents HF hub race conditions with .incomplete files
1864- lock_file_path = _get_lock_file_path (model_name_or_path )
1872+ # Use filelock on the shared HF cache directory to coordinate downloads
1873+ # across all processes AND all containers sharing the same NFS mount.
1874+ # This prevents cross-container .incomplete file race conditions.
1875+ lock_file_path = _get_lock_file_path (model_name_or_path , cache_dir )
18651876
1866- # Log lock file path for debugging
18671877 logger .info (
18681878 "[CI Download] Process %d using lock file: %s" ,
18691879 os .getpid (),
18701880 lock_file_path ,
18711881 )
18721882
1873- # Create lock file if it doesn't exist
1874- lock_file = open (lock_file_path , "w" )
1883+ # filelock.FileLock handles creation, acquisition, and release cleanly.
1884+ # timeout=-1 means wait indefinitely (another container may be downloading
1885+ # a large model for 30+ minutes).
1886+ lock = filelock .FileLock (lock_file_path , timeout = - 1 , mode = 0o666 )
18751887
1876- try :
1877- # Acquire exclusive lock - blocks until lock is available
1878- # This ensures only one process downloads at a time
1879- logger .info (
1880- "[CI Download] Process %d waiting to acquire lock for %s" ,
1881- os .getpid (),
1882- model_name_or_path ,
1883- )
1884- fcntl .flock (lock_file .fileno (), fcntl .LOCK_EX )
1888+ logger .info (
1889+ "[CI Download] Process %d waiting to acquire lock for %s" ,
1890+ os .getpid (),
1891+ model_name_or_path ,
1892+ )
1893+
1894+ with lock :
18851895 logger .info (
18861896 "[CI Download] Process %d ACQUIRED lock for %s" ,
18871897 os .getpid (),
18881898 model_name_or_path ,
18891899 )
18901900
1891- # Now we have exclusive access - perform download with retry logic
1901+ # Re-check if another container already downloaded the model while
1902+ # we were waiting for the lock. This avoids redundant downloads.
1903+ try :
1904+ from sglang .srt .model_loader .weight_utils import (
1905+ _find_local_hf_snapshot_dir_unlocked ,
1906+ )
1907+
1908+ cached_path = _find_local_hf_snapshot_dir_unlocked (
1909+ model_name_or_path , cache_dir , allow_patterns , revision
1910+ )
1911+ if cached_path is not None :
1912+ logger .info (
1913+ "[CI Download] Process %d found cached model after "
1914+ "acquiring lock (downloaded by another container): %s" ,
1915+ os .getpid (),
1916+ cached_path ,
1917+ )
1918+ return cached_path
1919+ except Exception as e :
1920+ logger .debug (
1921+ "[CI Download] Re-check for cached model failed (non-fatal): %s" , e
1922+ )
1923+
1924+ # Clean up stale .incomplete files from previous failed downloads
1925+ # before starting. Only do this once before the first attempt.
1926+ cleaned = _cleanup_incomplete_blobs (model_name_or_path , cache_dir )
1927+ if cleaned > 0 :
1928+ logger .info (
1929+ "[CI Download] Pre-download cleanup: removed %d stale "
1930+ ".incomplete file(s) for %s" ,
1931+ cleaned ,
1932+ model_name_or_path ,
1933+ )
1934+
18921935 hf_folder = None
18931936 for attempt in range (max_retries ):
18941937 try :
@@ -1907,12 +1950,11 @@ def __init__(self, *args, **kwargs):
19071950 max_workers = 1 ,
19081951 )
19091952 except (FileNotFoundError , OSError ) as e :
1910- # Cross-container race condition: another container on the same
1911- # host moved/deleted the .incomplete file while we were using it.
1912- # This happens when multiple CI containers share an NFS-mounted
1913- # HF cache but have separate /dev/shm lock namespaces.
1953+ # Race condition: .incomplete file was moved/deleted by another
1954+ # process. With NFS-level locking this should be rare, but can
1955+ # still happen if lock acquisition fails on some NFS setups.
19141956 logger .warning (
1915- "[CI Download] Process %d hit download race condition "
1957+ "[CI Download] Process %d hit download error "
19161958 "(attempt %d/%d) for %s: %s: %s" ,
19171959 os .getpid (),
19181960 attempt + 1 ,
@@ -1921,19 +1963,21 @@ def __init__(self, *args, **kwargs):
19211963 type (e ).__name__ ,
19221964 e ,
19231965 )
1924- _cleanup_incomplete_blobs (model_name_or_path , cache_dir )
19251966 if attempt < max_retries - 1 :
1926- backoff = 2 ** attempt
1967+ # Backoff: 10s, 20s, 40s. Clean only the stale
1968+ # .incomplete files (not active ones from other processes).
1969+ backoff = 10 * (2 ** attempt )
19271970 logger .info (
1928- "[CI Download] Retrying in %ds..." ,
1971+ "[CI Download] Cleaning up .incomplete files and "
1972+ "retrying in %ds..." ,
19291973 backoff ,
19301974 )
1975+ _cleanup_incomplete_blobs (model_name_or_path , cache_dir )
19311976 time .sleep (backoff )
19321977 continue
19331978 raise RuntimeError (
19341979 f"Download failed for { model_name_or_path } after "
1935- f"{ max_retries } attempts due to cross-container race "
1936- f"condition (.incomplete file conflicts). "
1980+ f"{ max_retries } attempts due to download errors. "
19371981 f"Last error: { type (e ).__name__ } : { e } "
19381982 ) from e
19391983
@@ -1963,16 +2007,6 @@ def __init__(self, *args, **kwargs):
19632007 # Should never reach here, but return hf_folder just in case
19642008 return hf_folder
19652009
1966- finally :
1967- # Always release the lock
1968- fcntl .flock (lock_file .fileno (), fcntl .LOCK_UN )
1969- lock_file .close ()
1970- logger .info (
1971- "[CI Download] Process %d RELEASED lock for %s" ,
1972- os .getpid (),
1973- model_name_or_path ,
1974- )
1975-
19762010
19772011def ci_validate_and_clean_hf_cache (model_path : str ) -> None :
19782012 """
0 commit comments