1212import warnings
1313from argparse import Namespace
1414from collections .abc import AsyncIterator
15- from contextlib import asynccontextmanager , suppress
15+ from contextlib import asynccontextmanager
1616from typing import Any
1717
1818import uvloop
# Fallback task set: "generate" is the baseline task every engine supports.
# NOTE(review): consumers of this constant are not visible in this chunk --
# presumably used when the model config declares no supported tasks; confirm.
_FALLBACK_SUPPORTED_TASKS: tuple[SupportedTask, ...] = ("generate",)

7676
77- def _startup_prefetch_weights (vllm_config : "VllmConfig" ) -> None :
78- """Kick off reading model weight shards into OS page cache from the
79- parent APIServer. EngineCore will read the same files a few seconds
80- later from the child; by then the kernel already has them ready.
81-
82- All work (directory resolution, HF/ModelScope cache lookup, globbing,
83- and the reads themselves) runs inside the background thread so we do
84- not block the asyncio event loop.
85-
86- Best-effort: any failure (unknown model location, permission, etc.) is
87- swallowed — vLLM's existing in-child prefetch then runs normally.
88- """
89- import threading
90-
91- # Capture only the small scalar fields the thread needs. Avoid holding
92- # a reference to vllm_config (which contains unpicklable objects) for
93- # longer than necessary.
94- model_ref = vllm_config .model_config .model
95- revision = vllm_config .model_config .revision
96- download_dir = vllm_config .load_config .download_dir
97-
98- def _prefetch_worker () -> None :
99- import glob
100- import os
101-
102- from vllm import envs
103-
104- candidate_dir : str | None = None
105-
106- # 1. Local path?
107- if os .path .isdir (model_ref ):
108- candidate_dir = model_ref
109- else :
110- # 2. HF / ModelScope repo id — resolve to the local cache
111- # snapshot dir using the same revision / cache_dir the weight
112- # loader will use, so we prefetch the right files.
113- try :
114- if envs .VLLM_USE_MODELSCOPE :
115- from modelscope .hub .snapshot_download import (
116- snapshot_download ,
117- )
118-
119- candidate_dir = snapshot_download (
120- model_id = model_ref ,
121- revision = revision ,
122- cache_dir = download_dir ,
123- local_files_only = True ,
124- )
125- else :
126- from huggingface_hub import snapshot_download
127-
128- candidate_dir = snapshot_download (
129- repo_id = model_ref ,
130- revision = revision ,
131- cache_dir = download_dir ,
132- allow_patterns = [
133- "*.safetensors" ,
134- "*.bin" ,
135- "*.json" ,
136- "*tokenizer*" ,
137- ],
138- local_files_only = True ,
139- )
140- except Exception :
141- return # not cached yet or not a known repo id
142-
143- if not candidate_dir or not os .path .isdir (candidate_dir ):
144- return
145-
146- # Weight shards: large files, read into page cache.
147- shard_paths = sorted (
148- glob .glob (os .path .join (candidate_dir , "*.safetensors" ))
149- + glob .glob (os .path .join (candidate_dir , "*.bin" ))
150- )
151- # Tokenizer/config sidecars: small, but re-opened in the child and
152- # add synchronous open+read latency when the disk is cold.
153- sidecar_paths = sorted (
154- glob .glob (os .path .join (candidate_dir , "*.json" ))
155- + glob .glob (os .path .join (candidate_dir , "tokenizer.model" ))
156- + glob .glob (os .path .join (candidate_dir , "*tokenizer*" ))
157- )
158- shard_paths .extend (sidecar_paths )
159- if not shard_paths :
160- return
161-
162- logger .debug (
163- "Parent-side weight prefetch starting for %d files in %s" ,
164- len (shard_paths ),
165- candidate_dir ,
166- )
167-
168- # Match vLLM's in-child prefetch block size + thread count.
169- block_size = 16 * 1024 * 1024 # 16 MB
170- # Read shards in parallel across 8 worker threads (bounded) to
171- # saturate multi-spindle / multi-queue storage without thrashing.
172- from concurrent .futures import ThreadPoolExecutor
173-
174- def read_one (p : str ) -> None :
175- try :
176- with open (p , "rb" ) as f :
177- while f .read (block_size ):
178- pass
179- except Exception :
180- pass
181-
182- with ThreadPoolExecutor (max_workers = 8 ) as pool :
183- list (pool .map (read_one , shard_paths ))
184-
185- threading .Thread (
186- target = _prefetch_worker ,
187- daemon = True ,
188- name = "vllm-parent-weight-prefetch" ,
189- ).start ()
190-
191-
19277@asynccontextmanager
19378async def build_async_engine_client (
19479 args : Namespace ,
@@ -200,10 +85,7 @@ async def build_async_engine_client(
20085 # The executor is expected to be mp.
20186 # Pre-import heavy modules in the forkserver process
20287 logger .debug ("Setup forkserver with pre-imports" )
203- # May already have been set by the CLI entry's async prewarm
204- # (vllm/entrypoints/cli/main.py); tolerate re-call.
205- with suppress (RuntimeError ):
206- multiprocessing .set_start_method ("forkserver" , force = False )
88+ multiprocessing .set_start_method ("forkserver" )
20789 multiprocessing .set_forkserver_preload (["vllm.v1.engine.async_llm" ])
20890 forkserver .ensure_running ()
20991 logger .debug ("Forkserver setup complete!" )
@@ -241,28 +123,6 @@ async def build_async_engine_client_from_engine_args(
241123 # Create the EngineConfig (determines if we can use V1).
242124 vllm_config = engine_args .create_engine_config (usage_context = usage_context )
243125
244- # [startup] Start prefetching model weight shards into the OS page cache
245- # in a background thread from the PARENT APIServer process. EngineCore
246- # will page-fault on these same files ~10-15 s later (after fork + CUDA
247- # context + distributed init + model init). For large-weight cases
248- # (tens of GB) this parent-side head start meaningfully shrinks the
249- # prefetch+load phase that the engine's in-child prefetch otherwise
250- # barely overlaps.
251- #
252- # Skip in API-only workers that connect to an already-running EngineCore
253- # (multi-API-server / disaggregated setups): those processes never load
254- # weights, and if we prefetched from all of them we'd contend with the
255- # engine's own read. Presence of an `input_address` in client_config is
256- # the current marker that this worker is headless.
257- #
258- # Best-effort: if the model is a local path, glob for safetensors; if
259- # it's a repo-id, try to resolve via HF hub (or ModelScope) local cache.
260- # Any failure silently falls through to the existing in-child prefetch
261- # path. All I/O (incl. directory resolution) runs inside the BG thread
262- # so the asyncio event loop is never blocked.
263- if not (client_config and client_config .get ("input_address" )):
264- _startup_prefetch_weights (vllm_config )
265-
266126 from vllm .v1 .engine .async_llm import AsyncLLM
267127
268128 async_llm : AsyncLLM | None = None
0 commit comments