|
25 | 25 | from vllm.v1.sample.metadata import SamplingMetadata |
26 | 26 |
|
27 | 27 | import optimum.rbln |
| 28 | +import vllm_rbln.rbln_envs as envs |
28 | 29 | from optimum.rbln.transformers.models.decoderonly import ( |
29 | 30 | decoderonly_runtime_utils as runtime_utils, |
30 | 31 | ) |
31 | 32 | from vllm_rbln.utils.optimum.common import select_bucket_size |
32 | | -from vllm_rbln.utils.optimum.registry import get_rbln_model_info |
| 33 | +from vllm_rbln.utils.optimum.registry import compile_model, get_rbln_model_info |
33 | 34 |
|
34 | 35 | logger = init_logger(__name__) |
35 | 36 |
|
36 | 37 |
|
def get_attn_block_size(vllm_config: VllmConfig) -> int:
    """Return the attention block size for the current configuration.

    When prefix caching is enabled, the size is taken from
    ``additional_config["attn_block_size"]``; otherwise the regular
    cache-config block size is used.
    """
    cache_config = vllm_config.cache_config
    if not cache_config.enable_prefix_caching:
        return cache_config.block_size
    # Prefix caching requires an explicitly configured attention block size.
    return vllm_config.additional_config["attn_block_size"]
| 44 | + |
| 45 | + |
def generate_model_path_name(
    model_name: str,
    batch_size: int,
    block_size: int,
    max_model_len: int,
    tp_size: int,
) -> str:
    """Build a filesystem-safe cache-directory name encoding the compile params.

    ``/`` and ``:`` in the model name are replaced with ``_`` so the result
    can be used as a single path component.
    """
    # FIXME: To avoid cache collisions, the cache key should also include
    # the versions of the compiler and optimum-rbln.
    safe_name = model_name.replace("/", "_").replace(":", "_")
    suffix = f"bs{batch_size}_blk{block_size}_msl{max_model_len}_tp{tp_size}"
    return f"{safe_name}_{suffix}"
| 58 | + |
37 | 59 | class KVCacheBlockAdapter: |
38 | 60 | """ |
39 | 61 | KV cache block allocation behavior (v1 vs v0). |
@@ -81,12 +103,7 @@ def _estimated_num_blocks(self) -> int: |
81 | 103 | def is_full_block_available(self) -> bool: |
82 | 104 | """True if we can allocate a full batch worth of blocks.""" |
83 | 105 | estimated = self._estimated_num_blocks() |
84 | | - |
85 | | - if self.vllm_config.cache_config.enable_prefix_caching: |
86 | | - block_size = self.vllm_config.additional_config["attn_block_size"] |
87 | | - |
88 | | - else: |
89 | | - block_size = self.vllm_config.cache_config.block_size |
| 106 | + block_size = get_attn_block_size(self.vllm_config) |
90 | 107 |
|
91 | 108 | max_model_len = self.vllm_config.model_config.max_model_len |
92 | 109 | max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs |
@@ -145,39 +162,76 @@ def _resolve_kvcache_num_blocks(self) -> int: |
145 | 162 | return int(self.scheduler_config.max_num_seqs) |
146 | 163 |
|
147 | 164 | def init_model(self) -> None: |
| 165 | + # Check if the model is already compiled and load it; |
| 166 | + # else compile the model and load it. |
148 | 167 | config = self.model_config.hf_config |
149 | | - model_name, model_cls_name = get_rbln_model_info(config) |
150 | | - |
151 | 168 | if isinstance(self.model_config.model, str | Path) and os.path.exists( |
152 | 169 | self.model_config.model |
153 | 170 | ): |
154 | 171 | model_path = Path(self.model_config.model) |
155 | 172 | if model_path.is_dir() and any(model_path.glob("rbln_config.json")): |
156 | | - compiled_path = self.model_config.model |
| 173 | + is_compiled_model = True |
157 | 174 | else: |
158 | | - compiled_path = None |
| 175 | + is_compiled_model = False |
159 | 176 | else: |
160 | | - compiled_path = None |
| 177 | + is_compiled_model = False |
161 | 178 |
|
162 | | - if compiled_path is None or not os.path.exists(compiled_path): |
163 | | - raise RuntimeError(f"Compiled model path does not exist: {compiled_path}") |
164 | | - |
165 | | - # huggingface model class name |
166 | | - logger.info( |
167 | | - "model_name = %s, model_cls_name = %s, model_path = %s", |
168 | | - model_name, |
169 | | - model_cls_name, |
170 | | - compiled_path, |
171 | | - ) |
| 179 | + model_name, model_cls_name = get_rbln_model_info(config) |
| 180 | + model = None |
| 181 | + |
| 182 | + # If a HuggingFace model (not optimum-compiled) is given, |
| 183 | + # look up the cached compiled model. |
| 184 | + # If it does not exist, compile and save it to the cache for future use. |
| 185 | + if not is_compiled_model: |
| 186 | + model_path_name = generate_model_path_name( |
| 187 | + self.model_config.model, |
| 188 | + batch_size=self.scheduler_config.max_num_seqs, |
| 189 | + block_size=get_attn_block_size(self.vllm_config), |
| 190 | + max_model_len=self.model_config.max_model_len, |
| 191 | + tp_size=envs.VLLM_RBLN_TP_SIZE, |
| 192 | + ) |
| 193 | + cached_model_path = os.path.join( |
| 194 | + envs.VLLM_CACHE_ROOT, |
| 195 | + "compiled_models/" + model_path_name, |
| 196 | + ) |
| 197 | + if not os.path.exists(cached_model_path): |
| 198 | + logger.info( |
| 199 | + "Compiling the model %s. This may take a while...", |
| 200 | + self.model_config.model, |
| 201 | + ) |
| 202 | + model = compile_model( |
| 203 | + self.model_config.model, |
| 204 | + config, |
| 205 | + batch_size=self.scheduler_config.max_num_seqs, |
| 206 | + block_size=get_attn_block_size(self.vllm_config), |
| 207 | + max_model_len=self.model_config.max_model_len, |
| 208 | + tp_size=envs.VLLM_RBLN_TP_SIZE, |
| 209 | + model_path=str(cached_model_path), |
| 210 | + ) |
| 211 | + else: |
| 212 | + logger.info( |
| 213 | + "Found compiled model at %s. Loading the model from the path.", |
| 214 | + cached_model_path, |
| 215 | + ) |
| 216 | + self.vllm_config.model_config.model = cached_model_path |
| 217 | + |
| 218 | + # Load the model directly if it is either an optimum-compiled model |
| 219 | + # or a HuggingFace model that has already been compiled and cached. |
| 220 | + if model is None: |
| 221 | + model_cls = getattr(optimum.rbln, model_cls_name) |
| 222 | + assert model_cls is not None |
| 223 | + model = model_cls.from_pretrained(self.vllm_config.model_config.model) |
| 224 | + logger.info( |
| 225 | + "model_name = %s, model_cls_name = %s, model_path = %s", |
| 226 | + model_name, |
| 227 | + model_cls_name, |
| 228 | + self.vllm_config.model_config.model, |
| 229 | + ) |
172 | 230 |
|
173 | 231 | self.supports_transcription_only = ( |
174 | 232 | model_cls_name == "RBLNOptimumWhisperForConditionalGeneration" |
175 | 233 | ) |
176 | 234 |
|
177 | | - # huggingface model class |
178 | | - model_cls = getattr(optimum.rbln, model_cls_name) |
179 | | - assert model_cls is not None |
180 | | - model = model_cls.from_pretrained(compiled_path, export=False) |
181 | 235 | self.model = model |
182 | 236 | self.rbln_model_config = model.rbln_config |
183 | 237 | self.attn_impl = ( |
|
0 commit comments