Skip to content

Commit 3dc044a

Browse files
authored
feature: compile optimum model in vLLM if not present (#384)
1 parent 3a6d56c commit 3dc044a

18 files changed

Lines changed: 815 additions & 216 deletions

File tree

tests/v1/core/conftest.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright 2025 Rebellions Inc. All rights reserved.
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at:
6+
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from unittest.mock import patch
16+
17+
import pytest
18+
19+
20+
@pytest.fixture(autouse=True)
def skip_prepare_compile():
    """Stub out the vLLM compile-preparation hook for every test in this package.

    Patching it prevents tests from accidentally triggering a real on-device
    model compilation, which is slow and requires RBLN hardware.
    """
    target = "vllm_rbln.utils.optimum.configuration.prepare_vllm_for_compile"
    with patch(target):
        yield

tests/v1/worker/conftest.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import os
1616
import shutil
17+
from unittest.mock import patch
1718

1819
import pytest
1920
import torch
@@ -28,3 +29,9 @@ def fresh_inductor_cache_per_test(monkeypatch):
2829
torch._dynamo.reset()
2930

3031
yield
32+
33+
34+
@pytest.fixture(autouse=True)
def skip_prepare_compile():
    """Stub out the vLLM compile-preparation hook for every test in this package.

    Patching it prevents tests from accidentally triggering a real on-device
    model compilation, which is slow and requires RBLN hardware.
    """
    target = "vllm_rbln.utils.optimum.configuration.prepare_vllm_for_compile"
    with patch(target):
        yield

vllm_rbln/model_executor/models/optimum/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050

5151
def load_model(vllm_config: VllmConfig) -> nn.Module:
5252
model_config = vllm_config.model_config
53-
53+
logger.info("Loading RBLN model from %s", model_config.model)
5454
if is_multi_modal(model_config.hf_config):
5555
assert vllm_config.cache_config.enable_prefix_caching in (False, None), (
5656
"Prefix caching is not supported with multimodal models. "

vllm_rbln/model_executor/models/optimum/model_base.py

Lines changed: 80 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,37 @@
2525
from vllm.v1.sample.metadata import SamplingMetadata
2626

2727
import optimum.rbln
28+
import vllm_rbln.rbln_envs as envs
2829
from optimum.rbln.transformers.models.decoderonly import (
2930
decoderonly_runtime_utils as runtime_utils,
3031
)
3132
from vllm_rbln.utils.optimum.common import select_bucket_size
32-
from vllm_rbln.utils.optimum.registry import get_rbln_model_info
33+
from vllm_rbln.utils.optimum.registry import compile_model, get_rbln_model_info
3334

3435
logger = init_logger(__name__)
3536

3637

38+
def get_attn_block_size(vllm_config: VllmConfig) -> int:
    """Return the effective attention block size for this configuration.

    With prefix caching enabled the attention block size is configured
    separately (under ``additional_config["attn_block_size"]``); otherwise
    it coincides with the KV-cache block size from the cache config.
    """
    if vllm_config.cache_config.enable_prefix_caching:
        return vllm_config.additional_config["attn_block_size"]
    return vllm_config.cache_config.block_size
44+
45+
46+
def generate_model_path_name(
    model_name: str,
    batch_size: int,
    block_size: int,
    max_model_len: int,
    tp_size: int,
) -> str:
    """Build a filesystem-safe cache-directory name for a compiled model.

    The name encodes every compile-time knob that changes the compiled
    artifact (batch size, block size, max sequence length, TP degree) so
    that different configurations land in different cache entries.
    """
    # FIXME: To avoid cache collisions, the cache key should also include
    # the versions of the compiler and optimum-rbln.
    # "/" and ":" appear in HF repo ids and revisions; both are unsafe in
    # a single path component, so map them to "_".
    safe_name = model_name.translate(str.maketrans("/:", "__"))
    suffix = f"bs{batch_size}_blk{block_size}_msl{max_model_len}_tp{tp_size}"
    return f"{safe_name}_{suffix}"
57+
58+
3759
class KVCacheBlockAdapter:
3860
"""
3961
KV cache block allocation behavior (v1 vs v0).
@@ -81,12 +103,7 @@ def _estimated_num_blocks(self) -> int:
81103
def is_full_block_available(self) -> bool:
82104
"""True if we can allocate a full batch worth of blocks."""
83105
estimated = self._estimated_num_blocks()
84-
85-
if self.vllm_config.cache_config.enable_prefix_caching:
86-
block_size = self.vllm_config.additional_config["attn_block_size"]
87-
88-
else:
89-
block_size = self.vllm_config.cache_config.block_size
106+
block_size = get_attn_block_size(self.vllm_config)
90107

91108
max_model_len = self.vllm_config.model_config.max_model_len
92109
max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs
@@ -145,39 +162,76 @@ def _resolve_kvcache_num_blocks(self) -> int:
145162
return int(self.scheduler_config.max_num_seqs)
146163

147164
def init_model(self) -> None:
165+
# Check if the model is already compiled and load it;
166+
# else compile the model and load it.
148167
config = self.model_config.hf_config
149-
model_name, model_cls_name = get_rbln_model_info(config)
150-
151168
if isinstance(self.model_config.model, str | Path) and os.path.exists(
152169
self.model_config.model
153170
):
154171
model_path = Path(self.model_config.model)
155172
if model_path.is_dir() and any(model_path.glob("rbln_config.json")):
156-
compiled_path = self.model_config.model
173+
is_compiled_model = True
157174
else:
158-
compiled_path = None
175+
is_compiled_model = False
159176
else:
160-
compiled_path = None
177+
is_compiled_model = False
161178

162-
if compiled_path is None or not os.path.exists(compiled_path):
163-
raise RuntimeError(f"Compiled model path does not exist: {compiled_path}")
164-
165-
# huggingface model class name
166-
logger.info(
167-
"model_name = %s, model_cls_name = %s, model_path = %s",
168-
model_name,
169-
model_cls_name,
170-
compiled_path,
171-
)
179+
model_name, model_cls_name = get_rbln_model_info(config)
180+
model = None
181+
182+
# If a HuggingFace model (not optimum-compiled) is given,
183+
# look up the cached compiled model.
184+
# If it does not exist, compile and save it to the cache for future use.
185+
if not is_compiled_model:
186+
model_path_name = generate_model_path_name(
187+
self.model_config.model,
188+
batch_size=self.scheduler_config.max_num_seqs,
189+
block_size=get_attn_block_size(self.vllm_config),
190+
max_model_len=self.model_config.max_model_len,
191+
tp_size=envs.VLLM_RBLN_TP_SIZE,
192+
)
193+
cached_model_path = os.path.join(
194+
envs.VLLM_CACHE_ROOT,
195+
"compiled_models/" + model_path_name,
196+
)
197+
if not os.path.exists(cached_model_path):
198+
logger.info(
199+
"Compiling the model %s. This may take a while...",
200+
self.model_config.model,
201+
)
202+
model = compile_model(
203+
self.model_config.model,
204+
config,
205+
batch_size=self.scheduler_config.max_num_seqs,
206+
block_size=get_attn_block_size(self.vllm_config),
207+
max_model_len=self.model_config.max_model_len,
208+
tp_size=envs.VLLM_RBLN_TP_SIZE,
209+
model_path=str(cached_model_path),
210+
)
211+
else:
212+
logger.info(
213+
"Found compiled model at %s. Loading the model from the path.",
214+
cached_model_path,
215+
)
216+
self.vllm_config.model_config.model = cached_model_path
217+
218+
# Load the model directly if it is either an optimum-compiled model
219+
# or a HuggingFace model that has already been compiled and cached.
220+
if model is None:
221+
model_cls = getattr(optimum.rbln, model_cls_name)
222+
assert model_cls is not None
223+
model = model_cls.from_pretrained(self.vllm_config.model_config.model)
224+
logger.info(
225+
"model_name = %s, model_cls_name = %s, model_path = %s",
226+
model_name,
227+
model_cls_name,
228+
self.vllm_config.model_config.model,
229+
)
172230

173231
self.supports_transcription_only = (
174232
model_cls_name == "RBLNOptimumWhisperForConditionalGeneration"
175233
)
176234

177-
# huggingface model class
178-
model_cls = getattr(optimum.rbln, model_cls_name)
179-
assert model_cls is not None
180-
model = model_cls.from_pretrained(compiled_path, export=False)
181235
self.model = model
182236
self.rbln_model_config = model.rbln_config
183237
self.attn_impl = (

vllm_rbln/platform.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
239239
)
240240

241241
assert vllm_config.parallel_config.tensor_parallel_size == 1, (
242-
"Tensor parallelism is set when compiled in optimum-rbln."
242+
"Cannot set tensor_parallel_size for pre-compiled optimum-rbln models. "
243+
"If you want to compile with tensor parallelism in vllm-rbln, "
244+
"please use the `VLLM_RBLN_TP_SIZE` environment variable instead."
243245
)
244246
assert vllm_config.parallel_config.pipeline_parallel_size == 1, (
245247
"Pipeline parallelism is not supported in optimum-rbln."
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# Copyright 2025 Rebellions Inc. All rights reserved.
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at:
6+
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""KV-cache block calculation and synchronisation helpers."""
16+
17+
import math
18+
from typing import TYPE_CHECKING
19+
20+
if TYPE_CHECKING:
21+
from vllm.config import VllmConfig
22+
else:
23+
VllmConfig = None
24+
25+
from vllm_rbln.logger import init_logger
26+
27+
logger = init_logger(__name__)
28+
29+
30+
def is_full_block_available(num_blocks: int, vllm_config: VllmConfig) -> bool:
    """Return True when *num_blocks* can hold a full batch of max-length seqs.

    The block size used for the estimate is the attention block size when
    prefix caching is enabled, the plain KV-cache block size otherwise.
    """
    cache_config = vllm_config.cache_config
    if cache_config.enable_prefix_caching:
        block_size = vllm_config.additional_config["attn_block_size"]
    else:
        block_size = cache_config.block_size

    # Blocks needed for one sequence at the model's maximum length,
    # multiplied by the maximum concurrent batch size.
    per_seq = math.ceil(vllm_config.model_config.max_model_len / block_size)
    required = vllm_config.scheduler_config.max_num_seqs * per_seq
    return num_blocks >= required
43+
44+
45+
def get_block_ratio(vllm_config: VllmConfig) -> int:
    """Return the outer/inner block-size ratio used for prefix caching.

    Without prefix caching there is a single block granularity, so the
    ratio is 1. With prefix caching, the attention ("outer") block size is
    an integer multiple of the cache ("inner") block size.
    """
    if not vllm_config.cache_config.enable_prefix_caching:
        return 1
    outer = vllm_config.additional_config["attn_block_size"]
    inner = vllm_config.cache_config.block_size
    return outer // inner
53+
54+
55+
def apply_prefix_caching_block_size(
    vllm_config: VllmConfig, kvcache_block_size: int, prefill_chunk_size: int
) -> None:
    """Configure block sizes for prefix caching and validate divisibility.

    Chooses the prefix block size (user-provided via
    ``additional_config["prefix_block_size"]``, else ``prefill_chunk_size``),
    validates that the sizes nest evenly, then writes the result back:
    ``cache_config.block_size`` becomes the prefix block size and
    ``additional_config["attn_block_size"]`` becomes ``kvcache_block_size``.

    Raises:
        ValueError: if a user-provided prefix_block_size is not a multiple
            of prefill_chunk_size, exceeds kvcache_block_size, or does not
            divide kvcache_block_size evenly.
    """
    assert prefill_chunk_size is not None, (
        "prefill_chunk_size must be specified in rbln_config.json"
    )
    # If user set prefix_block_size in additional_config, use it.
    # Otherwise, set it to prefill_chunk_size.
    prefix_block_size = vllm_config.additional_config.get("prefix_block_size", None)
    if prefix_block_size is None:
        prefix_block_size = prefill_chunk_size
        logger.debug(
            "Prefix block size is set to %s based on prefill_chunk_size",
            prefix_block_size,
        )
    else:
        # User-provided value: it must align with the prefill chunking and
        # fit inside one KV-cache block.
        if prefix_block_size % prefill_chunk_size != 0:
            raise ValueError(
                "prefix_block_size ({}) is not divisible "
                "by prefill_chunk_size ({}). "
                "Please check the value of prefill_chunk_size "
                "in rbln_config.json".format(prefix_block_size, prefill_chunk_size)
            )
        if prefix_block_size > kvcache_block_size:
            raise ValueError(
                "prefix_block_size ({}) is greater than "
                "kvcache_block_size ({}). "
                "Please check the value of kvcache_block_size "
                "in rbln_config.json".format(prefix_block_size, kvcache_block_size)
            )
        logger.debug(
            "Prefix block size is set to %s based on additional_config",
            prefix_block_size,
        )
    # In both branches the final sizes must nest evenly so a KV-cache block
    # is a whole number of prefix blocks.
    if kvcache_block_size % prefix_block_size != 0:
        raise ValueError(
            "kvcache_block_size ({}) is not divisible "
            "by prefix_block_size ({}). "
            "Please check the value of prefix_block_size in rbln_config.json".format(
                kvcache_block_size, prefix_block_size
            )
        )
    vllm_config.cache_config.block_size = prefix_block_size
    vllm_config.additional_config["attn_block_size"] = kvcache_block_size
99+
100+
101+
def sync_cache_block_size(
    vllm_config: VllmConfig, kvcache_block_size: int, prefill_chunk_size: int
) -> None:
    """Align vLLM's cache block size with the compiled model's rbln_config.

    With prefix caching enabled, defers to the prefix-caching setup path;
    otherwise it simply overwrites ``cache_config.block_size`` with the
    compiled model's block size when the two disagree.
    """
    if vllm_config.cache_config.enable_prefix_caching:
        apply_prefix_caching_block_size(
            vllm_config, kvcache_block_size, prefill_chunk_size
        )
        return

    current_block_size = vllm_config.cache_config.block_size
    if current_block_size != kvcache_block_size:
        logger.info(
            "Updating model_cache_config.block_size from %s to %s "
            "based on rbln_config.json",
            current_block_size,
            kvcache_block_size,
        )
        vllm_config.cache_config.block_size = kvcache_block_size
117+
118+
119+
def sync_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> None:
    """Write the adjusted KV-cache block count into the vLLM cache config.

    *num_blocks* comes from rbln_config unless the user set
    ``num_gpu_blocks_override``, in which case the override wins and is
    also recorded in ``additional_config["num_blocks_override"]``. The
    count is then rescaled by the outer/inner block ratio before being
    stored in ``cache_config.num_gpu_blocks``.
    """
    override = vllm_config.cache_config.num_gpu_blocks_override
    if override is not None:
        num_blocks = override
        vllm_config.additional_config["num_blocks_override"] = num_blocks

    blk_ratio = get_block_ratio(vllm_config)

    # When a full batch of max-length sequences fits, keep all blocks;
    # otherwise reserve one outer block's worth (hence the -1) before
    # rescaling. The trailing +1 is an extra block in both cases.
    if is_full_block_available(num_blocks, vllm_config):
        adjusted = num_blocks * blk_ratio + 1
    else:
        adjusted = (num_blocks - 1) * blk_ratio + 1

    vllm_config.cache_config.num_gpu_blocks = adjusted
    if override is not None:
        vllm_config.cache_config.num_gpu_blocks_override = adjusted

0 commit comments

Comments
 (0)