
Commit 386fbf4

core: migrating to V1 Engine (#51)
* feature: migrate v1 engine for torch.compile
1 parent bb5b768 · commit 386fbf4
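
For context, the V1 engine this commit migrates to is selected through vLLM's VLLM_USE_V1 environment variable, the same flag the diff reads as envs.VLLM_USE_V1. A minimal, hypothetical usage sketch (model name, prompt, and sampling settings are illustrative only, not part of the commit):

# Illustrative only: exercising the V1 engine path with the RBLN plugin.
import os

os.environ["VLLM_USE_V1"] = "1"  # opt into the vLLM V1 engine

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # hypothetical model choice
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)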

File tree

11 files changed, +3542 -10 lines


vllm_rbln/attention/backends/flash_attention.py

Lines changed: 1 addition & 1 deletion
@@ -421,7 +421,7 @@ def forward(
             kv_cache shape= [2, num_blocks,
                 block_size * num_kv_heads * head_size]
-        TODO:
+        Shape that we expect:
             kv_cache = [2][num_blocks, num_kv_heads, 1, block_size, head_size]
             key = [1, num_kv_heads, 1, block_size, head_size]
             query = [1, num_kv_heads, 4, query_len, head_size]
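
The updated docstring spells out the tensor layouts the RBLN backend expects. As a quick, illustrative sketch of those shapes (the concrete sizes below are made up and are not part of the commit):

# Shape sketch for the layouts named in the updated docstring above.
# num_blocks, block_size, num_kv_heads, head_size and query_len are
# arbitrary illustrative values, not taken from the plugin.
import torch

num_blocks, block_size = 128, 16
num_kv_heads, head_size = 8, 64
query_len = 4

# kv_cache = [2][num_blocks, num_kv_heads, 1, block_size, head_size]
kv_cache = [
    torch.zeros(num_blocks, num_kv_heads, 1, block_size, head_size)
    for _ in range(2)  # index 0: key cache, index 1: value cache
]
# key   = [1, num_kv_heads, 1, block_size, head_size]
key = torch.zeros(1, num_kv_heads, 1, block_size, head_size)
# query = [1, num_kv_heads, 4, query_len, head_size]
query = torch.zeros(1, num_kv_heads, 4, query_len, head_size)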

vllm_rbln/platform.py

Lines changed: 37 additions & 9 deletions
@@ -166,11 +166,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         parallel_config = vllm_config.parallel_config
         scheduler_config = vllm_config.scheduler_config
         if cls.is_torch_compile:
-            if parallel_config.worker_cls == "auto":
-                parallel_config.worker_cls = \
-                    "vllm_rbln.worker.worker.RBLNWorker"
-            scheduler_config.scheduler_cls = \
-                "vllm_rbln.core.scheduler.RBLNScheduler"
+            if envs.VLLM_USE_V1:
+                if parallel_config.worker_cls == "auto":
+                    parallel_config.worker_cls = (
+                        "vllm_rbln.v1.worker.rbln_worker.RBLNWorker")
+                scheduler_config.scheduler_cls = (
+                    "vllm_rbln.v1.core.rbln_scheduler.RBLNScheduler")
+            else:
+                if parallel_config.worker_cls == "auto":
+                    parallel_config.worker_cls = (
+                        "vllm_rbln.worker.worker.RBLNWorker")
+                scheduler_config.scheduler_cls = (
+                    "vllm_rbln.core.scheduler.RBLNScheduler")
         else:
             if envs.VLLM_USE_V1:
                 if parallel_config.worker_cls == "auto":
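
This hunk routes the torch.compile platform to either the V1 or the legacy worker and scheduler classes depending on envs.VLLM_USE_V1. A standalone sketch of that selection logic (simplified; the helper function below is hypothetical and not part of the plugin):

# Hypothetical helper mirroring the branch added in the hunk above.
def select_worker_and_scheduler(is_torch_compile: bool, use_v1: bool):
    if is_torch_compile and use_v1:
        return ("vllm_rbln.v1.worker.rbln_worker.RBLNWorker",
                "vllm_rbln.v1.core.rbln_scheduler.RBLNScheduler")
    if is_torch_compile:
        return ("vllm_rbln.worker.worker.RBLNWorker",
                "vllm_rbln.core.scheduler.RBLNScheduler")
    # The non-torch.compile branch keeps its existing V0/V1 selection,
    # which this hunk does not change.
    return None

print(select_worker_and_scheduler(is_torch_compile=True, use_v1=True))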
@@ -204,6 +211,24 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 "block_size must be configured for RBLN backend")
         cache_config.enable_prefix_caching = False

+        if envs.VLLM_USE_V1 and cls.is_torch_compile:
+            from vllm.config import CompilationLevel
+
+            if (vllm_config.compilation_config.level
+                    != CompilationLevel.NO_COMPILATION):
+                logger.info("RBLN doesn't @support_torch_compile decorator")
+                vllm_config.compilation_config.level = (
+                    CompilationLevel.NO_COMPILATION)
+            if (len(vllm_config.compilation_config.custom_ops) == 1
+                    and vllm_config.compilation_config.custom_ops[0]
+                    == "none"):
+                vllm_config.compilation_config.custom_ops = []
+
+            if not model_config.disable_cascade_attn:
+                logger.info("The cascade attention is disabled"
+                            " because RBLN does not support it")
+                model_config.disable_cascade_attn = True
+
     @classmethod
     def get_attn_backend_cls(
         cls,
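
When both VLLM_USE_V1 and torch.compile are active, this new block pins the compilation level to NO_COMPILATION, drops a lone "none" entry from custom_ops, and disables cascade attention. A simplified before/after using stand-in objects (the dataclasses below are placeholders, not vLLM's real CompilationConfig or ModelConfig):

# Stand-in config objects; field names follow the diff, default values are
# arbitrary illustrations.
from dataclasses import dataclass, field

NO_COMPILATION = 0  # assumption: mirrors vllm.config.CompilationLevel.NO_COMPILATION

@dataclass
class FakeCompilationConfig:
    level: int = 3
    custom_ops: list = field(default_factory=lambda: ["none"])

@dataclass
class FakeModelConfig:
    disable_cascade_attn: bool = False

comp, model = FakeCompilationConfig(), FakeModelConfig()

# The same normalization the commit applies for V1 + torch.compile:
if comp.level != NO_COMPILATION:
    comp.level = NO_COMPILATION        # RBLN skips @support_torch_compile
if comp.custom_ops == ["none"]:
    comp.custom_ops = []               # drop the lone "none" entry
if not model.disable_cascade_attn:
    model.disable_cascade_attn = True  # cascade attention is unsupported

print(comp, model)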
@@ -215,13 +240,16 @@ def get_attn_backend_cls(
         use_v1: bool,
         use_mla: bool,
     ) -> str:
-        attn_backend_cls = (
-            "vllm_rbln.attention.backends.flash_attention.RBLNAttentionBackend"
-        )
+        if envs.VLLM_USE_V1:
+            attn_backend_cls = ("vllm_rbln.v1.attention.backends."
+                                "flash_attention.RBLNAttentionBackend")
+        else:
+            attn_backend_cls = ("vllm_rbln.attention.backends."
+                                "flash_attention.RBLNAttentionBackend")
         logger.info("Using RBLN Attention Backend: %s", attn_backend_cls)

         return attn_backend_cls

     @classmethod
     def supports_v1(cls, model_config: "ModelConfig") -> bool:
-        return not cls.is_torch_compile
+        return True
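
get_attn_backend_cls now returns a dotted class path whose only difference is the vllm_rbln.v1 prefix, and supports_v1 reports True even on the torch.compile platform. For reference, a generic sketch of how such a dotted path can be resolved into a class (vLLM has its own resolver; the helper below is hypothetical and only illustrates the idea):

# Hypothetical resolver for a dotted backend path such as the ones above.
import importlib

def resolve_class(dotted_path: str):
    module_name, _, class_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(module_name), class_name)

# Would import the V1 backend added by this commit; requires vllm_rbln to
# be installed, so it is left commented out here.
# backend_cls = resolve_class(
#     "vllm_rbln.v1.attention.backends.flash_attention.RBLNAttentionBackend")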

vllm_rbln/v1/__init__.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

vllm_rbln/v1/attention/__init__.py

Whitespace-only changes.

vllm_rbln/v1/attention/backends/__init__.py

Whitespace-only changes.
