
Commit 386fbf4

core: migrating to V1 Engine (#51)
* feature: migrate v1 engine for torch.compile
1 parent bb5b768 · commit 386fbf4
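
For context, the V1 engine this commit migrates to is selected through vLLM's VLLM_USE_V1 environment variable, the same flag the diff reads as envs.VLLM_USE_V1. A minimal, hypothetical usage sketch (model name, prompt, and sampling settings are illustrative only, not part of the commit):

# Illustrative only: exercising the V1 engine path with the RBLN plugin.
import os

os.environ["VLLM_USE_V1"] = "1"  # opt into the vLLM V1 engine

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # hypothetical model choice
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)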

File tree

11 files changed, +3542 -10 lines


vllm_rbln/attention/backends/flash_attention.py

Lines changed: 1 addition & 1 deletion
@@ -421,7 +421,7 @@ def forward(
             kv_cache shape= [2, num_blocks,
                 block_size * num_kv_heads * head_size]
-        TODO:
+        Shape that we expect:
             kv_cache = [2][num_blocks, num_kv_heads, 1, block_size, head_size]
             key = [1, num_kv_heads, 1, block_size, head_size]
             query = [1, num_kv_heads, 4, query_len, head_size]
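
The updated docstring spells out the tensor layouts the RBLN backend expects. As a quick, illustrative sketch of those shapes (the concrete sizes below are made up and are not part of the commit):

# Shape sketch for the layouts named in the updated docstring above.
# num_blocks, block_size, num_kv_heads, head_size and query_len are
# arbitrary illustrative values, not taken from the plugin.
import torch

num_blocks, block_size = 128, 16
num_kv_heads, head_size = 8, 64
query_len = 4

# kv_cache = [2][num_blocks, num_kv_heads, 1, block_size, head_size]
kv_cache = [
    torch.zeros(num_blocks, num_kv_heads, 1, block_size, head_size)
    for _ in range(2)  # index 0: key cache, index 1: value cache
]
# key   = [1, num_kv_heads, 1, block_size, head_size]
key = torch.zeros(1, num_kv_heads, 1, block_size, head_size)
# query = [1, num_kv_heads, 4, query_len, head_size]
query = torch.zeros(1, num_kv_heads, 4, query_len, head_size)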

vllm_rbln/platform.py

Lines changed: 37 additions & 9 deletions
@@ -166,11 +166,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         parallel_config = vllm_config.parallel_config
         scheduler_config = vllm_config.scheduler_config
         if cls.is_torch_compile:
-            if parallel_config.worker_cls == "auto":
-                parallel_config.worker_cls = \
-                    "vllm_rbln.worker.worker.RBLNWorker"
-            scheduler_config.scheduler_cls = \
-                "vllm_rbln.core.scheduler.RBLNScheduler"
+            if envs.VLLM_USE_V1:
+                if parallel_config.worker_cls == "auto":
+                    parallel_config.worker_cls = (
+                        "vllm_rbln.v1.worker.rbln_worker.RBLNWorker")
+                scheduler_config.scheduler_cls = (
+                    "vllm_rbln.v1.core.rbln_scheduler.RBLNScheduler")
+            else:
+                if parallel_config.worker_cls == "auto":
+                    parallel_config.worker_cls = (
+                        "vllm_rbln.worker.worker.RBLNWorker")
+                scheduler_config.scheduler_cls = (
+                    "vllm_rbln.core.scheduler.RBLNScheduler")
         else:
             if envs.VLLM_USE_V1:
                 if parallel_config.worker_cls == "auto":
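
This hunk routes the torch.compile platform to either the V1 or the legacy worker and scheduler classes depending on envs.VLLM_USE_V1. A standalone sketch of that selection logic (simplified; the helper function below is hypothetical and not part of the plugin):

# Hypothetical helper mirroring the branch added in the hunk above.
def select_worker_and_scheduler(is_torch_compile: bool, use_v1: bool):
    if is_torch_compile and use_v1:
        return ("vllm_rbln.v1.worker.rbln_worker.RBLNWorker",
                "vllm_rbln.v1.core.rbln_scheduler.RBLNScheduler")
    if is_torch_compile:
        return ("vllm_rbln.worker.worker.RBLNWorker",
                "vllm_rbln.core.scheduler.RBLNScheduler")
    # The non-torch.compile branch keeps its existing V0/V1 selection,
    # which this hunk does not change.
    return None

print(select_worker_and_scheduler(is_torch_compile=True, use_v1=True))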
@@ -204,6 +211,24 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 "block_size must be configured for RBLN backend")
         cache_config.enable_prefix_caching = False

+        if envs.VLLM_USE_V1 and cls.is_torch_compile:
+            from vllm.config import CompilationLevel
+
+            if (vllm_config.compilation_config.level
+                    != CompilationLevel.NO_COMPILATION):
+                logger.info("RBLN doesn't @support_torch_compile decorator")
+                vllm_config.compilation_config.level = (
+                    CompilationLevel.NO_COMPILATION)
+            if (len(vllm_config.compilation_config.custom_ops) == 1
+                    and vllm_config.compilation_config.custom_ops[0]
+                    == "none"):
+                vllm_config.compilation_config.custom_ops = []
+
+            if not model_config.disable_cascade_attn:
+                logger.info("The cascade attention is disabled"
+                            " because RBLN does not support it")
+                model_config.disable_cascade_attn = True
+
     @classmethod
     def get_attn_backend_cls(
         cls,
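
When both VLLM_USE_V1 and torch.compile are active, this new block pins the compilation level to NO_COMPILATION, drops a lone "none" entry from custom_ops, and disables cascade attention. A simplified before/after using stand-in objects (the dataclasses below are placeholders, not vLLM's real CompilationConfig or ModelConfig):

# Stand-in config objects; field names follow the diff, default values are
# arbitrary illustrations.
from dataclasses import dataclass, field

NO_COMPILATION = 0  # assumption: mirrors vllm.config.CompilationLevel.NO_COMPILATION

@dataclass
class FakeCompilationConfig:
    level: int = 3
    custom_ops: list = field(default_factory=lambda: ["none"])

@dataclass
class FakeModelConfig:
    disable_cascade_attn: bool = False

comp, model = FakeCompilationConfig(), FakeModelConfig()

# The same normalization the commit applies for V1 + torch.compile:
if comp.level != NO_COMPILATION:
    comp.level = NO_COMPILATION        # RBLN skips @support_torch_compile
if comp.custom_ops == ["none"]:
    comp.custom_ops = []               # drop the lone "none" entry
if not model.disable_cascade_attn:
    model.disable_cascade_attn = True  # cascade attention is unsupported

print(comp, model)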
@@ -215,13 +240,16 @@ def get_attn_backend_cls(
         use_v1: bool,
         use_mla: bool,
     ) -> str:
-        attn_backend_cls = (
-            "vllm_rbln.attention.backends.flash_attention.RBLNAttentionBackend"
-        )
+        if envs.VLLM_USE_V1:
+            attn_backend_cls = ("vllm_rbln.v1.attention.backends."
+                                "flash_attention.RBLNAttentionBackend")
+        else:
+            attn_backend_cls = ("vllm_rbln.attention.backends."
+                                "flash_attention.RBLNAttentionBackend")
         logger.info("Using RBLN Attention Backend: %s", attn_backend_cls)

         return attn_backend_cls

     @classmethod
     def supports_v1(cls, model_config: "ModelConfig") -> bool:
-        return not cls.is_torch_compile
+        return True
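
get_attn_backend_cls now returns a dotted class path whose only difference is the vllm_rbln.v1 prefix, and supports_v1 reports True even on the torch.compile platform. For reference, a generic sketch of how such a dotted path can be resolved into a class (vLLM has its own resolver; the helper below is hypothetical and only illustrates the idea):

# Hypothetical resolver for a dotted backend path such as the ones above.
import importlib

def resolve_class(dotted_path: str):
    module_name, _, class_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(module_name), class_name)

# Would import the V1 backend added by this commit; requires vllm_rbln to
# be installed, so it is left commented out here.
# backend_cls = resolve_class(
#     "vllm_rbln.v1.attention.backends.flash_attention.RBLNAttentionBackend")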

vllm_rbln/v1/__init__.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

vllm_rbln/v1/attention/__init__.py

Whitespace-only changes.

vllm_rbln/v1/attention/backends/__init__.py

Whitespace-only changes.
