RBLN-SW · rebel-jinhwan · Jan 28, 2026 · Jan 28, 2026 · Feb 5, 2026 · Feb 5, 2026
diff --git a/examples/experimental/offline_inference_basic.py b/examples/experimental/offline_inference_basic.py
@@ -50,7 +50,7 @@ def main():
         block_size=args.block_size,
         enable_chunked_prefill=True,
         max_num_batched_tokens=128,
-        gpu_memory_utilization=1,
+        gpu_memory_utilization=0.9,
         enable_expert_parallel=args.enable_expert_parallel,
     )
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -124,6 +124,11 @@ ignore = [
     "UP007",
 ]
 
+[tool.pytest.ini_options]
+markers = [
+    "cpu_test: mark test as CPU-only test",
+]
+
 [tool.mypy]
 ignore_missing_imports = true
 check_untyped_defs = true

diff --git a/tests/torch_compile/common/test_logger.py b/tests/torch_compile/common/test_logger.py
diff --git a/tests/torch_compile/common/test_platform.py b/tests/torch_compile/common/test_platform.py
diff --git a/tests/torch_compile/common/test_rbln_envs.py b/tests/torch_compile/common/test_rbln_envs.py
diff --git a/tests/torch_compile/conftest.py b/tests/torch_compile/conftest.py
@@ -13,8 +13,13 @@
 # limitations under the License.
 
 import pytest
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VllmConfig)
+from vllm.config import (
+    CacheConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
 from vllm.plugins import load_general_plugins
 
 
@@ -29,7 +34,7 @@ def initialize_environment():
 
 @pytest.fixture
 def vllm_config():
-    scheduler_config = SchedulerConfig()
+    scheduler_config = SchedulerConfig.default_factory()
     model_config = ModelConfig(model="facebook/opt-125m")
     cache_config = CacheConfig(
         block_size=1024,

diff --git a/tests/torch_compile/models/__init__.py → tests/torch_compile/integrations/__init__.py b/tests/torch_compile/models/__init__.py → tests/torch_compile/integrations/__init__.py
diff --git a/tests/torch_compile/v1/__init__.py → ...h_compile/integrations/models/__init__.py b/tests/torch_compile/v1/__init__.py → ...h_compile/integrations/models/__init__.py
diff --git a/...e/models/test_basic_models_correctness.py → ...s/models/test_basic_models_correctness.py b/...e/models/test_basic_models_correctness.py → ...s/models/test_basic_models_correctness.py
diff --git a/...pile/models/test_model_coverage_single.py → ...ions/models/test_model_coverage_single.py b/...pile/models/test_model_coverage_single.py → ...ions/models/test_model_coverage_single.py
diff --git a/...ompile/models/test_pooling_model_embed.py → ...ations/models/test_pooling_model_embed.py b/...ompile/models/test_pooling_model_embed.py → ...ations/models/test_pooling_model_embed.py
diff --git a/...ompile/models/test_pooling_model_score.py → ...ations/models/test_pooling_model_score.py b/...ompile/models/test_pooling_model_score.py → ...ations/models/test_pooling_model_score.py
diff --git a/tests/torch_compile/models/utils.py → ...orch_compile/integrations/models/utils.py b/tests/torch_compile/models/utils.py → ...orch_compile/integrations/models/utils.py
diff --git a/tests/torch_compile/v1/core/__init__.py → ...torch_compile/integrations/v1/__init__.py b/tests/torch_compile/v1/core/__init__.py → ...torch_compile/integrations/v1/__init__.py
diff --git a/tests/torch_compile/v1/lora/__init__.py → ..._compile/integrations/v1/core/__init__.py b/tests/torch_compile/v1/lora/__init__.py → ..._compile/integrations/v1/core/__init__.py
diff --git a/...ch_compile/v1/core/test_prefix_caching.py → ...tegrations/v1/core/test_prefix_caching.py b/...ch_compile/v1/core/test_prefix_caching.py → ...tegrations/v1/core/test_prefix_caching.py
diff --git a/...s/torch_compile/v1/core/test_scheduler.py → ...le/integrations/v1/core/test_scheduler.py b/...s/torch_compile/v1/core/test_scheduler.py → ...le/integrations/v1/core/test_scheduler.py
@@ -14,9 +14,11 @@
 
 from vllm.v1.request import RequestStatus
 
-from tests.torch_compile.v1.core.utils import (create_requests,
-                                               create_runner_output,
-                                               create_scheduler)
+from tests.torch_compile.integrations.v1.core.utils import (
+    create_requests,
+    create_runner_output,
+    create_scheduler,
+)
 
 
 def test_schedule():
@@ -50,8 +52,9 @@ def test_schedule():
     output = scheduler.schedule()
     assert output.scheduled_cached_reqs.num_reqs == len(requests)
     assert len(output.num_scheduled_tokens) == len(requests)
-    assert all(num_tokens == 1
-               for num_tokens in output.num_scheduled_tokens.values())
+    assert all(
+        num_tokens == 1 for num_tokens in output.num_scheduled_tokens.values()
+    )
     assert len(output.finished_req_ids) == 0
 
 

diff --git a/tests/torch_compile/v1/core/utils.py → ...rch_compile/integrations/v1/core/utils.py b/tests/torch_compile/v1/core/utils.py → ...rch_compile/integrations/v1/core/utils.py
diff --git a/tests/torch_compile/integrations/v1/lora/__init__.py b/tests/torch_compile/integrations/v1/lora/__init__.py
diff --git a/tests/torch_compile/v1/lora/conftest.py → ..._compile/integrations/v1/lora/conftest.py b/tests/torch_compile/v1/lora/conftest.py → ..._compile/integrations/v1/lora/conftest.py
diff --git a/..._compile/v1/lora/test_lora_checkpoints.py → ...grations/v1/lora/test_lora_checkpoints.py b/..._compile/v1/lora/test_lora_checkpoints.py → ...grations/v1/lora/test_lora_checkpoints.py
@@ -16,7 +16,7 @@
 # limitations under the License.
 
 import pytest
-from vllm.lora.models import LoRAModel
+from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
 from vllm.model_executor.models.utils import WeightsMapper

diff --git a/...ch_compile/v1/lora/test_lora_functions.py → ...tegrations/v1/lora/test_lora_functions.py b/...ch_compile/v1/lora/test_lora_functions.py → ...tegrations/v1/lora/test_lora_functions.py
@@ -17,6 +17,9 @@
 """
 Script to test add_lora, remove_lora, pin_lora, list_loras functions.
 """
+
+import pytest
+
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.lora.request import LoRARequest
@@ -27,11 +30,15 @@
 
 
 def make_lora_request(lora_id: int):
-    return LoRARequest(lora_name=f"{lora_id}",
-                       lora_int_id=lora_id,
-                       lora_path=LORA_MODULE_PATH)
+    return LoRARequest(
+        lora_name=f"{lora_id}", lora_int_id=lora_id, lora_path=LORA_MODULE_PATH
+    )
 
 
+@pytest.mark.skip(
+    reason="Integration test: requires RBLN device and conflicts with "
+    "session-scoped RBLN_DEVICES initialization"
+)
 def test_lora_functions_sync(monkeypatch):
     monkeypatch.setenv("RBLN_PROFILER", "0")
     monkeypatch.setenv("RBLN_KERNEL_MODEL", "triton")

diff --git a/..._compile/v1/lora/test_lora_huggingface.py → ...grations/v1/lora/test_lora_huggingface.py b/..._compile/v1/lora/test_lora_huggingface.py → ...grations/v1/lora/test_lora_huggingface.py
@@ -16,7 +16,7 @@
 # limitations under the License.
 
 import pytest
-from vllm.lora.models import LoRAModel
+from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.model_executor.models.llama import LlamaForCausalLM

diff --git a/...orch_compile/v1/lora/test_lora_manager.py → ...integrations/v1/lora/test_lora_manager.py b/...orch_compile/v1/lora/test_lora_manager.py → ...integrations/v1/lora/test_lora_manager.py
@@ -25,12 +25,13 @@
 from torch import nn
 from vllm.config import ModelConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
-from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
+from vllm.lora.layers import (ColumnParallelLinearWithLoRA, LoRAMapping,
                               MergedColumnParallelLinearWithLoRA,
                               RowParallelLinearWithLoRA)
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
-from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
-                              LRUCacheLoRAModelManager)
+from vllm.lora.lora_model import LoRAModel
+from vllm.lora.model_manager import (LoRAModelManager,
+                                     LRUCacheLoRAModelManager)
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,

diff --git a/tests/torch_compile/v1/lora/test_worker.py → ...mpile/integrations/v1/lora/test_worker.py b/tests/torch_compile/v1/lora/test_worker.py → ...mpile/integrations/v1/lora/test_worker.py
@@ -25,7 +25,7 @@
                          set_current_vllm_config)
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
-from vllm.lora.models import LoRAMapping
+from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
 
 from vllm_rbln.v1.worker.rbln_worker import RBLNWorker as Worker

diff --git a/tests/torch_compile/v1/lora/utils.py → ...rch_compile/integrations/v1/lora/utils.py b/tests/torch_compile/v1/lora/utils.py → ...rch_compile/integrations/v1/lora/utils.py
diff --git a/tests/torch_compile/units/__init__.py b/tests/torch_compile/units/__init__.py
diff --git a/...ch_compile/common/test_forward_context.py → ...rch_compile/units/test_forward_context.py b/...ch_compile/common/test_forward_context.py → ...rch_compile/units/test_forward_context.py
@@ -17,11 +17,14 @@
 import pytest
 import torch
 
+pytestmark = pytest.mark.cpu_test
+
 
 @pytest.fixture
 def attn_metadata_mock():
     from vllm_rbln.v1.attention.backends.flash_attention import (
-        RBLNFlashAttentionMetadata)
+        RBLNFlashAttentionMetadata,
+    )
 
     attn_metadata_mock = MagicMock(spec=RBLNFlashAttentionMetadata)
     attn_metadata_mock.num_actual_tokens = 16
@@ -31,13 +34,21 @@ def attn_metadata_mock():
 def test_forward_context(vllm_config, attn_metadata_mock: MagicMock):
+    """Verify the forward context exposes dp_metadata as RBLNDPMetadata."""
     # forward_context
     from vllm.forward_context import get_forward_context, set_forward_context
+
     with set_forward_context(
-            attn_metadata_mock,
-            vllm_config,
-            num_tokens_across_dp=torch.tensor([0, 1]),
+        attn_metadata_mock,
+        vllm_config,
+        num_tokens_across_dp=torch.tensor([0, 1]),
+        num_padded_tokens=1,
     ):
         # assert dp_metadata class name is RBLNDPMetadata
-        assert (get_forward_context().dp_metadata.__class__.__name__ ==
-                "RBLNDPMetadata"
-                ), f"Expected 'dp_metadata' class name is RBLNDPMetadata, \
-                    got {get_forward_context().dp_metadata.__class__.__name__}"
+        assert (
+            get_forward_context().dp_metadata.__class__.__name__
+            == "RBLNDPMetadata"
+        ), (
+            # Adjacent literals instead of a backslash continuation inside the
+            # string, which would embed the next line's indentation.
+            "Expected 'dp_metadata' class name is RBLNDPMetadata, "
+            f"got {get_forward_context().dp_metadata.__class__.__name__}"
+        )