Skip to content

Commit f3a507f

Browse files
authored
[Core] Add an environment variable which needs to be set explicitly to allow BlockSpaceManagerV1 (vllm-project#9149)
1 parent a64e7b9 commit f3a507f

14 files changed

+94
-8
lines changed

.buildkite/test-pipeline.yaml

+12-6
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ steps:
7777
- vllm/
7878
- tests/basic_correctness/test_chunked_prefill
7979
commands:
80-
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
81-
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
80+
- VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
81+
- VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
8282

8383
- label: Core Test # 10min
8484
mirror_hardwares: [amd]
@@ -88,7 +88,11 @@ steps:
8888
- vllm/distributed
8989
- tests/core
9090
commands:
91-
- pytest -v -s core
91+
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
92+
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_chunked_prefill_scheduler.py
93+
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/block/e2e/test_correctness.py
94+
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/block/e2e/test_correctness_sliding_window.py
95+
- pytest -v -s core --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
9296

9397
- label: Entrypoints Test # 40min
9498
working_dir: "/vllm-workspace/tests"
@@ -185,7 +189,8 @@ steps:
185189
- vllm/
186190
- tests/prefix_caching
187191
commands:
188-
- pytest -v -s prefix_caching
192+
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
193+
- pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
189194

190195
- label: Samplers Test # 36min
191196
source_file_dependencies:
@@ -209,7 +214,8 @@ steps:
209214
- tests/spec_decode
210215
commands:
211216
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
212-
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
217+
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
218+
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
213219

214220
- label: LoRA Test %N # 15min each
215221
mirror_hardwares: [amd]
@@ -391,7 +397,7 @@ steps:
391397
- pytest -v -s ./compile/test_full_graph_multi_gpu.py
392398
- pytest -v -s ./compile/test_wrapper.py
393399
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
394-
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
400+
- TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
395401
# Avoid importing model tests that cause CUDA reinitialization error
396402
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
397403
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus

benchmarks/benchmark_latency.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
221221
parser.add_argument("--enable-prefix-caching",
222222
action='store_true',
223223
help="Enable automatic prefix caching")
224-
parser.add_argument('--use-v2-block-manager', action='store_true')
224+
parser.add_argument('--use-v2-block-manager',
225+
action='store_true',
226+
default=EngineArgs.use_v2_block_manager)
225227
parser.add_argument(
226228
"--ray-workers-use-nsight",
227229
action='store_true',

benchmarks/benchmark_prefix_caching.py

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from transformers import PreTrainedTokenizerBase
3434

3535
from vllm import LLM, SamplingParams
36+
from vllm.engine.arg_utils import EngineArgs
3637
from vllm.utils import FlexibleArgumentParser
3738

3839
try:
@@ -177,6 +178,7 @@ def main(args):
177178
help='enable prefix caching')
178179
parser.add_argument('--use-v2-block-manager',
179180
action='store_true',
181+
default=EngineArgs.use_v2_block_manager,
180182
help='Use BlockSpaceManagerV2')
181183
parser.add_argument('--num-prompts',
182184
type=int,

benchmarks/benchmark_throughput.py

+1
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,7 @@ def main(args: argparse.Namespace):
473473
help="Maximum number of forward steps per scheduler call.")
474474
parser.add_argument("--use-v2-block-manager",
475475
action='store_true',
476+
default=EngineArgs.use_v2_block_manager,
476477
help="Enable block manager v2.")
477478
parser.add_argument(
478479
"--enable-prefix-caching",

tests/basic_correctness/test_chunked_prefill.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,20 @@
1212
import pytest
1313

1414
from ..models.utils import check_logprobs_close, check_outputs_equal
15-
from ..utils import multi_gpu_test
15+
from ..utils import check_deprecated_block_manager_usage, multi_gpu_test
1616

1717
MODELS = [
1818
"facebook/opt-125m",
1919
"meta-llama/Llama-2-7b-hf",
2020
]
2121

2222

23+
@pytest.fixture(scope="module", autouse=True)
24+
def check_deprecated_block_manager():
25+
check_deprecated_block_manager_usage(
26+
'tests/basic_correctness/test_chunked_prefill.py')
27+
28+
2329
@pytest.mark.parametrize("model", MODELS)
2430
@pytest.mark.parametrize("dtype", ["half"])
2531
@pytest.mark.parametrize("max_tokens", [32])

tests/core/block/e2e/test_correctness.py

+7
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,18 @@
22

33
import pytest
44

5+
from tests.utils import check_deprecated_block_manager_usage
56
from vllm import SamplingParams
67

78
from .conftest import get_token_ids_from_llm_generator
89

910

11+
@pytest.fixture(scope="module", autouse=True)
12+
def check_deprecated_block_manager():
13+
check_deprecated_block_manager_usage(
14+
'tests/core/block/e2e/test_correctness.py')
15+
16+
1017
@pytest.mark.parametrize(
1118
"common_llm_kwargs",
1219
[{

tests/core/block/e2e/test_correctness_sliding_window.py

+7
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import pytest
55

6+
from tests.utils import check_deprecated_block_manager_usage
67
from vllm import LLM, SamplingParams
78

89
from .conftest import get_text_from_llm_generator
@@ -12,6 +13,12 @@
1213
BLOCK_SIZE = 16
1314

1415

16+
@pytest.fixture(scope="module", autouse=True)
17+
def check_deprecated_block_manager():
18+
check_deprecated_block_manager_usage(
19+
'tests/core/block/e2e/test_correctness_sliding_window.py')
20+
21+
1522
@pytest.mark.parametrize(
1623
"common_llm_kwargs",
1724
[{

tests/core/test_chunked_prefill_scheduler.py

+7
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from vllm.core.scheduler import Scheduler
99
from vllm.sequence import Logprob, SequenceGroup
1010

11+
from ..utils import check_deprecated_block_manager_usage
1112
from .utils import create_dummy_prompt
1213

1314

@@ -27,6 +28,12 @@ def schedule_and_update_computed_tokens(scheduler):
2728
return metas, out
2829

2930

31+
@pytest.fixture(scope="module", autouse=True)
32+
def check_deprecated_block_manager():
33+
check_deprecated_block_manager_usage(
34+
'tests/core/test_chunked_prefill_scheduler.py')
35+
36+
3037
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
3138
def test_simple(use_v2_block_manager: bool):
3239
"""Verify basic scheduling works."""

tests/core/test_scheduler.py

+7
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,18 @@
1212
from vllm.lora.request import LoRARequest
1313
from vllm.sequence import SequenceGroup, SequenceStatus
1414

15+
from ..utils import check_deprecated_block_manager_usage
1516
from .utils import (append_new_token, append_new_token_seq_group,
1617
create_dummy_prompt, get_sequence_groups,
1718
schedule_and_update_computed_tokens)
1819

1920

21+
@pytest.fixture(scope="module", autouse=True)
22+
def check_deprecated_block_manager():
23+
check_deprecated_block_manager_usage(
24+
"tests/core/test_scheduler.py")
25+
26+
2027
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
2128
def test_scheduler_add_seq_group(use_v2_block_manager: bool):
2229
block_size = 4

tests/prefix_caching/test_prefix_caching.py

+7
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pytest
88

99
from tests.kernels.utils import override_backend_env_variable
10+
from tests.utils import check_deprecated_block_manager_usage
1011
from vllm.block import PhysicalTokenBlock
1112
from vllm.core.block_manager_v1 import CachedBlockAllocator
1213
from vllm.utils import Device
@@ -18,6 +19,12 @@
1819
]
1920

2021

22+
@pytest.fixture(scope="module", autouse=True)
23+
def check_deprecated_block_manager():
24+
check_deprecated_block_manager_usage(
25+
'tests/prefix_caching/test_prefix_caching.py')
26+
27+
2128
@pytest.mark.parametrize("block_size", [16])
2229
@pytest.mark.parametrize("num_blocks", [16])
2330
def test_block_allocator(

tests/spec_decode/e2e/test_compatibility.py

+7
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
11
import pytest
22

3+
from tests.utils import check_deprecated_block_manager_usage
34
from vllm import SamplingParams
45

56
from .conftest import get_output_from_llm_generator
67

78

9+
@pytest.fixture(scope="module", autouse=True)
10+
def check_deprecated_block_manager():
11+
check_deprecated_block_manager_usage(
12+
'tests/spec_decode/e2e/test_compatibility.py')
13+
14+
815
@pytest.mark.parametrize(
916
"common_llm_kwargs",
1017
[{

tests/utils.py

+9
Original file line numberDiff line numberDiff line change
@@ -678,3 +678,12 @@ def get_client_text_logprob_generations(
678678
return [(text_generations, text,
679679
(None if x.logprobs is None else x.logprobs.top_logprobs))
680680
for completion in completions for x in completion.choices]
681+
682+
683+
def check_deprecated_block_manager_usage(test_name: str):
684+
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
685+
f"To allow the use of deprecated BlockSpaceManagerV1, set the "
686+
f"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
687+
f"You can run the tests with: "
688+
f"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest {test_name}`" #noqa
689+
)

vllm/config.py

+12
Original file line numberDiff line numberDiff line change
@@ -1037,6 +1037,18 @@ def _verify_args(self) -> None:
10371037
f"({self.num_scheduler_steps}) must be greater than or "
10381038
"equal to 1.")
10391039

1040+
if (not self.use_v2_block_manager \
1041+
and not envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1):
1042+
raise ValueError(
1043+
"The use of BlockSpaceManagerV1 is deprecated and will "
1044+
"be removed in a future release. Please switch to "
1045+
"BlockSpaceManagerV2 by setting --use-v2-block-manager to "
1046+
"True. If you wish to suppress this error temporarily, "
1047+
"you can set the environment variable "
1048+
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1`. If your use "
1049+
"case is not supported in BlockSpaceManagerV2, please "
1050+
"file an issue with detailed information.")
1051+
10401052
@property
10411053
def is_multi_step(self) -> bool:
10421054
return self.num_scheduler_steps > 1

vllm/envs.py

+6
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
VLLM_USE_TRITON_AWQ: bool = False
6565
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
6666
VLLM_SKIP_P2P_CHECK: bool = False
67+
VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False
6768

6869

6970
def get_default_cache_root():
@@ -434,6 +435,11 @@ def get_default_config_root():
434435
# and trust the driver's peer-to-peer capability report.
435436
"VLLM_SKIP_P2P_CHECK":
436437
lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",
438+
439+
# If set, allow the use of the deprecated block manager V1
440+
"VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
441+
lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
442+
) == "1",
437443
}
438444

439445
# end-env-vars-definition

0 commit comments

Comments
 (0)