
Commit f201d79
rebase to 0.6.1
2 parents: 581c529 + 9ba0817

200 files changed, +9167 -3138 lines changed


.buildkite/run-amd-test.sh

Lines changed: 24 additions & 1 deletion
@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE}
 HF_MOUNT="/root/.cache/huggingface"
 
 commands=$@
+echo "Commands:$commands"
+#ignore certain kernels tests
+if [[ $commands == *" kernels "* ]]; then
+  commands="${commands} \
+  --ignore=kernels/test_attention.py \
+  --ignore=kernels/test_attention_selector.py \
+  --ignore=kernels/test_blocksparse_attention.py \
+  --ignore=kernels/test_causal_conv1d.py \
+  --ignore=kernels/test_cutlass.py \
+  --ignore=kernels/test_encoder_decoder_attn.py \
+  --ignore=kernels/test_flash_attn.py \
+  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_int8_quant.py \
+  --ignore=kernels/test_machete_gemm.py \
+  --ignore=kernels/test_mamba_ssm.py \
+  --ignore=kernels/test_marlin_gemm.py \
+  --ignore=kernels/test_moe.py \
+  --ignore=kernels/test_prefix_prefill.py \
+  --ignore=kernels/test_rand.py \
+  --ignore=kernels/test_sampler.py"
+fi
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
     #replace shard arguments
-    commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
+    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
     commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+    echo "Shard ${GPU} commands:$commands"
     docker run \
         --device /dev/kfd --device /dev/dri \
         --network host \
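For context on the `--shard-id` fix above, a minimal bash sketch (not part of the commit; the example arguments are invented) of why the substitution now operates on the accumulated $commands string rather than on $@: ${@//pat/rep} applies the replacement to each positional parameter separately and rebuilds the command from the original arguments, so the space-containing pattern never matches and the --ignore flags appended earlier are lost.

#!/usr/bin/env bash
# Hypothetical arguments, for illustration only.
set -- pytest -v -s kernels --shard-id= --num-shards=
commands="$* --ignore=kernels/test_moe.py"   # flags appended earlier in the script

GPU=3
PARALLEL_JOB_COUNT=8

# Old form: per-parameter substitution on "$@"; the trailing space in the pattern
# never matches inside a single word, and the appended --ignore flag is dropped.
old=${@//"--shard-id= "/"--shard-id=${GPU} "}

# New form: substitution inside the accumulated string keeps every flag.
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}

echo "old: $old"        # pytest -v -s kernels --shard-id= --num-shards=
echo "new: $commands"   # pytest -v -s kernels --shard-id=3 --num-shards=8 --ignore=kernels/test_moe.py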

.buildkite/run-cpu-test-ppc64le.sh

Lines changed: 2 additions & 1 deletion
@@ -11,8 +11,9 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
+source /etc/environment
 #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
 
 # Run basic model test
 docker exec cpu-test bash -c "
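A short sketch (not part of the commit; image and token value are made up) of the likely reason for the `-e HF_TOKEN=$HF_TOKEN` change: variables set by sourcing /etc/environment are plain, unexported shell variables, so `docker run -e HF_TOKEN` has nothing to forward from the client environment, whereas `-e HF_TOKEN=$HF_TOKEN` expands the value in the calling shell before docker sees it.

source /etc/environment          # suppose this sets HF_TOKEN=abc123 (not exported)

docker run --rm -e HF_TOKEN ubuntu:22.04 env | grep HF_TOKEN              # likely prints nothing
docker run --rm -e HF_TOKEN="$HF_TOKEN" ubuntu:22.04 env | grep HF_TOKEN  # HF_TOKEN=abc123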

.buildkite/run-cpu-test.sh

Lines changed: 10 additions & 6 deletions
@@ -23,12 +23,16 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
-    --ignore=tests/models/test_oot_registration.py \
-    --ignore=tests/models/test_registry.py \
-    --ignore=tests/models/test_fp8.py \
-    --ignore=tests/models/test_jamba.py \
-    --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pytest -v -s tests/models/decoder_only/language \
+    --ignore=tests/models/test_fp8.py \
+    --ignore=tests/models/decoder_only/language/test_jamba.py \
+    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+
+# Run compressed-tensor test
+docker exec cpu-test bash -c "
+  pytest -s -v \
+  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
 
 # online inference
 docker exec cpu-test bash -c "

.buildkite/test-pipeline.yaml

Lines changed: 50 additions & 26 deletions
@@ -50,6 +50,7 @@ steps:
   - tests/worker
   commands:
   - pytest -v -s async_engine # Async Engine
+  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
@@ -91,7 +92,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/openai
   - pytest -v -s entrypoints/test_chat_utils.py
-
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
@@ -162,30 +163,13 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py
 
-- label: Models Test # 1hr10min
-  source_file_dependencies:
-  - vllm/
-  - tests/models
-  commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
-
 - label: torch compile integration test
   source_file_dependencies:
   - vllm/
   commands:
   - pytest -v -s ./compile/test_full_graph.py
   - pytest -v -s ./compile/test_wrapper.py
 
-
-- label: Vision Language Models Test # 42min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  commands:
-  - pytest -v -s models -m vlm
-
 - label: Prefix Caching Test # 7min
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -217,7 +201,8 @@ steps:
   commands:
   # See https://github.com/vllm-project/vllm/issues/5152
   - export VLLM_ATTENTION_BACKEND=XFORMERS
-  - pytest -v -s spec_decode
+  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
+  - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
 
 - label: LoRA Test %N # 30min each
   mirror_hardwares: [amd]
@@ -228,6 +213,7 @@ steps:
   parallelism: 4
 
 - label: Kernels Test %N # 30min each
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
   - vllm/attention
@@ -282,6 +268,45 @@ steps:
   commands:
   - pytest -v -s tool_use
 
+##### models test #####
+
+- label: Basic Models Test # 3min
+  source_file_dependencies:
+  - vllm/
+  - tests/models
+  commands:
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
+
+- label: Decoder-only Language Models Test # 1h3min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  commands:
+  - pytest -v -s models/decoder_only/language
+
+- label: Decoder-only Multi-Modal Models Test # 56min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  commands:
+  - pytest -v -s models/decoder_only/audio_language
+  - pytest -v -s models/decoder_only/vision_language
+
+- label: Other Models Test # 5min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+  - pytest -v -s models/embedding/language
+  - pytest -v -s models/encoder_decoder/language
+
 ##### 1 GPU test #####
 ##### multi gpus test #####
 
@@ -307,11 +332,11 @@ steps:
   - tests/distributed/
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
 
 - label: Distributed Tests (2 GPUs) # 28min
   #mirror_hardwares: [amd]
@@ -324,11 +349,10 @@ steps:
   - vllm/model_executor/models/
   - tests/distributed/
   commands:
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
-  - pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s distributed/test_multimodal_broadcast.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
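A minimal bash sketch (not part of the commit; the function below is hypothetical) of the `| grep -q 'Same node test passed'` pattern added above: grep -q exits non-zero when the message is absent, so the CI step fails unless the test script actually prints its success line, instead of relying only on the script's own exit status.

# Stand-in for the real torchrun invocation.
run_same_node_test() { echo "Same node test passed"; }

if run_same_node_test | grep -q 'Same node test passed'; then
    echo "step succeeds"
else
    echo "step fails: success message not found"
fi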

.github/ISSUE_TEMPLATE/400-bug report.yml

Lines changed: 9 additions & 0 deletions
@@ -30,6 +30,15 @@ body:
       </details>
   validations:
     required: true
+- type: textarea
+  attributes:
+    label: Model Input Dumps
+    description: |
+      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
+    placeholder: |
+      Upload the dumped input file.
+  validations:
+    required: false
 - type: textarea
   attributes:
     label: 🐛 Describe the bug

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 10 additions & 0 deletions
@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
 <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
 </ul>
 
+<h3>Adding or changing kernels</h3>
+<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
+<ul>
+<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
+<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
+<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.library.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
+<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
+<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.</li>
+</ul>
+
 <h3>Notes for Large Changes</h3>
 <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
 
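As a rough illustration of the registration-plus-opcheck workflow the new checklist describes, a minimal PyTorch sketch (not vLLM code; the op name, namespace, and math are invented, and it assumes the torch.library.custom_op API available in PyTorch 2.4+):

import torch
from torch.library import custom_op, opcheck


@custom_op("my_ext::clamp_add", mutates_args=())
def clamp_add(x: torch.Tensor, bound: float) -> torch.Tensor:
    # Eager implementation; the schema is derived from the type annotations.
    return x.clamp(max=bound) + 1.0


@clamp_add.register_fake
def _(x: torch.Tensor, bound: float) -> torch.Tensor:
    # Meta/"fake" function: shapes and dtypes only, so dynamic dims are handled automatically.
    return torch.empty_like(x)


# opcheck exercises the schema, fake-tensor, and autograd registrations of the op.
opcheck(torch.ops.my_ext.clamp_add.default, (torch.randn(4, 8), 0.5))

The real examples the template points to live under tests/kernels in the vLLM tree.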

CMakeLists.txt

Lines changed: 13 additions & 2 deletions
@@ -194,9 +194,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        # CUTLASS 3.5.1
-        GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
+        GIT_TAG v3.5.1
         GIT_PROGRESS TRUE
+
+        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
+        # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
+        # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
+        GIT_SHALLOW TRUE
   )
   FetchContent_MakeAvailable(cutlass)
 
@@ -230,6 +234,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       "-gencode arch=compute_90a,code=sm_90a")
   endif()
 
+
   #
   # Machete kernels
 
@@ -288,6 +293,12 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
+# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
+# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
+# driver API. This causes problems when linking with earlier versions of CUDA.
+# Setting this variable sidesteps the issue by calling the driver directly.
+target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
+
 #
 # _moe_C extension
 #

Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -145,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
     && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \

Dockerfile.cpu

Lines changed: 18 additions & 1 deletion
@@ -2,9 +2,14 @@
 
 FROM ubuntu:22.04 AS cpu-test-1
 
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
 RUN --mount=type=cache,target=/var/cache/apt \
     apt-get update -y \
     && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 
 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
@@ -25,6 +30,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --upgrade pip && \
     pip install -r requirements-build.txt
 
+# install oneDNN
+RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
+
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
+    -DONEDNN_BUILD_DOC=OFF \
+    -DONEDNN_BUILD_EXAMPLES=OFF \
+    -DONEDNN_BUILD_TESTS=OFF \
+    -DONEDNN_BUILD_GRAPH=OFF \
+    -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
+    -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
+    cmake --build ./oneDNN/build --target install --config Release
+
 FROM cpu-test-1 AS build
 
 WORKDIR /workspace/vllm
@@ -40,7 +58,6 @@ COPY ./ ./
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 
-ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/ccache \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \

Dockerfile.neuron

Lines changed: 3 additions & 1 deletion
@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
 RUN echo "Base image is $BASE_IMAGE"
 
 # Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
+RUN apt-get update \
+    && apt-get install python3 python3-pip -y \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1
 
 ### Mount Point ###
 # When launching the container, mount the code directory to /app
