Commit 786cabe

align vllm hpu version to latest vllm-fork (#860)
Signed-off-by: Xinyao Wang <[email protected]>
1 parent 618f45b commit 786cabe

File tree

13 files changed: +30 additions, -77 deletions

.github/workflows/docker/compose/llms-compose-cd.yaml

Lines changed: 0 additions & 4 deletions
@@ -23,10 +23,6 @@ services:
     build:
       dockerfile: comps/llms/text-generation/vllm/llama_index/Dockerfile
     image: ${REGISTRY:-opea}/llm-vllm-llamaindex:${TAG:-latest}
-  llm-vllm-llamaindex-hpu:
-    build:
-      dockerfile: comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu
-    image: ${REGISTRY:-opea}/llm-vllm-llamaindex-hpu:${TAG:-latest}
   llm-predictionguard:
     build:
       dockerfile: comps/llms/text-generation/predictionguard/Dockerfile

.github/workflows/docker/compose/llms-compose.yaml

Lines changed: 0 additions & 4 deletions
@@ -24,10 +24,6 @@ services:
     build:
       dockerfile: comps/llms/text-generation/vllm/langchain/Dockerfile
     image: ${REGISTRY:-opea}/llm-vllm:${TAG:-latest}
-  llm-vllm-hpu:
-    build:
-      dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
-    image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest}
   llm-vllm-ray:
     build:
       dockerfile: comps/llms/text-generation/vllm/ray/Dockerfile
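Note: with the llm-vllm-hpu and llm-vllm-llamaindex-hpu build targets removed from these two CI compose files, only the generic wrapper images are built from them; the Gaudi serving image is now produced by build_docker_vllm.sh (see the diffs below). A minimal local sketch of building one of the remaining targets, assuming the usual docker compose build workflow against this file (the exact CI invocation is not shown in this commit):

  # Hypothetical reproduction of the CI image build; the service name comes
  # from the compose file above, the build command itself is an assumption.
  docker compose -f .github/workflows/docker/compose/llms-compose.yaml build llm-vllm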

comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu

Lines changed: 0 additions & 22 deletions
This file was deleted.

comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh

Lines changed: 5 additions & 1 deletion
@@ -30,7 +30,11 @@ fi

 # Build the docker image for vLLM based on the hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker build -f Dockerfile.intel_hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    git clone https://github.com/HabanaAI/vllm-fork.git
+    cd ./vllm-fork/
+    docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    cd ..
+    rm -rf vllm-fork
 else
     git clone https://github.com/vllm-project/vllm.git
     cd ./vllm/
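With this change the HPU image is built from HabanaAI's vllm-fork instead of the local Dockerfile.intel_hpu. A minimal sketch of exercising the new path, assuming the script takes the hardware mode as its first positional argument (which the $hw_mode check suggests); the manual steps are copied from the added lines:

  cd comps/llms/text-generation/vllm/langchain/dependency
  bash ./build_docker_vllm.sh hpu

  # Equivalent manual steps, as added above:
  git clone https://github.com/HabanaAI/vllm-fork.git
  cd ./vllm-fork/
  docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . \
    --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
  cd .. && rm -rf vllm-fork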

comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ volume=$PWD/data

 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm:hpu --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model $model_name --host 0.0.0.0 --port 80
 fi
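Because the opea/vllm:hpu image built from vllm-fork's Dockerfile.hpu starts the OpenAI-compatible server through its own entrypoint, the run command now passes only server arguments instead of a /bin/bash -c wrapper. A minimal smoke test once the container is up, assuming a host port of 8008 in place of $port_number and a placeholder model name:

  # Hypothetical check against the OpenAI-compatible completions endpoint.
  curl http://localhost:8008/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "prompt": "Deep learning is", "max_tokens": 16}'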

comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
+    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
   llm:
     image: opea/llm-vllm:latest
     container_name: llm-vllm-gaudi-server
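The compose service likewise now relies on the image's built-in entrypoint, so command carries only the server flags. A minimal sketch of bringing the stack up, assuming LLM_MODEL names the model to serve; other variables referenced by the compose file (for example a Hugging Face token) may also need to be exported:

  # Hypothetical launch; the model name is a placeholder.
  export LLM_MODEL=meta-llama/Meta-Llama-3-8B-Instruct
  docker compose -f comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml up -d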

comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu

Lines changed: 0 additions & 24 deletions
This file was deleted.

comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh

Lines changed: 5 additions & 1 deletion
@@ -30,7 +30,11 @@ fi

 # Build the docker image for vLLM based on the hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker build -f docker/Dockerfile.intel_hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    git clone https://github.com/HabanaAI/vllm-fork.git
+    cd ./vllm-fork/
+    docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    cd ..
+    rm -rf vllm-fork
 else
     git clone https://github.com/vllm-project/vllm.git
     cd ./vllm/

comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ volume=$PWD/data

 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model $model_name --host 0.0.0.0 --port 80
 fi

comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
+    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
   llm:
     image: opea/llm-vllm-llamaindex:latest
     container_name: llm-vllm-gaudi-server
