
Commit e089b84 ("vllm 0.13.0")
1 parent 0be0579

5 files changed
Lines changed: 13 additions & 17 deletions

README.md

Lines changed: 8 additions & 12 deletions
````diff
@@ -1,9 +1,9 @@
 # Fork of KServe for huggingfaceserver CVE fixes
 
-This is a fork of kserve that serves to document how we built the image:
+This is a fork of kserve that serves to document how we built the images:
 
 ```
-*******782.dkr.ecr.us-east-1.amazonaws.com/library/kserve-huggingfaceserver:v0.16.0
+*******782.dkr.ecr.us-east-1.amazonaws.com/library/kserve-huggingfaceserver:v0.16.0*
 ```
 
 The official image released by kserve had several high and critical CVEs. To build our version, use the `python/huggingface_server.Dockerfile` dockerfile.
@@ -37,22 +37,18 @@ curl -v http://0.0.0.0:8080/openai/v1/chat/completions -H "Content-Type: applica
 
 The `reasoning_effort` is not available for all models.
 
-## SHA256 fix
+# Updating vLLM version
 
-The image:
+To update the vLLM version, edit the following files:
 
 ```
-**********782.dkr.ecr.us-east-1.amazonaws.com/library/kserve-huggingfaceserver:v0.16.0.sha256.1
+python/huggingface_server.Dockerfile # (VLLM_VERSION arg)
+python/huggingfaceserver/pyproject.toml
+python/kserve/pyproject.toml
 ```
 
-is a temporary workaround to allow vLLM to work in FIPS constrained environments, where `hashlib.md5` is disabled. This image was made by first building the one above, and then exec-ing into it and running the following commands:
+Make sure you test your builds before deploying them after updating vLLM's version. The vLLM project is known to sometimes shuffle stuff internally and that can break kserve's vllm usage patterns.
 
-```bash
-$ cd /kserve-workspace/prod_venv/lib64/python3.12/site-packages/vllm/
-$ find . -type f -exec sed -i 's/hashlib\.md5/hashlib.sha256/g' {} +
-```
-
-This replaces all `hashlib.md5` calls with `hashlib.sha256`. Once that change is made inside the container, that running image is committed so the changes persist.
 
 # KServe
 [![go.dev reference](https://img.shields.io/badge/go.dev-reference-007d9c?logo=go&logoColor=white)](https://pkg.go.dev/github.com/kserve/kserve)
````

python/huggingface_server.Dockerfile

Lines changed: 1 addition & 1 deletion
```diff
@@ -52,7 +52,7 @@ WORKDIR ${WORKSPACE_DIR}
 FROM base AS build
 
 ARG WORKSPACE_DIR
-ARG VLLM_VERSION=0.12.0
+ARG VLLM_VERSION=0.13.0
 ARG LMCACHE_VERSION=0.3.0
 ARG BITSANDBYTES_VERSION=0.46.1
 ARG FLASHINFER_VERSION=0.2.6.post1
```
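Because `VLLM_VERSION` is a Dockerfile `ARG`, the pin can also be overridden at build time for one-off test builds without editing the file (a sketch; the local tag and the repo-root build context are assumptions, not values from this commit):

```shell
# Test build with an overridden vLLM version; the -t tag and the
# trailing build-context path (".") are illustrative assumptions.
docker build \
  -f python/huggingface_server.Dockerfile \
  --build-arg VLLM_VERSION=0.13.0 \
  -t kserve-huggingfaceserver:local \
  .
```

Note that per the README change above, a permanent version bump still needs the `pyproject.toml` pins updated as well.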

python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -174,7 +174,7 @@ async def start_engine(self):
 chat_template=resolved_chat_template,
 chat_template_content_format=self.args.chat_template_content_format,
 )
-if self.model_config.task == "embed"
+if self.model_config.runner_type == "embed"
 else None
 )
 
@@ -184,7 +184,7 @@ async def start_engine(self):
 self.openai_serving_models,
 request_logger=self.request_logger,
 )
-if self.model_config.task == "classify"
+if self.model_config.runner_type == "classify"
 else None
 )
 
```

python/huggingfaceserver/pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -11,7 +11,7 @@ dependencies = [
 "accelerate<2.0.0,>=1.6.0",
 "torch>=2.7.0",
 "triton>=3.2.0",
-"vllm==0.12.0",
+"vllm==0.13.0",
 "bitsandbytes>=0.45.3",
 "modelscope<2.0.0,>=1.16.0",
 "setuptools>=70.0.0",
```

python/kserve/pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -64,7 +64,7 @@ ray = [
 "ray[serve]>=2.43.0",
 ]
 llm = [
-"vllm==0.12.0",
+"vllm==0.13.0",
 ]
 
 [dependency-groups]
```
