Commit bb23dd7

Merge branch 'main' into sihao_issue_fix
2 parents: 0030eea + 4429d93

File tree: 16 files changed (+832, -7 lines)


docker/Dockerfile.xpu

Lines changed: 3 additions & 0 deletions
@@ -76,6 +76,9 @@ RUN python3 -m pip install -e tests/vllm_test_utils
 ENV NIXL_VERSION=0.7.0
 RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
 
+# PyJWT-2.7.0 will influence some wheel behaviors, remove its dist-info to avoid conflicts
+RUN rm /usr/lib/python3/dist-packages/PyJWT-2.7.0.dist-info/ -rf
+
 # remove torch bundled oneccl to avoid conflicts
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip uninstall oneccl oneccl-devel -y

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
@@ -661,6 +661,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
 | `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
 | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
+| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
 | `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
 | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
 | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |

examples/offline_inference/vision_language.py

Lines changed: 27 additions & 0 deletions
@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+def run_bagel(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "ByteDance-Seed/BAGEL-7B-MoT"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    prompts = [
+        (
+            f"<|im_start|>user\n<|image_pad|>\n{question}<|im_end|>\n"
+            f"<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # BLIP-2
 def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
 model_example_map = {
     "aria": run_aria,
     "aya_vision": run_aya_vision,
+    "bagel": run_bagel,
     "bee": run_bee,
     "blip-2": run_blip2,
     "chameleon": run_chameleon,

tests/models/language/pooling/test_token_classification.py

Lines changed: 31 additions & 0 deletions
@@ -68,3 +68,34 @@ def test_modernbert_models(
         hf_output = torch.tensor(hf_output).cpu().float()
         vllm_output = torch.tensor(vllm_output).cpu().float()
         assert torch.allclose(hf_output, vllm_output, atol=1e-2)
+
+
+@pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
+@pytest.mark.parametrize("dtype", ["float"])
+@torch.inference_mode
+def test_auto_conversion(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.token_classify(example_prompts)
+
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
+    ) as hf_model:
+        tokenizer = hf_model.tokenizer
+        hf_outputs = []
+        for prompt in example_prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = hf_model.wrap_device(inputs)
+            output = hf_model.model(**inputs)
+            hf_outputs.append(softmax(output.logits[0]))
+
+    # check logits difference
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        hf_output = torch.tensor(hf_output).cpu().float()
+        vllm_output = torch.tensor(vllm_output).cpu().float()
+        assert torch.allclose(hf_output, vllm_output, atol=1e-2)
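
Note: the test compares vLLM's per-token class probabilities against HuggingFace's `AutoModelForTokenClassification` with a softmax over the logits. It relies on the automatic `*ForTokenClassification` suffix conversion added in `vllm/config/model.py` below. A rough user-facing sketch, assuming the pooling runner is resolved automatically from the architecture name (not an exact API reference):

# A minimal sketch, assuming vLLM auto-detects the pooling/classify runner
# from the *ForTokenClassification suffix; the model is the one from the test.
from vllm import LLM

llm = LLM(model="bd2lcco/Qwen3-0.6B-finetuned", max_model_len=1024)
outputs = llm.encode(["vLLM makes serving easy"])
# outputs[0].outputs.data holds one row of class scores per input token
print(outputs[0].outputs.data.shape)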

tests/models/registry.py

Lines changed: 2 additions & 0 deletions
@@ -573,6 +573,7 @@ def check_available_online(
     "Qwen3ForSequenceClassification": _HfExamplesInfo(
         "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
     ),
+    "Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
 }
 
 _MULTIMODAL_EXAMPLE_MODELS = {
@@ -582,6 +583,7 @@ def check_available_online(
         "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
     ),
     "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
+    "BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
     "BeeForConditionalGeneration": _HfExamplesInfo(
         "Open-Bee/Bee-8B-RL",
         trust_remote_code=True,

vllm/config/model.py

Lines changed: 1 addition & 0 deletions
@@ -1796,6 +1796,7 @@ def get_served_model_name(model: str, served_model_name: str | list[str] | None)
     ("ForTextEncoding", ("pooling", "embed")),
     ("EmbeddingModel", ("pooling", "embed")),
     ("ForSequenceClassification", ("pooling", "classify")),
+    ("ForTokenClassification", ("pooling", "classify")),
     ("ForAudioClassification", ("pooling", "classify")),
     ("ForImageClassification", ("pooling", "classify")),
     ("ForVideoClassification", ("pooling", "classify")),

vllm/entrypoints/context.py

Lines changed: 7 additions & 7 deletions
@@ -74,24 +74,24 @@ class TurnMetrics
 
     def __init__(
         self,
-        input_tokens=0,
-        output_tokens=0,
-        cached_input_tokens=0,
-        tool_output_tokens=0,
-    ):
+        input_tokens: int = 0,
+        output_tokens: int = 0,
+        cached_input_tokens: int = 0,
+        tool_output_tokens: int = 0,
+    ) -> None:
         self.input_tokens = input_tokens
         self.output_tokens = output_tokens
         self.cached_input_tokens = cached_input_tokens
         self.tool_output_tokens = tool_output_tokens
 
-    def reset(self):
+    def reset(self) -> None:
         """Reset counters for a new turn."""
         self.input_tokens = 0
         self.output_tokens = 0
         self.cached_input_tokens = 0
         self.tool_output_tokens = 0
 
-    def copy(self):
+    def copy(self) -> "TurnMetrics":
         """Create a copy of this turn's token counts."""
         return TurnMetrics(
             self.input_tokens,
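
Note: the change only adds type annotations, but they make the intended lifecycle explicit: accumulate counts while a turn runs, snapshot with `copy()`, then `reset()` for the next turn. A short usage sketch (the token counts are placeholders):

# Usage sketch of the TurnMetrics lifecycle annotated above; counts are
# placeholders, not real measurements.
metrics = TurnMetrics()
per_turn: list[TurnMetrics] = []
for _ in range(2):
    metrics.input_tokens += 128      # accumulated while the turn runs
    metrics.output_tokens += 42
    per_turn.append(metrics.copy())  # snapshot this turn's totals
    metrics.reset()                  # zero the counters for the next turn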

vllm/model_executor/models/adapters.py

Lines changed: 12 additions & 0 deletions
@@ -337,6 +337,18 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         tokens = getattr(text_config, "classifier_from_token", None)
         method = getattr(text_config, "method", None)
 
+        def auto_set_score_bias(weights):
+            for name, weight in weights:
+                if name == "score.bias":
+                    device = self.score.weight.device
+                    dtype = self.score.weight.dtype
+                    bias = weight.to(device).to(dtype)
+                    self.score.bias = torch.nn.Parameter(bias)
+                    self.score.skip_bias_add = False
+                else:
+                    yield name, weight
+
+        weights = auto_set_score_bias(weights)
         if tokens is None and method is None:
             return super().load_weights(weights)
         else:
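
Note: `auto_set_score_bias` is a generator that filters the incoming weight stream: it consumes `score.bias` as a side effect (attaching it to the score head) and yields every other tensor through unchanged, so the downstream loader never sees the bias. Because generators are lazy, the bias is only attached when the loader actually iterates the stream. A self-contained sketch of the same filter pattern, with hypothetical names:

# Self-contained sketch of the generator-filter pattern used above: one named
# tensor is captured as a side effect, the rest pass through. All names here
# are hypothetical, not vLLM APIs.
import torch

captured: dict[str, torch.Tensor] = {}

def filter_stream(weights, target: str = "score.bias"):
    for name, tensor in weights:
        if name == target:
            captured[name] = tensor  # consumed here, never forwarded
        else:
            yield name, tensor

stream = [("score.weight", torch.zeros(2, 4)), ("score.bias", torch.zeros(2))]
remaining = list(filter_stream(stream))  # iteration triggers the side effect
assert [name for name, _ in remaining] == ["score.weight"]
assert "score.bias" in captured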
