Skip to content

Commit a1235ee

Browse files
authored
Support for Whisper model in vLLM (#6)
1 parent cc81794 commit a1235ee

7 files changed

Lines changed: 347 additions & 43 deletions

File tree

examples/optimum/run_whisper.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# Copyright 2025 Rebellions Inc. All rights reserved.
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at:
6+
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import asyncio
15+
16+
import fire
17+
from datasets import load_dataset
18+
from transformers import AutoTokenizer
19+
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
20+
21+
22+
def generate_prompts(batch_size: int, model_id: str):
    """Build Whisper multi-modal prompts from a noisy LibriSpeech dataset.

    Args:
        batch_size: Number of prompts (one audio sample each) to build.
        model_id: Currently unused; kept for interface stability with callers.

    Returns:
        A list of vLLM prompt dicts, each with the Whisper decoder start
        token and one audio waveform paired with its sampling rate.
    """
    # NOTE(review): split="40" is unusual for `datasets` — a split name
    # ("test") or slice ("test[:40]") is typically expected; confirm intent.
    dataset = load_dataset("distil-whisper/librispeech_asr-noise",
                           "test-pub-noise",
                           split="40")

    messages = [{
        "prompt": "<|startoftranscript|>",
        "multi_modal_data": {
            "audio": (dataset[i]["audio"]["array"],
                      dataset[i]["audio"]["sampling_rate"])
        },
    } for i in range(batch_size)]

    return messages
37+
38+
async def generate(engine: AsyncLLMEngine, tokenizer, request_id, request):
    """Submit one request to the async engine and return its final output.

    Args:
        engine: Running vLLM async engine.
        tokenizer: Tokenizer providing ``eos_token_id`` for the stop list.
        request_id: Unique ID for this request within the engine.
        request: vLLM prompt dict (text prompt + multi-modal audio data).

    Returns:
        The last ``RequestOutput`` streamed by the engine (the complete
        generation), or ``None`` if the generator yields nothing.
    """
    results_generator = engine.generate(
        request,
        SamplingParams(temperature=0,
                       ignore_eos=False,
                       skip_special_tokens=True,
                       stop_token_ids=[tokenizer.eos_token_id],
                       max_tokens=448),
        request_id,
    )

    # The engine streams incremental outputs; only the final one matters here.
    final_output = None
    async for request_output in results_generator:
        final_output = request_output
    return final_output
54+
55+
async def main(
    batch_size: int,
    max_seq_len: int,
    num_input_prompt: int,
    model_id: str,
):
    """Transcribe audio prompts concurrently through a vLLM async engine.

    Args:
        batch_size: Maximum number of sequences batched by the engine.
        max_seq_len: Token budget used for model length, batched tokens,
            and block size alike.
        num_input_prompt: Number of audio prompts to submit.
        model_id: Model path or hub ID for both the engine and tokenizer.
    """
    engine_args = AsyncEngineArgs(model=model_id,
                                  device="auto",
                                  max_num_seqs=batch_size,
                                  max_num_batched_tokens=max_seq_len,
                                  max_model_len=max_seq_len,
                                  block_size=max_seq_len,
                                  limit_mm_per_prompt={"audio": 1})

    engine = AsyncLLMEngine.from_engine_args(engine_args)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    inputs = generate_prompts(num_input_prompt, model_id)

    # Fan out all requests as concurrent tasks; the engine batches internally.
    futures = []
    for request_id, request in enumerate(inputs):
        futures.append(
            asyncio.create_task(
                generate(engine, tokenizer, request_id, request)))

    results = await asyncio.gather(*futures)

    for i, result in enumerate(results):
        output = result.outputs[0].text
        print(
            f"===================== Output {i} ==============================")
        print(output)
        print(
            "===============================================================\n"
        )
90+
91+
def entry_point(
    batch_size: int = 4,
    max_seq_len: int = 448,
    num_input_prompt: int = 1,
    model_id: str = "/whisper-base-b4-wo-token-timestamps",
):
    """CLI entry point: run the async transcription demo to completion.

    Args:
        batch_size: Engine batch size (max concurrent sequences).
        max_seq_len: Sequence-length budget forwarded to the engine.
        num_input_prompt: Number of audio prompts to transcribe.
        model_id: Compiled Whisper model path or hub ID.
    """
    # asyncio.run() replaces the get_event_loop()/run_until_complete()
    # pattern, whose implicit loop creation is deprecated since Python 3.10.
    asyncio.run(
        main(
            batch_size=batch_size,
            max_seq_len=max_seq_len,
            num_input_prompt=num_input_prompt,
            model_id=model_id,
        ))


if __name__ == "__main__":
    fire.Fire(entry_point)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ datasets
55
qwen_vl_utils
66
transformers==4.51.3
77
vllm==0.9.1
8-
optimum-rbln>=0.8.1
8+
optimum-rbln>=0.8.2a4

vllm_rbln/model_executor/models/optimum/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
RBLNOptimumLlavaNextForConditionalGeneration)
3131
from .qwen2_5_vl import ( # noqa: F401
3232
RBLNOptimumQwen2_5_VLForConditionalGeneration)
33+
from .whisper import RBLNOptimumWhisperForConditionalGeneration # noqa: F401
3334

3435
logger = init_logger(__name__)
3536

vllm_rbln/model_executor/models/optimum/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
("blip2", "RBLNBlip2ForConditionalGeneration"),
5959
"Gemma3ForConditionalGeneration": ("gemma3",
6060
"RBLNGemma3ForConditionalGeneration"),
61+
"WhisperForConditionalGeneration": ("whisper",
62+
"RBLNWhisperForConditionalGeneration"),
6163
}
6264

6365
_RBLN_EMBEDDING_MODELS = {

vllm_rbln/model_executor/models/optimum/gemma3.py

Lines changed: 31 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
from dataclasses import dataclass
15-
from typing import Any, Dict, List, Optional, Tuple
15+
from typing import Any, Dict, List, Optional, Tuple, cast
1616

1717
import torch
1818
from vllm.config import ModelConfig, SchedulerConfig
@@ -21,7 +21,8 @@
2121
Gemma3ImagePixelInputs)
2222

2323
from .base import ModelInputForRBLN, version_error
24-
from .model_base import RBLNOptimumDecoderMixin, RBLNOptimumModelBase
24+
from .model_base import (RBLNOptimumDecoderMixin, RBLNOptimumDictTableMixin,
25+
RBLNOptimumModelBase)
2526

2627
logger = init_logger(__name__)
2728

@@ -34,7 +35,8 @@ class SlidingWindowEntry:
3435

3536

3637
class RBLNOptimumGemma3ForConditionalGeneration(RBLNOptimumModelBase,
37-
RBLNOptimumDecoderMixin):
38+
RBLNOptimumDecoderMixin,
39+
RBLNOptimumDictTableMixin):
3840

3941
def __init__(
4042
self,
@@ -120,49 +122,37 @@ def select_local_block_table_value(
120122
running_requests_ids: list[str],
121123
finished_requests_ids: list[str],
122124
) -> Tuple[list[int], list[int], list[torch.Tensor]]:
123-
if is_prompt:
124-
# Generate attention mask without padding
125-
attention_mask = torch.ones_like(input_ids).squeeze(0)
126-
127-
# Determine sliding_window_table_id
128-
# FIXME:
129-
# finished_requests_ids is typed as list[str],
130-
# but used as list[int].
131-
if finished_requests_ids:
132-
first_id = finished_requests_ids[0]
133-
local_table_id = self.sliding_window_table[
134-
first_id].local_table_id
135-
136-
for request_id in finished_requests_ids:
137-
self.sliding_window_table.pop(request_id)
138-
else:
139-
used_ids = {
140-
v.local_table_id
141-
for v in self.sliding_window_table.values()
142-
}
143-
available_ids = set(range(self.decoder_batch_size)) - used_ids
144-
assert len(available_ids) > 0
145-
local_table_id = min(available_ids)
146-
147-
if len(self.sliding_window_table) > self.decoder_batch_size:
148-
raise ValueError(
149-
"Sliding window table size must not exceed the batch size."
150-
)
151125

152-
return [local_table_id], [], [attention_mask]
126+
get_extra_values_fn = None
127+
attention_mask = None
153128

129+
if is_prompt:
130+
attention_mask = torch.ones_like(input_ids).squeeze(0)
154131
else:
155-
local_table_ids: List[int] = []
156-
padded_cache_lengths: List[int] = []
157-
attention_masks: List[torch.Tensor] = []
132+
get_extra_values_fn = lambda entry: (
133+
entry.padded_cache_length,
134+
entry.attention_mask,
135+
)
158136

159-
for request_id in running_requests_ids:
160-
sliding_window = self.sliding_window_table[request_id]
161-
local_table_ids.append(sliding_window.local_table_id)
162-
padded_cache_lengths.append(sliding_window.padded_cache_length)
163-
attention_masks.append(sliding_window.attention_mask)
137+
result = self.get_table_mapping_values(
138+
self.sliding_window_table,
139+
self.decoder_batch_size,
140+
is_prompt,
141+
finished_requests_ids,
142+
running_requests_ids,
143+
get_entry_fn=lambda entry: entry.local_table_id,
144+
get_extra_values_fn=get_extra_values_fn,
145+
)
164146

165-
return local_table_ids, padded_cache_lengths, attention_masks
147+
if is_prompt:
148+
result = cast(list[int], result)
149+
table_ids = result
150+
return table_ids, [], [attention_mask]
151+
else:
152+
result = cast(Tuple[list[int], list[int], list[torch.Tensor]],
153+
result)
154+
table_ids, padded_cache_lengths, attention_masks = result
155+
return table_ids, padded_cache_lengths, attention_masks
166156

167157
def get_pixel_values(self, model_input: ModelInputForRBLN):
168158
image_input = None

vllm_rbln/model_executor/models/optimum/model_base.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import os
1616
from functools import cache
1717
from pathlib import Path
18-
from typing import Optional
18+
from typing import Any, Callable, Dict, Optional, Tuple, Union
1919

2020
import optimum.rbln
2121
import torch
@@ -238,3 +238,64 @@ def select_lower_bounded_batch_size(self, original_batch_size: int,
238238
decoder_batch_sizes: tuple):
239239
index = bisect.bisect_left(decoder_batch_sizes, original_batch_size)
240240
return decoder_batch_sizes[index]
241+
242+
243+
class RBLNOptimumDictTableMixin:
    """
    Mixin for models using a request-ID keyed table implemented as a dictionary.
    """

    def get_table_mapping_values(
        self,
        table_mapping: Dict[str, Any],
        decoder_batch_size: int,
        is_prompt: bool,
        finished_requests_ids: list[str],
        running_requests_ids: list[str],
        get_entry_fn: Optional[Callable[[Any], Any]] = None,
        get_extra_values_fn: Optional[Callable[[Any],
                                               Union[Any, Tuple[Any,
                                                                ...]]]] = None,
    ) -> Union[list[int], Tuple[list[int], ...]]:
        """Resolve per-request table IDs (and optional extra values).

        Prefill (``is_prompt=True``): returns a single-element list holding
        the table ID to use — either recycled from the first finished
        request (all finished entries are evicted from the table), or the
        smallest ID not currently in use.

        Decode (``is_prompt=False``): returns the table IDs of the running
        requests; when ``get_extra_values_fn`` is given, also returns one
        list per extra value, column-wise.

        Args:
            table_mapping: Request-ID keyed dict of table entries (mutated:
                finished entries are popped during prefill).
            decoder_batch_size: Total number of table slots available.
            is_prompt: True for the prefill path, False for decode.
            finished_requests_ids: IDs whose entries may be recycled.
            running_requests_ids: IDs to look up on the decode path.
            get_entry_fn: Optional extractor mapping an entry to its table
                ID; when omitted, the entry itself is the ID.
            get_extra_values_fn: Optional extractor returning one extra
                value or a tuple of extra values per entry.
        """
        if is_prompt:
            if finished_requests_ids:
                # Recycle the slot of the first finished request, then
                # evict every finished entry from the table.
                first_id = finished_requests_ids[0]
                first_entry = table_mapping[first_id]
                table_id = get_entry_fn(
                    first_entry) if get_entry_fn else first_entry

                for request_id in finished_requests_ids:
                    table_mapping.pop(request_id)
            else:
                used_ids = {
                    get_entry_fn(v) if get_entry_fn else v
                    for v in table_mapping.values()
                }
                available_ids = set(range(decoder_batch_size)) - used_ids
                assert available_ids, "No available table IDs"
                table_id = min(available_ids)
            return [table_id]

        table_ids = []
        extra_values = []

        for request_id in running_requests_ids:
            entry = table_mapping[request_id]
            table_id = get_entry_fn(entry) if get_entry_fn else entry
            table_ids.append(table_id)

            if get_extra_values_fn:
                result = get_extra_values_fn(entry)
                # Normalize a single extra value into a 1-tuple so the
                # transpose below works uniformly.
                if not isinstance(result, tuple):
                    result = (result, )
                extra_values.append(result)

        if get_extra_values_fn:
            # Transpose row-wise tuples [(a1, b1), (a2, b2)] into
            # column-wise lists ([a1, a2], [b1, b2]).
            extra_values_lists: list[list[Any]] = [
                list(col) for col in zip(*extra_values)
            ]
            return (table_ids, *extra_values_lists)
        return table_ids

    def clear_table_mapping(self, table_mapping: Dict[str, Any]):
        """Remove every entry from the request-ID keyed table."""
        table_mapping.clear()

0 commit comments

Comments
 (0)