Skip to content

Commit 73691fa

Browse files
authored
Add Qwen3 models (#11)
* qwen reranker * encoder * make qwen3forcausallm -> qwen3model for embedding * pre-commit * fix running script * add note * add comment * adhoc for pooling configuration in qwen3 embedding model * pre-commit
1 parent 0619506 commit 73691fa

6 files changed

Lines changed: 341 additions & 12 deletions

File tree

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# Copyright 2025 Rebellions Inc. All rights reserved.
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at:
6+
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import asyncio
16+
17+
import fire
18+
import torch
19+
from vllm import AsyncEngineArgs, AsyncLLMEngine, PoolingParams
20+
21+
22+
def get_input_prompts() -> list[str]:
    """Build the example embedding inputs: two instructed queries
    followed by two plain documents (queries first, documents second)."""

    def _with_instruction(task_description: str, query: str) -> str:
        # Prefix the query with its task instruction, as required by the
        # Qwen3 embedding prompt format.
        return f'Instruct: {task_description}\nQuery:{query}'

    # Each query must come with a one-sentence instruction that
    # describes the task.
    instruction = ('Given a web search query, '
                   'retrieve relevant passages that answer the query')

    raw_queries = ['What is the capital of China?', 'Explain gravity']
    instructed_queries = [
        _with_instruction(instruction, q) for q in raw_queries
    ]

    corpus = [
        "The capital of China is Beijing.",
        ("Gravity is a force that attracts two bodies towards each other. "
         "It gives weight to physical objects and "
         "is responsible for the movement of planets around the sun.")
    ]

    return instructed_queries + corpus
45+
46+
47+
async def embed(engine: AsyncLLMEngine, prompt: str, model: str,
                requst_id: int):
    """Submit one embedding request to the engine and return the final
    (completed) request output.

    NOTE(review): `requst_id` is a typo for `request_id`, and `model` is
    unused; both are kept unchanged so existing callers keep working.
    """
    print(f"embed request_id={requst_id}, prompt={prompt}")
    stream = engine.encode(prompt, PoolingParams(), str(requst_id))

    # Drain the async stream; the last yielded item is the finished result.
    last_output = None
    async for partial in stream:
        last_output = partial
    return last_output
62+
63+
64+
async def main(
    batch_size: int,
    max_seq_len: int,
    kvcache_block_size: int,
    num_input_prompt: int,
    model_id: str,
):
    """Embed the example prompts and print query/document similarity scores.

    Args:
        batch_size: maximum number of concurrent sequences in the engine.
        max_seq_len: maximum model length and batched-token budget.
        kvcache_block_size: KV-cache block size for the engine.
        num_input_prompt: number of query/document pairs to embed.
        model_id: model path or identifier to load.

    Raises:
        RuntimeError: if the prompt list does not contain exactly
            ``2 * num_input_prompt`` entries (queries then documents).
    """
    engine_args = AsyncEngineArgs(model=model_id,
                                  device="auto",
                                  max_num_seqs=batch_size,
                                  max_num_batched_tokens=max_seq_len,
                                  max_model_len=max_seq_len,
                                  block_size=kvcache_block_size,
                                  task="embed")

    engine = AsyncLLMEngine.from_engine_args(engine_args)
    prompt_list = get_input_prompts()
    # BUG FIX: the original passed two comma-separated strings to
    # RuntimeError (yielding a tuple message instead of concatenation) and
    # only rejected over-long lists with `>`, so a too-short list would
    # silently mis-slice the score matrix below. Enforce exact length.
    if len(prompt_list) != 2 * num_input_prompt:
        raise RuntimeError("The len(QUERIES) and len(DOCUMENTS) "
                           "should be equal with 2 * `num_input_prompt`.")
    futures = []
    for i, p in enumerate(prompt_list):
        if i == num_input_prompt * 2:
            break
        futures.append(
            asyncio.create_task(
                embed(
                    engine,
                    prompt=p,
                    model=model_id,
                    requst_id=i,
                )))

    outputs = await asyncio.gather(*futures)

    # Rows are queries, columns are documents: scores[i][j] = q_i . d_j.
    embeddings = torch.stack([o.outputs.data for o in outputs])
    scores = (embeddings[:num_input_prompt] @ embeddings[num_input_prompt:].T)

    print(f"scores: {scores.tolist()}")
103+
104+
105+
def entry_point(
    batch_size: int = 1,
    max_seq_len: int = 32768,
    kvcache_block_size: int = 32768,
    num_input_prompt: int = 2,
    model_id: str = "/qwen3-0.6b-b1-embedding",
):
    """CLI entry point: run the async embedding demo to completion.

    FIX: replaced the deprecated ``asyncio.get_event_loop()`` /
    ``run_until_complete`` pattern with ``asyncio.run``, which creates
    and cleanly closes a fresh event loop (the old pattern warns and can
    fail on Python 3.10+ when no loop is running).
    """
    asyncio.run(
        main(
            batch_size=batch_size,
            max_seq_len=max_seq_len,
            kvcache_block_size=kvcache_block_size,
            num_input_prompt=num_input_prompt,
            model_id=model_id,
        ))


if __name__ == "__main__":
    fire.Fire(entry_point)
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
# Copyright 2025 Rebellions Inc. All rights reserved.
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at:
6+
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import asyncio
16+
import math
17+
18+
import fire
19+
from transformers import AutoTokenizer
20+
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
21+
from vllm.inputs.data import TokensPrompt
22+
23+
SUFFIX = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
24+
25+
26+
def format_instruction(instruction, query, doc):
    """Build the two-message chat used by the Qwen3 reranker: a fixed
    yes/no judging system prompt plus one instruct/query/document turn."""
    system_content = ("Judge whether the Document meets the requirements "
                      "based on the Query and the Instruct provided. "
                      "Note that the answer can only be \"yes\" or \"no\".")
    user_content = (
        f"<Instruct>: {instruction}\n\n<Query>: {query}\n\n<Document>: {doc}")
    return [
        {
            "role": "system",
            "content": system_content,
        },
        {
            "role": "user",
            "content": user_content,
        },
    ]
40+
41+
42+
def process_inputs(pairs, instruction, max_length, suffix_tokens, tokenizer):
    """Tokenize (query, doc) pairs into ``TokensPrompt`` objects,
    truncating each chat to ``max_length`` tokens before appending the
    fixed no-thinking assistant suffix."""
    chats = [format_instruction(instruction, q, d) for q, d in pairs]
    token_lists = tokenizer.apply_chat_template(chats,
                                                tokenize=True,
                                                add_generation_prompt=False,
                                                enable_thinking=False)
    # Truncate first so that truncation never eats into the suffix.
    return [
        TokensPrompt(prompt_token_ids=ids[:max_length] + suffix_tokens)
        for ids in token_lists
    ]
53+
54+
55+
def get_input_prompts(model_id, max_length, suffix_tokens,
                      tokenizer) -> "list[TokensPrompt]":
    """Build tokenized reranker prompts for the example query/doc pairs.

    FIX: the return annotation claimed ``list[str]`` but the function
    returns the ``TokensPrompt`` list produced by ``process_inputs``.
    ``model_id`` is unused but kept for signature compatibility.

    Args:
        model_id: unused (kept for caller compatibility).
        max_length: total token budget per prompt, including the suffix.
        suffix_tokens: token ids of the fixed assistant suffix.
        tokenizer: tokenizer providing ``apply_chat_template``.
    """
    task = ('Given a web search query, '
            'retrieve relevant passages that answer the query')
    queries = [
        "What is the capital of China?",
        "Explain gravity",
    ]
    documents = [
        "The capital of China is Beijing.",
        ("Gravity is a force that attracts two bodies towards each other. "
         "It gives weight to physical objects and "
         "is responsible for the movement of planets around the sun.")
    ]

    pairs = list(zip(queries, documents))
    # Reserve room for the suffix so truncation + suffix stays within
    # max_length tokens overall.
    inputs = process_inputs(pairs, task, max_length - len(suffix_tokens),
                            suffix_tokens, tokenizer)

    return inputs
75+
76+
77+
async def generate(engine: AsyncLLMEngine, prompt_tokens: list[int],
                   model: str, requst_id: int, true_token: int,
                   false_token: int):
    """Run one single-token reranker request and return its final output.

    Sampling is greedy, emits exactly one token, and restricts the
    vocabulary to the "yes"/"no" token ids so that ``compute_logits``
    can turn the returned logprobs into a relevance score.

    NOTE(review): `requst_id` is a typo for `request_id`, and `model` is
    unused; both are kept unchanged so existing callers keep working.
    """
    print(f"generate request_id={requst_id}, prompt_tokens={prompt_tokens}")

    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=1,
        logprobs=20,
        allowed_token_ids=[true_token, false_token],
    )

    # FIX: dropped the dead `example_input` dict — its "stream" and
    # "temperature" keys were never read; only the request-id string was.
    results_generator = engine.generate(
        prompt_tokens,
        sampling_params,
        str(requst_id),
    )

    # Drain the stream; the last yielded item is the finished result.
    final_output = None
    async for request_output in results_generator:
        final_output = request_output
    return final_output
105+
106+
107+
def compute_logits(outputs, true_token, false_token):
    """Convert per-request "yes"/"no" logprobs into P(yes) scores.

    For each request output, reads the logprob map of the last generated
    position; a token absent from that map is assigned a logit of -10
    (a near-zero probability floor). Returns one score per output.
    """
    scores = []
    for request_output in outputs:
        token_logprobs = request_output.outputs[0].logprobs[-1]

        def _logit_of(token_id):
            # Fall back to -10 when the token did not make the returned
            # top-logprobs set.
            if token_id in token_logprobs:
                return token_logprobs[token_id].logprob
            return -10

        yes_weight = math.exp(_logit_of(true_token))
        no_weight = math.exp(_logit_of(false_token))
        scores.append(yes_weight / (yes_weight + no_weight))
    return scores
124+
125+
126+
async def main(
    batch_size: int,
    max_seq_len: int,
    kvcache_block_size: int,
    num_input_prompt: int,
    model_id: str,
):
    """Score the example query/document pairs with the Qwen3 reranker
    and print the resulting P(yes) relevance scores."""
    engine_args = AsyncEngineArgs(model=model_id,
                                  device="auto",
                                  max_num_seqs=batch_size,
                                  max_num_batched_tokens=max_seq_len,
                                  max_model_len=max_seq_len,
                                  block_size=kvcache_block_size)

    # The tokenizer is needed both for chat templating and to locate the
    # "yes"/"no" answer token ids used to constrain decoding.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token
    suffix_tokens = tokenizer.encode(SUFFIX, add_special_tokens=False)

    yes_id = tokenizer("yes", add_special_tokens=False).input_ids[0]
    no_id = tokenizer("no", add_special_tokens=False).input_ids[0]

    engine = AsyncLLMEngine.from_engine_args(engine_args)
    prompts = get_input_prompts(model_id, max_seq_len, suffix_tokens,
                                tokenizer)

    # Fan out up to num_input_prompt concurrent generate() calls.
    tasks = [
        asyncio.create_task(
            generate(engine,
                     prompt_tokens=prompt,
                     model=model_id,
                     requst_id=idx,
                     true_token=yes_id,
                     false_token=no_id))
        for idx, prompt in enumerate(prompts[:num_input_prompt])
    ]

    results = await asyncio.gather(*tasks)
    score = compute_logits(results, yes_id, no_id)
    print(f"scores: {score}")
167+
168+
169+
def entry_point(
    batch_size: int = 1,
    max_seq_len: int = 32768,
    kvcache_block_size: int = 32768,
    num_input_prompt: int = 2,
    model_id: str = "/qwen3-0.6b-b1",
):
    """CLI entry point: run the async reranker demo to completion.

    FIX: replaced the deprecated ``asyncio.get_event_loop()`` /
    ``run_until_complete`` pattern with ``asyncio.run``, which creates
    and cleanly closes a fresh event loop (the old pattern warns and can
    fail on Python 3.10+ when no loop is running).
    """
    asyncio.run(
        main(
            batch_size=batch_size,
            max_seq_len=max_seq_len,
            kvcache_block_size=kvcache_block_size,
            num_input_prompt=num_input_prompt,
            model_id=model_id,
        ))


if __name__ == "__main__":
    fire.Fire(entry_point)

vllm_rbln/model_executor/models/optimum/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ def load_model(
6969

7070

7171
__all__ = [
72-
"load_model",
73-
"get_rbln_model_info",
74-
"ModelInputForRBLN",
72+
"load_model", "get_rbln_model_info", "ModelInputForRBLN",
73+
"RBLNOptimumForEncoderModel"
7574
]

vllm_rbln/model_executor/models/optimum/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
"ExaoneForCausalLM": ("exaone", "RBLNExaoneForCausalLM"),
3939
"Qwen2ForCausalLM": ("qwen2", "RBLNQwen2ForCausalLM"),
4040
"OPTForCausalLM": ("opt", "RBLNOPTForCausalLM"),
41+
"Qwen3ForCausalLM": ("qwen3", "RBLNQwen3ForCausalLM"),
4142
}
4243

4344
_RBLN_ENCODER_DECODER_MODELS: Dict[str, Tuple[str, str]] = {
@@ -71,6 +72,7 @@
7172
"XLMRobertaForSequenceClassification":
7273
("xlm_roberta_classification", "RBLNXLMRobertaForSequenceClassification"),
7374
"XLMRobertaModel": ("xlm_roberta", "RBLNXLMRobertaModel"),
75+
"Qwen3Model": ("qwen3", "RBLNQwen3Model"),
7476
}
7577

7678
_RBLN_SUPPORTED_MODELS = {

vllm_rbln/model_executor/models/optimum/encoder.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,14 +114,17 @@ def forward(self, model_input: ModelInputForRBLN) -> torch.Tensor:
114114
if token_type_ids:
115115
kwargs["token_type_ids"] = token_type_ids
116116
else:
117-
rbln_model_input_names = self.rbln_model_config.model_input_names
118-
if "token_type_ids" in rbln_model_input_names:
119-
kwargs["token_type_ids"] = torch.zeros_like(input_ids)
117+
model_input_names = getattr(self.rbln_model_config,
118+
"model_input_names", None)
119+
if model_input_names is not None:
120+
rbln_model_input_names = \
121+
self.rbln_model_config.model_input_names
122+
if "token_type_ids" in rbln_model_input_names:
123+
kwargs["token_type_ids"] = torch.zeros_like(input_ids)
120124

121125
embeds = self.model.forward(**kwargs)
122126

123127
hidden_states = embeds[0]
124-
125128
if isinstance(hidden_states, tuple):
126129
hidden_states = hidden_states[0]
127130

0 commit comments

Comments
 (0)