Skip to content

Commit 7d4c636

Browse files
wanghangkai and claude committed
feat: add MiniMax-M2.7 model support
Add support for the MiniMax-M2.7-W8A8-INT8-Dynamic model on Kunlun P800 XPU:

1. ops/activation.py: Register SiluAndMul/GeluAndMul via the OOT mechanism to use XPU kernels instead of the Python fallback.
2. quantization/compressed_tensors/compressed_tensors_moe.py: Fix MoE routing by reading scoring_func from the layer attributes (MiniMax uses sigmoid instead of the default softmax).
3. reasoning/minimax_m2_reasoning_parser.py: Add a reasoning parser for the MiniMax-M2.7 model's <think>...</think> tokens.

Tested on P800 x4 with TP=4; verified output quality (no repetition).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent bbcbf36 commit 7d4c636

5 files changed

Lines changed: 172 additions & 0 deletions

File tree

vllm_kunlun/ops/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,5 +27,8 @@
2727
import vllm_kunlun.ops.vocab_parallel_embedding
2828
import vllm_kunlun.v1.sample.spec_decode.eagle # noqa: F401
2929

30+
# activation ops (SiluAndMul, GeluAndMul OOT registration)
31+
import vllm_kunlun.ops.activation
32+
3033
# TODO @xyDong0223 remove v0.16.0
3134
# import vllm_kunlun.ops.mla

vllm_kunlun/ops/activation.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#
2+
# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# This file is a part of the vllm-kunlun project.
16+
#
17+
"""Kunlun-optimized SiluAndMul/GeluAndMul via CustomOp.register_oot."""
18+
19+
import torch
20+
from vllm.model_executor.custom_op import CustomOp
21+
from vllm.model_executor.layers.activation import SiluAndMul, GeluAndMul
22+
23+
24+
@CustomOp.register_oot(name="SiluAndMul")
class KunlunSiluAndMul(SiluAndMul):
    """Out-of-tree ``SiluAndMul`` registered under the name "SiluAndMul".

    ``forward_oot`` delegates the computation to the
    ``torch.ops._C.silu_and_mul`` custom op instead of a Python
    implementation.
    """

    def __init__(self, *, compile_native: bool = True):
        """Initialise the op through ``CustomOp.__init__`` only.

        Args:
            compile_native: Forwarded unchanged to ``CustomOp.__init__``.
        """
        # NOTE(review): SiluAndMul.__init__ is bypassed on purpose here;
        # presumably to avoid its platform-specific setup -- confirm.
        CustomOp.__init__(self, compile_native=compile_native)

    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
        """Run the fused kernel and return its freshly allocated output.

        Args:
            x: Input tensor; the result keeps every leading dimension and
                has a last dimension of ``x.shape[-1] // 2``.

        Returns:
            A new tensor with the dtype and device of ``x``, filled in by
            ``torch.ops._C.silu_and_mul``.
        """
        half_width = x.shape[-1] // 2
        result_shape = (*x.shape[:-1], half_width)
        result = torch.empty(result_shape, dtype=x.dtype, device=x.device)
        # The kernel writes into its first argument; nothing is returned.
        torch.ops._C.silu_and_mul(result, x)
        return result
36+
37+
38+
@CustomOp.register_oot(name="GeluAndMul")
class KunlunGeluAndMul(GeluAndMul):
    """Out-of-tree ``GeluAndMul`` registered under the name "GeluAndMul".

    ``forward_oot`` dispatches to ``torch.ops._C.gelu_tanh_and_mul`` when
    ``approximate == "tanh"`` and to ``torch.ops._C.gelu_and_mul``
    otherwise.
    """

    def __init__(self, approximate: str = "none"):
        """Initialise the op through ``CustomOp.__init__`` only.

        Args:
            approximate: GELU variant to use, either ``"none"`` or
                ``"tanh"``.

        Raises:
            ValueError: If ``approximate`` is neither ``"none"`` nor
                ``"tanh"``.
        """
        # GeluAndMul.__init__ is bypassed below, so its argument check has
        # to be repeated here. Without it a misspelled mode (e.g. "Tanh")
        # would silently fall through to the exact-GELU kernel.
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")
        CustomOp.__init__(self)
        self.approximate = approximate

    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
        """Run the selected fused kernel and return its output.

        Args:
            x: Input tensor; the result keeps every leading dimension and
                has a last dimension of ``x.shape[-1] // 2``.

        Returns:
            A new tensor with the dtype and device of ``x``, filled in by
            the kernel chosen from ``self.approximate``.
        """
        d = x.shape[-1] // 2
        out = torch.empty(x.shape[:-1] + (d,), dtype=x.dtype, device=x.device)
        # Both kernels write into ``out`` in place.
        if self.approximate == "tanh":
            torch.ops._C.gelu_tanh_and_mul(out, x)
        else:
            torch.ops._C.gelu_and_mul(out, x)
        return out

vllm_kunlun/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,17 @@ def apply_monolithic(
132132
e_score_correction_bias: Optional[torch.Tensor] = None,
133133
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
134134
hidden_states = x
135+
136+
# Read correct config from FusedMoE layer attributes (MiniMax-M2.7 fix)
137+
if hasattr(layer, "scoring_func") and scoring_func == "softmax":
138+
scoring_func = layer.scoring_func
139+
if hasattr(layer, "e_score_correction_bias") and e_score_correction_bias is None:
140+
e_score_correction_bias = layer.e_score_correction_bias
141+
if hasattr(layer, "num_expert_group") and num_expert_group is None:
142+
num_expert_group = layer.num_expert_group
143+
if hasattr(layer, "topk_group") and topk_group is None:
144+
topk_group = layer.topk_group
145+
135146
global_num_experts, up_gate_size, _ = layer.w13_weight.shape
136147
M, N = hidden_states.shape
137148
hidden_dim = layer.w2_weight.shape[1]

vllm_kunlun/reasoning/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
REASONING_PARSERS = {
1111
"qwen3": (".qwen3_reasoning_parser", "Qwen3ReasoningParser"),
1212
"gemma4": (".gemma4_reasoning_parser", "Gemma4ReasoningParser"),
13+
"minimax_m2": (".minimax_m2_reasoning_parser", "MiniMaxM2ReasoningParser"),
1314
}
1415

1516

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#
2+
# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# This file is a part of the vllm-kunlun project.
16+
#
17+
"""Reasoning parser for MiniMax-M2.7 model."""
18+
19+
from collections.abc import Sequence
20+
from typing import TYPE_CHECKING
21+
22+
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
23+
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
24+
25+
if TYPE_CHECKING:
26+
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
27+
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
28+
from vllm.tokenizers import TokenizerLike
29+
30+
31+
class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the MiniMax-M2.7 model.

    The model wraps its reasoning text in <think>...</think> tokens,
    similar to Qwen3. This parser separates that reasoning text from the
    regular content, both for complete outputs and for streamed deltas.
    """

    def __init__(self, tokenizer: "TokenizerLike", *args, **kwargs):
        """Set up the base parser and read the ``enable_thinking`` switch.

        ``enable_thinking`` is looked up in the ``chat_template_kwargs``
        keyword argument and defaults to ``True`` when absent.
        """
        super().__init__(tokenizer, *args, **kwargs)
        # A missing or falsy ``chat_template_kwargs`` is treated as empty.
        template_options = kwargs.get("chat_template_kwargs") or {}
        self.thinking_enabled = template_options.get("enable_thinking", True)

    @property
    def start_token(self) -> str:
        """Text that opens a reasoning section."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """Text that closes a reasoning section."""
        return "</think>"

    def extract_reasoning(
        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
    ) -> tuple[str | None, str | None]:
        """Split a complete model output into ``(reasoning, content)``.

        Everything up to and including the first start token is discarded.
        If no end token follows, the remaining text is returned as
        reasoning when thinking is enabled and as content otherwise.
        If an end token is present, the text before it is the reasoning
        and the text after it is the content (``None`` when empty).
        """
        _, opener, after_opener = model_output.partition(self.start_token)
        if opener:
            # Start token found: keep only what follows it.
            model_output = after_opener

        reasoning, closer, content = model_output.partition(self.end_token)
        if not closer:
            # No end token anywhere in the remaining text.
            if self.thinking_enabled:
                return model_output, None
            return None, model_output

        return reasoning, content or None

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Classify one streamed delta as reasoning and/or content.

        Returns ``None`` when the delta carries no text to emit.
        """
        if not self.thinking_enabled:
            # Thinking disabled: every non-empty delta is plain content.
            if delta_text:
                return DeltaMessage(content=delta_text)
            return None

        if self.start_token_id in delta_token_ids:
            # Drop the start token (and anything before it) from the text.
            opener_at = delta_text.find(self.start_token)
            if opener_at != -1:
                delta_text = delta_text[opener_at + len(self.start_token) :]

        if self.end_token_id in delta_token_ids:
            closer_at = delta_text.find(self.end_token)
            if closer_at == -1:
                # End token id is present but its text is not in the delta.
                return None
            # Text before the end token is reasoning, text after is content.
            reasoning = delta_text[:closer_at] or None
            content = delta_text[closer_at + len(self.end_token) :] or None
            if reasoning is None and content is None:
                return None
            return DeltaMessage(reasoning=reasoning, content=content)

        if not delta_text:
            # Nothing left once the start token was removed.
            return None
        if self.end_token_id in previous_token_ids:
            # The end token was already seen: this is content.
            return DeltaMessage(content=delta_text)
        # No end token so far: still inside the reasoning section.
        return DeltaMessage(reasoning=delta_text)

0 commit comments

Comments
 (0)