feat: add MiniMax-M2.7 model support

wanghangkai · claude · wanghangkai · commit 7d4c636cc891 · 2026-05-27T20:01:19.000+08:00
Add support for MiniMax-M2.7-W8A8-INT8-Dynamic model on Kunlun P800 XPU:

1. ops/activation.py: Register SiluAndMul/GeluAndMul via OOT mechanism
   to use XPU kernels instead of Python fallback

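2. quantization/compressed_tensors/compressed_tensors_moe.py: Fix MoE
   routing by reading scoring_func from the layer's attributes when the
   caller passes the default "softmax" (MiniMax uses sigmoid instead);
   e_score_correction_bias, num_expert_group and topk_group are likewise
   taken from the layer when they are passed as None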

3. reasoning/minimax_m2_reasoning_parser.py: Add reasoning parser for
   the MiniMax-M2.7 model's <think>...</think> tokens

Tested on P800 x4 with TP=4, verified output quality (no repetition).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/vllm_kunlun/ops/__init__.py b/vllm_kunlun/ops/__init__.py
@@ -27,5 +27,8 @@
 import vllm_kunlun.ops.vocab_parallel_embedding
 import vllm_kunlun.v1.sample.spec_decode.eagle  # noqa: F401
 
+# activation ops (SiluAndMul, GeluAndMul OOT registration)
+import vllm_kunlun.ops.activation
+
 # TODO @xyDong0223 remove v0.16.0
 # import vllm_kunlun.ops.mla
diff --git a/vllm_kunlun/ops/activation.py b/vllm_kunlun/ops/activation.py
@@ -0,0 +1,53 @@
+#
+# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-kunlun project.
+#
+"""Kunlun-optimized SiluAndMul/GeluAndMul via CustomOp.register_oot."""
+
+import torch
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.activation import SiluAndMul, GeluAndMul
+
+
+@CustomOp.register_oot(name="SiluAndMul")  # out-of-tree registration under the name "SiluAndMul"
+class KunlunSiluAndMul(SiluAndMul):
+    """Kunlun-optimized SiluAndMul using XPU kernel."""
+
+    def __init__(self, *, compile_native: bool = True):
+        CustomOp.__init__(self, compile_native=compile_native)  # calls CustomOp.__init__ directly, so SiluAndMul.__init__ is not run
+
+    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2  # output's last dim is half of the input's last dim
+        out = torch.empty(x.shape[:-1] + (d,), dtype=x.dtype, device=x.device)  # uninitialized buffer; same dtype/device as x
+        torch.ops._C.silu_and_mul(out, x)  # presumably fills `out` in place (return value is unused) - confirm
+        return out
+
+
+@CustomOp.register_oot(name="GeluAndMul")  # out-of-tree registration under the name "GeluAndMul"
+class KunlunGeluAndMul(GeluAndMul):
+    """Kunlun-optimized GeluAndMul using XPU kernel."""
+
+    def __init__(self, approximate: str = "none"):
+        CustomOp.__init__(self)  # calls CustomOp.__init__ directly, so GeluAndMul.__init__ is not run
+        self.approximate = approximate  # NOTE(review): not validated here; any value other than "tanh" takes the gelu_and_mul path
+
+    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2  # output's last dim is half of the input's last dim
+        out = torch.empty(x.shape[:-1] + (d,), dtype=x.dtype, device=x.device)  # uninitialized buffer; same dtype/device as x
+        if self.approximate == "tanh":  # kernel choice depends only on this string comparison
+            torch.ops._C.gelu_tanh_and_mul(out, x)  # presumably fills `out` in place - confirm
+        else:
+            torch.ops._C.gelu_and_mul(out, x)  # presumably fills `out` in place - confirm
+        return out
diff --git a/vllm_kunlun/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm_kunlun/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -132,6 +132,17 @@ def apply_monolithic(
         e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         hidden_states = x
+
+        # Read correct config from FusedMoE layer attributes (MiniMax-M2.7 fix)
+        if hasattr(layer, "scoring_func") and scoring_func == "softmax":
+            scoring_func = layer.scoring_func
+        if hasattr(layer, "e_score_correction_bias") and e_score_correction_bias is None:
+            e_score_correction_bias = layer.e_score_correction_bias
+        if hasattr(layer, "num_expert_group") and num_expert_group is None:
+            num_expert_group = layer.num_expert_group
+        if hasattr(layer, "topk_group") and topk_group is None:
+            topk_group = layer.topk_group
+
         global_num_experts, up_gate_size, _ = layer.w13_weight.shape
         M, N = hidden_states.shape
         hidden_dim = layer.w2_weight.shape[1]
diff --git a/vllm_kunlun/reasoning/__init__.py b/vllm_kunlun/reasoning/__init__.py
@@ -10,6 +10,7 @@
 REASONING_PARSERS = {
     "qwen3": (".qwen3_reasoning_parser", "Qwen3ReasoningParser"),
     "gemma4": (".gemma4_reasoning_parser", "Gemma4ReasoningParser"),
+    "minimax_m2": (".minimax_m2_reasoning_parser", "MiniMaxM2ReasoningParser"),
 }
 
 
diff --git a/vllm_kunlun/reasoning/minimax_m2_reasoning_parser.py b/vllm_kunlun/reasoning/minimax_m2_reasoning_parser.py
@@ -0,0 +1,104 @@
+#
+# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-kunlun project.
+#
+"""Reasoning parser for MiniMax-M2.7 model."""
+
+from collections.abc import Sequence
+from typing import TYPE_CHECKING
+
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+    from vllm.tokenizers import TokenizerLike
+
+
+class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
+    """
+    Reasoning parser for the MiniMax-M2.7 model.
+
+    MiniMax-M2.7 uses <think>...</think> tokens to denote reasoning text,
+    similar to Qwen3. This parser handles the extraction of reasoning
+    content from model outputs.
+    """
+
+    def __init__(self, tokenizer: "TokenizerLike", *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}  # `or {}` also covers an explicit None value
+        self.thinking_enabled = chat_kwargs.get("enable_thinking", True)  # defaults to True when the key is absent
+
+    @property
+    def start_token(self) -> str:
+        return "<think>"
+
+    @property
+    def end_token(self) -> str:
+        return "</think>"
+
+    def extract_reasoning(
+        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
+    ) -> tuple[str | None, str | None]:
+        model_output_parts = model_output.partition(self.start_token)  # split on the first start_token occurrence
+        model_output = (
+            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]  # text after start_token if found, else the whole output
+        )
+
+        if self.end_token not in model_output:  # no closing token anywhere in the remaining text
+            if not self.thinking_enabled:
+                return None, model_output  # thinking disabled: everything is content
+            return model_output, None  # thinking enabled: everything is reasoning
+
+        reasoning, _, content = model_output.partition(self.end_token)  # split on the first end_token occurrence
+        return reasoning, content or None  # empty content is reported as None
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        if not self.thinking_enabled:
+            return DeltaMessage(content=delta_text) if delta_text else None  # thinking disabled: every non-empty delta is content
+
+        if self.start_token_id in delta_token_ids:
+            start_idx = delta_text.find(self.start_token)
+            if start_idx >= 0:
+                delta_text = delta_text[start_idx + len(self.start_token) :]  # drop the start token and anything before it
+
+        if self.end_token_id in delta_token_ids:
+            end_index = delta_text.find(self.end_token)
+            if end_index >= 0:
+                reasoning = delta_text[:end_index]  # text before the end token in this delta
+                content = delta_text[end_index + len(self.end_token) :]  # text after the end token in this delta
+                if not reasoning and not content:
+                    return None  # the delta held only the end token: nothing to emit
+                return DeltaMessage(
+                    reasoning=reasoning if reasoning else None,
+                    content=content if content else None,
+                )
+            return None  # end token id is in the delta ids but its text is not in delta_text: emit nothing
+
+        if not delta_text:
+            return None
+        elif self.end_token_id in previous_token_ids:  # end token already seen in earlier tokens
+            return DeltaMessage(content=delta_text)
+        else:  # end token not seen yet: treat the delta as reasoning
+            return DeltaMessage(reasoning=delta_text)

Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@`
`10`	`10`	`REASONING_PARSERS = {`
`11`	`11`	`"qwen3": (".qwen3_reasoning_parser", "Qwen3ReasoningParser"),`
`12`	`12`	`"gemma4": (".gemma4_reasoning_parser", "Gemma4ReasoningParser"),`
	`13`	`+ "minimax_m2": (".minimax_m2_reasoning_parser", "MiniMaxM2ReasoningParser"),`
`13`	`14`	`}`
`14`	`15`
`15`	`16`