update

yuhao-zh · yuhao-zh · commit ed80339ce978 · 2026-02-07T19:59:15.000+08:00
diff --git a/docs/user_guide/install.md b/docs/user_guide/install.md
@@ -19,7 +19,7 @@ Note: If you are using DGX Spark, please refer to the Docker installation sectio
 ```sh
 git clone https://github.com/GradientHQ/parallax.git
 cd parallax
-pip install -e ".[gpu]" && pip install mlx-lm==0.30.6 "mlx[cpu]==0.30.4" --no-deps
+pip install -e ".[gpu]" && pip install mlx-lm==0.30.6 --no-deps
 ```
 
 #### For macOS (Apple silicon):
diff --git a/pyproject.toml b/pyproject.toml
@@ -55,9 +55,9 @@ mac = [
 gpu = [
   "sglang[all] @ git+https://github.com/sgl-project/sglang.git@9409c43593f2d6d64595981abf216a15752b0875#subdirectory=python",
   "mlx-lm==0.28.4",
-  "mlx[cpu]==0.30.0",
-  # due to transformers version conflict, we need to install mlx-lm and mlx separately
-  # pip install mlx-lm==0.30.6 "mlx[cpu]==0.30.4" --no-deps
+  "mlx[cpu]==0.30.4",
+  # due to transformers version conflict, we need to install mlx-lm separately
+  # pip install mlx-lm==0.30.6 --no-deps
 ]
 
 vllm = [
diff --git a/src/parallax/server/executor/base_executor.py b/src/parallax/server/executor/base_executor.py
@@ -158,6 +158,7 @@ def __init__(
 
         self.eos_token_id = self._config_accessor.get_eos_token_id()
 
+        self._augment_eos_with_im_end()
         # Build multimodal config (only meaningful for VLM models)
         self.mm_config = self._config_accessor.build_mm_config()
 
@@ -628,6 +629,37 @@ def shutdown(self):
 
         logger.debug("Executor shutdown complete.")
 
+    def _augment_eos_with_im_end(self):
+        """Add ``<|im_end|>`` to ``self.eos_token_id`` when the tokenizer
+        vocabulary defines it but the configured EOS value omits it.
+
+        Chat models such as Kimi-K2.5 and Qwen end a turn with ``<|im_end|>``
+        while their ``config.json`` may list only ``[EOS]``; without this the
+        scheduler would not detect end-of-turn and would run to ``max_tokens``.
+        No-op if the tokenizer has no ``get_vocab`` or lacks the token.
+        """
+        _get_vocab = getattr(self.tokenizer, "get_vocab", None)
+        vocab = _get_vocab() if _get_vocab else {}
+        im_end_id = vocab.get("<|im_end|>")
+        if im_end_id is None:
+            return
+
+        # eos_token_id may be None, a list, or an int; any other type is left as-is
+        if self.eos_token_id is None:
+            self.eos_token_id = [im_end_id]
+            logger.info(f"Set eos_token_id to [{im_end_id}] (<|im_end|>)")
+        elif isinstance(self.eos_token_id, list):
+            if im_end_id not in self.eos_token_id:
+                self.eos_token_id.append(im_end_id)  # in-place: mutates the existing list object
+                logger.info(f"Added <|im_end|> (id={im_end_id}) to eos_token_id list")
+        elif isinstance(self.eos_token_id, int):
+            if self.eos_token_id != im_end_id:
+                self.eos_token_id = [self.eos_token_id, im_end_id]
+                logger.info(
+                    f"Expanded eos_token_id to {self.eos_token_id} "
+                    f"(added <|im_end|> id={im_end_id})"
+                )
+
     def _process_text_request(self, rid: str, messages: list, raw_request: Dict) -> list:
         """Process a text-only request using the tokenizer."""
         if self.tokenizer.chat_template:
@@ -748,6 +780,29 @@ def _handle_raw_request(self, raw_request: Dict):
             if "ignore_eos" in raw_sampling_params:
                 sampling_params.ignore_eos = raw_sampling_params["ignore_eos"]
 
+        # Fallback: read OpenAI-style top-level sampling parameters only when raw_sampling_params is None
+        if "temperature" in raw_request and raw_sampling_params is None:
+            sampling_params.temperature = raw_request["temperature"]
+            if sampling_params.temperature == 0.0:
+                sampling_params.temperature = 1.0
+                sampling_params.top_k = 1
+        if "top_p" in raw_request and raw_sampling_params is None:
+            sampling_params.top_p = raw_request["top_p"]
+
+        # When tools are present, add tool-call-related stop token IDs so the
+        # scheduler halts generation at the tool-call boundary instead of
+        # running until max_tokens.
+        tools = raw_request.get("tools")
+        if tools and self.tokenizer is not None:
+            from parallax.utils.tokenizer_utils import get_tool_call_stop_token_ids
+
+            tool_stop_ids = get_tool_call_stop_token_ids(self.tokenizer)
+            if tool_stop_ids:
+                if sampling_params.stop_token_ids is None:
+                    sampling_params.stop_token_ids = set()
+                sampling_params.stop_token_ids.update(tool_stop_ids)
+                logger.debug(f"Added tool call stop token IDs for request {rid}: {tool_stop_ids}")
+
         req = InitialRequest(
             request_id=rid,
             output_ids=None,
diff --git a/src/parallax/server/scheduler.py b/src/parallax/server/scheduler.py
@@ -224,6 +224,15 @@ def check_and_update_request_status(self, request: InitialRequest) -> bool:
         ):
             request.update_status(RequestStatus.FINISHED_EOS)
             finished = True
+        elif (
+            not finished
+            and not request.sampling_params.ignore_eos
+            and request.sampling_params.stop_token_ids
+            and last_token_id is not None
+            and last_token_id in request.sampling_params.stop_token_ids
+        ):
+            request.update_status(RequestStatus.FINISHED_EOS)
+            finished = True
         elif request.output_length >= request.max_new_tokens:
             request.update_status(RequestStatus.FINISHED_MAX_LENGTH)
             finished = True
diff --git a/src/parallax/utils/tokenizer_utils.py b/src/parallax/utils/tokenizer_utils.py
@@ -128,6 +128,36 @@ def load_tokenizer(model_path, trust_remote_code=True, tokenizer_config_extra=No
     return _mlx_load_tokenizer(model_path, tokenizer_config_extra=tokenizer_config_extra, **kwargs)
 
 
+def get_tool_call_stop_token_ids(tokenizer) -> List[int]:
+    """Return token IDs that should act as *stop tokens* for tool call generation.
+
+    When the model generates one of these tokens the scheduler should treat it
+    as end-of-sequence so that the HTTP handler can inspect the generated text
+    and extract tool calls.
+
+    Only markers found in ``tokenizer.get_vocab()`` contribute; the result is
+    de-duplicated, unordered, and empty if the tokenizer lacks ``get_vocab``.
+    Tool call *parsing* (``has_tool_calling``, ``tool_parser``, etc.) is left
+    to the updated ``mlx-lm`` ``TokenizerWrapper``.
+    """
+    stop_ids: List[int] = []
+    _get_vocab = getattr(tokenizer, "get_vocab", None)
+    vocab = _get_vocab() if _get_vocab else {}
+
+    # Markers whose token IDs should halt generation
+    markers = [
+        "<|tool_calls_section_end|>",  # Kimi K2 / K2.5
+        "<|im_end|>",  # common chat turn-end token
+    ]
+
+    for marker in markers:
+        token_id = vocab.get(marker)
+        if token_id is not None:
+            stop_ids.append(token_id)
+
+    return list(set(stop_ids))  # set() drops duplicate IDs; order is not preserved
+
+
 @dataclass
 class ToolCallState:
     has_tool_calling: bool