Skip to content

Commit ed80339

Browse files
committed
update
1 parent 2dea44b commit ed80339

5 files changed

Lines changed: 98 additions & 4 deletions

File tree

docs/user_guide/install.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Note: If you are using DGX Spark, please refer to the Docker installation sectio
1919
```sh
2020
git clone https://github.com/GradientHQ/parallax.git
2121
cd parallax
22-
pip install -e ".[gpu]" && pip install mlx-lm==0.30.6 "mlx[cpu]==0.30.4" --no-deps
22+
pip install -e ".[gpu]" && pip install mlx-lm==0.30.6 --no-deps
2323
```
2424

2525
#### For macOS (Apple silicon):

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,9 @@ mac = [
5555
gpu = [
5656
"sglang[all] @ git+https://github.com/sgl-project/sglang.git@9409c43593f2d6d64595981abf216a15752b0875#subdirectory=python",
5757
"mlx-lm==0.28.4",
58-
"mlx[cpu]==0.30.0",
59-
# due to transformers version conflict, we need to install mlx-lm and mlx separately
60-
# pip install mlx-lm==0.30.6 "mlx[cpu]==0.30.4" --no-deps
58+
"mlx[cpu]==0.30.4",
59+
# Due to a transformers version conflict, mlx-lm must be installed separately:
60+
# pip install mlx-lm==0.30.6 --no-deps
6161
]
6262

6363
vllm = [

src/parallax/server/executor/base_executor.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def __init__(
158158

159159
self.eos_token_id = self._config_accessor.get_eos_token_id()
160160

161+
self._augment_eos_with_im_end()
161162
# Build multimodal config (only meaningful for VLM models)
162163
self.mm_config = self._config_accessor.build_mm_config()
163164

@@ -628,6 +629,37 @@ def shutdown(self):
628629

629630
logger.debug("Executor shutdown complete.")
630631

632+
def _augment_eos_with_im_end(self):
    """Make sure ``<|im_end|>`` counts as an EOS token when the vocabulary has it.

    Many chat models (Kimi-K2.5, Qwen, etc.) use ``<|im_end|>`` as the
    turn-ending token, yet their ``config.json`` only lists ``[EOS]`` as
    the EOS token.  Without this augmentation the scheduler will never
    detect end-of-turn and generation will run until ``max_tokens``.

    Reads ``self.tokenizer`` (via its optional ``get_vocab()``) and updates
    ``self.eos_token_id`` as follows:

    * ``None``                      -> ``[im_end_id]``
    * ``int``                       -> ``[eos, im_end_id]`` (untouched if equal)
    * ``list``                      -> ``im_end_id`` appended in place if missing
    * ``tuple``/``set``/``frozenset`` -> replaced by a list with ``im_end_id``
      appended if missing
    * anything else                 -> left untouched, with a warning logged

    Nothing happens when the tokenizer exposes no ``get_vocab`` or the
    vocabulary does not contain ``<|im_end|>``.
    """
    get_vocab = getattr(self.tokenizer, "get_vocab", None)
    vocab = get_vocab() if get_vocab else {}
    im_end_id = vocab.get("<|im_end|>")
    if im_end_id is None:
        return

    eos = self.eos_token_id
    if eos is None:
        self.eos_token_id = [im_end_id]
        logger.info(f"Set eos_token_id to [{im_end_id}] (<|im_end|>)")
    elif isinstance(eos, list):
        if im_end_id not in eos:
            eos.append(im_end_id)
            logger.info(f"Added <|im_end|> (id={im_end_id}) to eos_token_id list")
    elif isinstance(eos, int):
        if eos != im_end_id:
            self.eos_token_id = [eos, im_end_id]
            logger.info(
                f"Expanded eos_token_id to {self.eos_token_id} "
                f"(added <|im_end|> id={im_end_id})"
            )
    elif isinstance(eos, (tuple, set, frozenset)):
        # Other containers used to fall through silently, leaving <|im_end|>
        # undetected; normalise them to the list form the int branch produces.
        if im_end_id not in eos:
            self.eos_token_id = [*eos, im_end_id]
            logger.info(
                f"Expanded eos_token_id to {self.eos_token_id} "
                f"(added <|im_end|> id={im_end_id})"
            )
    else:
        # Do not guess at unknown shapes, but make the skipped augmentation visible.
        logger.warning(
            f"Unsupported eos_token_id type {type(eos).__name__}; "
            f"<|im_end|> (id={im_end_id}) was not added"
        )
662+
631663
def _process_text_request(self, rid: str, messages: list, raw_request: Dict) -> list:
632664
"""Process a text-only request using the tokenizer."""
633665
if self.tokenizer.chat_template:
@@ -748,6 +780,29 @@ def _handle_raw_request(self, raw_request: Dict):
748780
if "ignore_eos" in raw_sampling_params:
749781
sampling_params.ignore_eos = raw_sampling_params["ignore_eos"]
750782

783+
# Also read OpenAI-style top-level sampling parameters as fallback
784+
if "temperature" in raw_request and raw_sampling_params is None:
785+
sampling_params.temperature = raw_request["temperature"]
786+
if sampling_params.temperature == 0.0:
787+
sampling_params.temperature = 1.0
788+
sampling_params.top_k = 1
789+
if "top_p" in raw_request and raw_sampling_params is None:
790+
sampling_params.top_p = raw_request["top_p"]
791+
792+
# When tools are present, add tool-call-related stop token IDs so the
793+
# scheduler halts generation at the tool-call boundary instead of
794+
# running until max_tokens.
795+
tools = raw_request.get("tools")
796+
if tools and self.tokenizer is not None:
797+
from parallax.utils.tokenizer_utils import get_tool_call_stop_token_ids
798+
799+
tool_stop_ids = get_tool_call_stop_token_ids(self.tokenizer)
800+
if tool_stop_ids:
801+
if sampling_params.stop_token_ids is None:
802+
sampling_params.stop_token_ids = set()
803+
sampling_params.stop_token_ids.update(tool_stop_ids)
804+
logger.debug(f"Added tool call stop token IDs for request {rid}: {tool_stop_ids}")
805+
751806
req = InitialRequest(
752807
request_id=rid,
753808
output_ids=None,

src/parallax/server/scheduler.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,15 @@ def check_and_update_request_status(self, request: InitialRequest) -> bool:
224224
):
225225
request.update_status(RequestStatus.FINISHED_EOS)
226226
finished = True
227+
elif (
228+
not finished
229+
and not request.sampling_params.ignore_eos
230+
and request.sampling_params.stop_token_ids
231+
and last_token_id is not None
232+
and last_token_id in request.sampling_params.stop_token_ids
233+
):
234+
request.update_status(RequestStatus.FINISHED_EOS)
235+
finished = True
227236
elif request.output_length >= request.max_new_tokens:
228237
request.update_status(RequestStatus.FINISHED_MAX_LENGTH)
229238
finished = True

src/parallax/utils/tokenizer_utils.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,36 @@ def load_tokenizer(model_path, trust_remote_code=True, tokenizer_config_extra=No
128128
return _mlx_load_tokenizer(model_path, tokenizer_config_extra=tokenizer_config_extra, **kwargs)
129129

130130

131+
def get_tool_call_stop_token_ids(tokenizer) -> List[int]:
    """Return token IDs that should act as *stop tokens* for tool call generation.

    When the model generates one of these tokens the scheduler should treat it
    as end-of-sequence so that the HTTP handler can inspect the generated text
    and extract tool calls.

    Note: tool call *parsing* (``has_tool_calling``, ``tool_parser``, etc.) is
    handled automatically by the updated ``mlx-lm`` ``TokenizerWrapper``.
    This function only provides the stop-token IDs that the parallax scheduler
    needs to halt generation at tool-call boundaries.

    Args:
        tokenizer: Any object; if it exposes a callable ``get_vocab()`` the
            returned token-string -> token-ID mapping is searched for the
            known markers.

    Returns:
        The IDs of the markers present in the vocabulary, de-duplicated and
        in marker order.  Empty when the tokenizer has no ``get_vocab`` or
        none of the markers are in the vocabulary.
    """
    get_vocab = getattr(tokenizer, "get_vocab", None)
    vocab = get_vocab() if callable(get_vocab) else None
    if not vocab:
        return []

    # Markers whose token IDs should halt generation
    markers = (
        "<|tool_calls_section_end|>",  # Kimi K2 / K2.5
        "<|im_end|>",  # common chat turn-end token
    )

    candidate_ids = (vocab.get(marker) for marker in markers)
    # dict.fromkeys de-duplicates while keeping marker order, unlike set(),
    # whose iteration order depends on hashing.
    return list(dict.fromkeys(tid for tid in candidate_ids if tid is not None))
159+
160+
131161
@dataclass
132162
class ToolCallState:
133163
has_tool_calling: bool

0 commit comments

Comments
 (0)