Skip to content

Commit 961dcc0

Browse files
Your Name and claude committed
fix: stop sequence truncation, parameter validation, special token stripping
Autoresearch deep testing (20 rounds across OpenAI SDK, Aider, LangChain, LiteLLM, Cline, OpenCode patterns) found 8 bugs: - P1: stop sequences not forwarded to stream_generate()/generate() in SimpleEngine - P1: stop sequences included in output (OpenAI spec requires truncation) - P2: n > 1 silently ignored — now returns 400 - P2: negative max_tokens accepted — now returns 400 - P2: temperature out of 0-2 range accepted — now returns 400 - P2: <|eom_id|>/<|python_tag|> Llama tokens leaking into responses - P2: uvicorn keep-alive too short for agentic long-poll clients (now 30s) - P3: completion_tokens off-by-one in non-streaming (re-encoding vs actual) All 8 fixes verified against live server. 195/197 unit tests pass (2 require live server on port 8000). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1d13b56 commit 961dcc0

File tree

7 files changed

+55
-12
lines changed

7 files changed

+55
-12
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "rapid-mlx"
7-
version = "0.3.7"
7+
version = "0.3.8"
88
description = "Rapid-MLX — AI inference for Apple Silicon. Drop-in OpenAI API, 2-4x faster than Ollama."
99
readme = "README.md"
1010
license = {text = "Apache-2.0"}

vllm_mlx/api/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,8 @@ class ChatCompletionRequest(BaseModel):
207207
timeout: float | None = None
208208
# Thinking/reasoning control (Qwen3 style). None = server default.
209209
enable_thinking: bool | None = None
210+
# Number of completions (only n=1 supported)
211+
n: int | None = None
210212

211213

212214
class AssistantMessage(BaseModel):

vllm_mlx/api/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
# Keeps <think>...</think> blocks intact for reasoning models
1616
SPECIAL_TOKENS_PATTERN = re.compile(
1717
r"<\|im_end\|>|<\|im_start\|>|<\|endoftext\|>|"
18-
r"<\|end\|>|<\|eot_id\|>|<\|start_header_id\|>|<\|end_header_id\|>|"
18+
r"<\|end\|>|<\|eot_id\|>|<\|eom_id\|>|<\|python_tag\|>|"
19+
r"<\|start_header_id\|>|<\|end_header_id\|>|"
1920
r"<\|channel\|>|<\|message\|>|<\|start\|>|<\|return\|>|<\|call\|>|<\|constrain\|>|"
2021
r"</s>|<s>|<pad>|\[PAD\]|\[SEP\]|\[CLS\]|"
2122
r"\[e~\[|\]~b\][a-z]*|\]~!b\["

vllm_mlx/cli.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,13 @@ def serve_command(args):
357357
print(f" Ready: http://{host_display}:{args.port}/v1")
358358
print(f" Docs: http://{host_display}:{args.port}/docs")
359359
print()
360-
uvicorn.run(app, host=args.host, port=args.port, log_level="info")
360+
uvicorn.run(
361+
app,
362+
host=args.host,
363+
port=args.port,
364+
log_level="info",
365+
timeout_keep_alive=30,
366+
)
361367

362368

363369
def bench_command(args):

vllm_mlx/engine/simple.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,7 @@ async def chat(
392392
max_tokens=max_tokens,
393393
temperature=temperature,
394394
top_p=top_p,
395+
stop=stop,
395396
**kwargs_copy,
396397
)
397398
# Return raw text — server handles cleaning after
@@ -557,6 +558,7 @@ async def stream_chat(
557558
max_tokens=max_tokens,
558559
temperature=temperature,
559560
top_p=top_p,
561+
stop=stop,
560562
**kwargs,
561563
):
562564
yield output

vllm_mlx/models/llm.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ def generate(
199199
# preserved via skip_special_tokens=False decoding, and the
200200
# prompt cache is properly managed.
201201
output_text = ""
202+
token_ids = []
203+
finish_reason = "stop"
202204
for chunk in self.stream_generate(
203205
prompt=prompt,
204206
max_tokens=max_tokens,
@@ -207,18 +209,19 @@ def generate(
207209
stop=stop,
208210
):
209211
output_text += chunk.text
212+
if hasattr(chunk, "token") and chunk.token:
213+
token_ids.append(chunk.token)
210214
if chunk.finished:
215+
finish_reason = chunk.finish_reason or "stop"
211216
break
212217

213-
# Tokenize output to get token IDs
214-
tokens = self.tokenizer.encode(output_text)
215-
216-
# Determine finish reason
217-
finish_reason = "length" if len(tokens) >= max_tokens else "stop"
218+
# Fall back to re-encoding if no token IDs were collected
219+
if not token_ids:
220+
token_ids = self.tokenizer.encode(output_text)
218221

219222
return GenerationOutput(
220223
text=output_text,
221-
tokens=tokens,
224+
tokens=token_ids,
222225
finish_reason=finish_reason,
223226
)
224227

@@ -661,12 +664,18 @@ def _make_generator():
661664
new_text = decoder.add_token(token_id)
662665
accumulated_text += new_text
663666

664-
# Check for stop sequences
667+
# Check for stop sequences — truncate at the stop point
668+
# (OpenAI spec: stop sequence is not included in output)
665669
should_stop = False
670+
stop_truncate_text = None
666671
if stop:
667672
for stop_seq in stop:
668-
if stop_seq in accumulated_text:
673+
idx = accumulated_text.find(stop_seq)
674+
if idx != -1:
669675
should_stop = True
676+
# Truncate new_text so accumulated ends just before the stop seq
677+
stop_truncate_text = new_text[: len(new_text) - (len(accumulated_text) - idx)]
678+
accumulated_text = accumulated_text[:idx]
670679
break
671680

672681
# Check if mlx-lm signalled completion (EOS token hit)
@@ -689,7 +698,7 @@ def _make_generator():
689698
cache_saved = True
690699

691700
yield StreamingOutput(
692-
text=new_text,
701+
text=stop_truncate_text if stop_truncate_text is not None else new_text,
693702
token=response.token if hasattr(response, "token") else 0,
694703
finished=finished,
695704
finish_reason=finish_reason,

vllm_mlx/server.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1710,6 +1710,29 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
17101710
detail=f"Invalid role '{msg.role}'. Must be one of: {', '.join(sorted(_valid_roles))}",
17111711
)
17121712

1713+
# Validate n parameter (only n=1 supported)
1714+
if request.n is not None and request.n > 1:
1715+
raise HTTPException(
1716+
status_code=400,
1717+
detail="n > 1 is not supported. Rapid-MLX generates one completion per request.",
1718+
)
1719+
1720+
# Validate max_tokens (must be positive)
1721+
if request.max_tokens is not None and request.max_tokens < 1:
1722+
raise HTTPException(
1723+
status_code=400,
1724+
detail="max_tokens must be at least 1",
1725+
)
1726+
1727+
# Validate temperature range (OpenAI spec: 0-2)
1728+
if request.temperature is not None and (
1729+
request.temperature < 0 or request.temperature > 2
1730+
):
1731+
raise HTTPException(
1732+
status_code=400,
1733+
detail="temperature must be between 0 and 2",
1734+
)
1735+
17131736
# Validate top_logprobs range (OpenAI spec: 0-20)
17141737
if request.top_logprobs is not None and (
17151738
request.top_logprobs < 0 or request.top_logprobs > 20

0 commit comments

Comments (0)