40 changes: 40 additions & 0 deletions mlx_vlm/server.py
@@ -382,6 +382,10 @@ class OpenAIRequest(GenerationParams, TemplateParams):
stream: bool = Field(
False, description="Whether to stream the response chunk by chunk."
)
stop: Optional[Union[str, List[str]]] = Field(
None,
description="Up to 4 sequences where the API will stop generating further tokens.",
)

def generation_kwargs(self) -> dict[str, Any]:
kwargs = self.dump_kwargs("max_output_tokens")
@@ -559,6 +563,10 @@ class VLMRequest(GenerationParams, TemplateParams):
description="Maximum number of tokens to generate.",
)
seed: int = Field(DEFAULT_SEED, description="Seed for random generation.")
stop: Optional[Union[str, List[str]]] = Field(
None,
description="Up to 4 sequences where the API will stop generating further tokens.",
)
resize_shape: Optional[ResizeShapeInput] = Field(
None,
description="Resize shape for the image. Provide one integer for a square resize or two integers for (height, width).",
@@ -630,6 +638,28 @@ class ChatStreamChunk(BaseModel):
usage: Optional[UsageStats]


def resolve_stop_sequences(
stop: Optional[Union[str, List[str]]],
) -> Optional[List[str]]:
"""Normalize stop sequences for the generation stopping criteria.

The generation pipeline's ``add_eos_token_ids`` accepts strings
and handles tokenization internally.

Args:
stop: A single stop string or list of stop strings, or None.

Returns:
A list of stop strings (max 4), or None.
"""
if not stop:
return None
if isinstance(stop, str):
stop = [stop]
sequences = [s for s in stop[:4] if isinstance(s, str) and s]
return sequences if sequences else None


def build_generation_kwargs(
request: Any,
template_kwargs: dict[str, Any],
@@ -847,6 +877,11 @@ def run_openai(prompt, img_url,system, stream=False, max_output_tokens=512, mode
)
generation_kwargs = build_generation_kwargs(openai_request, template_kwargs)

# Resolve stop sequences to strings for eos_tokens
stop_seqs = resolve_stop_sequences(getattr(openai_request, "stop", None))
if stop_seqs:
generation_kwargs["eos_tokens"] = stop_seqs

generated_at = datetime.now().timestamp()
response_id = f"resp_{uuid.uuid4().hex}"
message_id = f"msg_{uuid.uuid4().hex}"
@@ -1115,6 +1150,11 @@ async def chat_completions_endpoint(request: ChatRequest):
)
generation_kwargs = build_generation_kwargs(request, template_kwargs)

# Resolve stop sequences to strings for eos_tokens
stop_seqs = resolve_stop_sequences(getattr(request, "stop", None))
if stop_seqs:
generation_kwargs["eos_tokens"] = stop_seqs

if request.stream:
# Streaming response
async def stream_generator():
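For reference, a minimal sketch of a client call exercising the new stop parameter (the host, port, and model name here are hypothetical; any OpenAI-compatible HTTP client would work the same way):

import requests

resp = requests.post(
    "http://localhost:8000/chat/completions",  # hypothetical local server address
    json={
        "model": "demo",
        "messages": [{"role": "user", "content": "Count to ten."}],
        "stop": ["\n\n", "</s>"],  # normalized by resolve_stop_sequences, capped at 4
    },
)
print(resp.json())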
122 changes: 122 additions & 0 deletions mlx_vlm/tests/test_server.py
@@ -130,3 +130,125 @@ def test_chat_completions_endpoint_forwards_explicit_sampling_args(client):
assert mock_generate.call_args.kwargs["repetition_penalty"] == 1.15
assert mock_generate.call_args.kwargs["logit_bias"] == {12: -1.5}
assert mock_generate.call_args.kwargs["resize_shape"] == (512, 512)


# ---------------------------------------------------------------------------
# Stop sequences tests
# ---------------------------------------------------------------------------


def test_chat_completions_stop_passed_as_eos_tokens(client):
"""stop parameter should be forwarded as eos_tokens strings in generate kwargs."""
model = SimpleNamespace()
processor = SimpleNamespace(
tokenizer=SimpleNamespace(chat_template=""),
)
config = SimpleNamespace(model_type="test")
result = SimpleNamespace(
text="Hello",
prompt_tokens=5,
generation_tokens=1,
total_tokens=6,
prompt_tps=100.0,
generation_tps=50.0,
peak_memory=1.0,
)

with (
patch.object(server, "get_cached_model", return_value=(model, processor, config)),
patch.object(server, "apply_chat_template", return_value="prompt"),
patch.object(server, "generate", return_value=result) as mock_gen,
):
resp = client.post(
"/chat/completions",
json={
"model": "demo",
"messages": [{"role": "user", "content": "hello"}],
"stop": ["\n\n", "</s>"],
},
)
assert resp.status_code == 200
assert "eos_tokens" in mock_gen.call_args.kwargs
assert mock_gen.call_args.kwargs["eos_tokens"] == ["\n\n", "</s>"]


def test_chat_completions_no_stop_no_eos_tokens(client):
"""Without stop parameter, eos_tokens should not be in kwargs."""
model = SimpleNamespace()
processor = SimpleNamespace(tokenizer=SimpleNamespace(chat_template=""))
config = SimpleNamespace(model_type="test")
result = SimpleNamespace(
text="Hi",
prompt_tokens=5,
generation_tokens=1,
total_tokens=6,
prompt_tps=100.0,
generation_tps=50.0,
peak_memory=1.0,
)

with (
patch.object(server, "get_cached_model", return_value=(model, processor, config)),
patch.object(server, "apply_chat_template", return_value="prompt"),
patch.object(server, "generate", return_value=result) as mock_gen,
):
resp = client.post(
"/chat/completions",
json={"model": "demo", "messages": [{"role": "user", "content": "hi"}]},
)
assert resp.status_code == 200
assert "eos_tokens" not in mock_gen.call_args.kwargs


def test_responses_stop_passed_as_eos_tokens(client):
"""stop parameter on /responses should also forward as eos_tokens strings."""
model = SimpleNamespace()
processor = SimpleNamespace(
tokenizer=SimpleNamespace(chat_template=""),
)
config = SimpleNamespace(model_type="test")
result = SimpleNamespace(
text="Hello",
prompt_tokens=5,
generation_tokens=1,
total_tokens=6,
prompt_tps=100.0,
generation_tps=50.0,
peak_memory=1.0,
)

with (
patch.object(server, "get_cached_model", return_value=(model, processor, config)),
patch.object(server, "apply_chat_template", return_value="prompt"),
patch.object(server, "generate", return_value=result) as mock_gen,
):
resp = client.post(
"/responses",
json={"model": "demo", "input": "hi", "stop": "STOP"},
)
assert resp.status_code == 200
assert "eos_tokens" in mock_gen.call_args.kwargs
assert mock_gen.call_args.kwargs["eos_tokens"] == ["STOP"]


def test_resolve_stop_sequences_single_string():
"""resolve_stop_sequences should wrap a single string in a list."""
result = server.resolve_stop_sequences("hello")
assert result == ["hello"]


def test_resolve_stop_sequences_list():
"""resolve_stop_sequences should pass through a list of strings."""
result = server.resolve_stop_sequences(["a", "b"])
assert result == ["a", "b"]


def test_resolve_stop_sequences_none():
"""resolve_stop_sequences should return None for None input."""
assert server.resolve_stop_sequences(None) is None


def test_resolve_stop_sequences_limits_to_four():
"""resolve_stop_sequences should truncate to at most 4 sequences."""
result = server.resolve_stop_sequences(["a", "b", "c", "d", "e", "f"])
assert len(result) == 4
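

The normalization semantics these tests pin down can be summarized in a few lines (a sketch; behavior follows the resolve_stop_sequences implementation in this PR):

from mlx_vlm.server import resolve_stop_sequences

assert resolve_stop_sequences("STOP") == ["STOP"]            # single string is wrapped in a list
assert resolve_stop_sequences(["a", "", "b"]) == ["a", "b"]  # empty strings are dropped
assert resolve_stop_sequences([]) is None                    # falsy input yields None
assert resolve_stop_sequences(["a", "b", "c", "d", "e"]) == ["a", "b", "c", "d"]  # truncated to 4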