Skip to content

Commit 628c2bb

Browse files
scouzi1966claude
and authored
fix: --guided-json server flag and per-test spec extraction (#97, #98)
* fix: --guided-json server flag now applied to incoming requests (#97) The --guided-json CLI flag was parsed and validated but its result was discarded — server requests never received the schema. The flag only worked in single-prompt CLI mode (afm mlx -s). Fix: store the parsed schema in MLXModelService.defaultGuidedJsonSchema and apply it as a fallback in both chat completions and batch completions controllers when the request body omits response_format. Per-request response_format still wins. Verified: `afm mlx --guided-json '{...}'` now produces valid JSON output matching the schema for all test cases (color/hex, person, etc.). Closes #97 Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]> * fix: per-test spec extraction for [@ label] template format The regex for parsing test section headers was failing on the [@ label] template format used throughout test-llm-comprehensive.txt. The match captured "@ label" as group 1 and None as group 2, then took label from the (empty) group 2. Result: 0 specs extracted, all per-test scoring runs received empty test context. The judge then scored entries based only on the JSONL content, with no knowledge of the test intent — leading to nonsensical reasoning where codex would invent expectations from neighboring tests. Fix: separate regexes for [@ label] (template) and [model @ label] (named variant). Verified: 91/91 labels now extracted correctly. Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]> * refactor: address PR #98 review feedback - Centralize effectiveResponseFormat logic in MLXModelService.effectiveResponseFormat() helper, called from both chat and batch controllers - Make effectiveResponseFormat parameter required (no default) in createStreamingResponse to prevent future callers from accidentally skipping the guided-json fallback Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]> --------- Co-authored-by: Claude Opus 4.6 (1M context) <[email protected]>
1 parent 6b36714 commit 628c2bb

6 files changed

Lines changed: 49 additions & 23 deletions

File tree

Scripts/mlx-model-test.sh

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1653,14 +1653,18 @@ for line in lines:
16531653
if stripped.startswith('#'):
16541654
comment_buf.append(stripped)
16551655
elif re.match(r'^\[.+\]$', stripped):
1656-
# Extract label from [org/model @ label]
1657-
m = re.match(r'^\[(.+?)(?:\s*@\s*(.+?))?\]$', stripped)
1658-
if m:
1659-
label = (m.group(2) or '').strip()
1660-
if label:
1661-
# Keep the AI: comments + section header
1662-
spec_lines = [c for c in comment_buf if '# AI:' in c or c.startswith('# ---')]
1663-
specs[label] = '\n'.join(spec_lines) if spec_lines else ''
1656+
# Extract label from [org/model @ label] OR [@ label] (template mode)
1657+
m_template = re.match(r'^\[@\s*(.+?)\s*\]$', stripped)
1658+
m_named = re.match(r'^\[(.+?)\s*@\s*(.+?)\]$', stripped)
1659+
label = ''
1660+
if m_template:
1661+
label = m_template.group(1).strip()
1662+
elif m_named:
1663+
label = m_named.group(2).strip()
1664+
if label:
1665+
# Keep the AI: comments + section header
1666+
spec_lines = [c for c in comment_buf if '# AI:' in c or c.startswith('# ---')]
1667+
specs[label] = '\n'.join(spec_lines) if spec_lines else ''
16641668
comment_buf = []
16651669
else:
16661670
if not stripped.startswith(('temperature:', 'max_tokens:', 'stop:', 'afm:', 'system:',

Sources/MacLocalAPI/Controllers/BatchCompletionsController.swift

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,10 @@ struct BatchCompletionsController: RouteCollection {
7878
response.headers.add(name: .accessControlAllowOrigin, value: "*")
7979

8080
// Grammar constraint header: check if any request has strict tools/schema
81+
// Apply server-level --guided-json default for requests without response_format (#97)
8182
let anyStrict = batchReq.requests.contains { item in
8283
MLXModelService.shouldDowngradeGrammarConstraints(
83-
responseFormat: item.body.responseFormat,
84+
responseFormat: service.effectiveResponseFormat(requestFormat: item.body.responseFormat),
8485
tools: item.body.tools,
8586
supportsStrictToolGrammar: service.supportsStrictToolGrammar,
8687
enableGrammarConstraints: service.enableGrammarConstraints
@@ -182,6 +183,8 @@ struct BatchCompletionsController: RouteCollection {
182183

183184
let effectiveModel = service.normalizeModel(chatReq.model ?? modelID)
184185
let effectiveMaxTokens = chatReq.effectiveMaxTokens ?? maxTokens ?? Int.max
186+
// Apply server-level --guided-json default when request omits response_format (#97)
187+
let effectiveResponseFormat = service.effectiveResponseFormat(requestFormat: chatReq.responseFormat)
185188

186189
let streamResult = try await service.generateStreaming(
187190
model: effectiveModel,
@@ -198,7 +201,7 @@ struct BatchCompletionsController: RouteCollection {
198201
topLogprobs: chatReq.topLogprobs,
199202
tools: chatReq.tools,
200203
stop: chatReq.stop,
201-
responseFormat: chatReq.responseFormat,
204+
responseFormat: effectiveResponseFormat,
202205
chatTemplateKwargs: chatReq.chatTemplateKwargs
203206
)
204207

@@ -215,7 +218,7 @@ struct BatchCompletionsController: RouteCollection {
215218
var hasToolCalls = false
216219
var fullText = ""
217220
let deferStructuredOutputContent =
218-
MLXChatCompletionsController.requiresStructuredOutputSanitization(chatReq.responseFormat)
221+
MLXChatCompletionsController.requiresStructuredOutputSanitization(effectiveResponseFormat)
219222

220223
// Think extraction state
221224
var thinkBuffer = ""
@@ -313,7 +316,7 @@ struct BatchCompletionsController: RouteCollection {
313316
if deferStructuredOutputContent && !hasToolCalls && chunk.toolCalls == nil {
314317
let sanitized = MLXChatCompletionsController.sanitizeStructuredOutput(
315318
fullText,
316-
responseFormat: chatReq.responseFormat
319+
responseFormat: effectiveResponseFormat
317320
)
318321
if !sanitized.isEmpty {
319322
delta["content"] = sanitized
@@ -365,7 +368,7 @@ struct BatchCompletionsController: RouteCollection {
365368
} else {
366369
message["content"] = MLXChatCompletionsController.sanitizeStructuredOutput(
367370
collected.content ?? "",
368-
responseFormat: chatReq.responseFormat
371+
responseFormat: effectiveResponseFormat
369372
)
370373
if let reasoning = collected.reasoningContent {
371374
message["reasoning_content"] = reasoning

Sources/MacLocalAPI/Controllers/MLXChatCompletionsController.swift

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,12 @@ struct MLXChatCompletionsController: RouteCollection {
116116
print("\(Self.red)[\(Self.timestamp())] RECV MLX user prompt:\n \(truncated)\(Self.reset)"); fflush(stdout)
117117
}
118118
}
119+
// Apply server-level --guided-json default when request omits response_format (#97)
120+
let effectiveResponseFormat = service.effectiveResponseFormat(requestFormat: chatRequest.responseFormat)
121+
119122
// Detect strict-mode downgrade: user requested grammar enforcement but admin didn't enable the engine
120123
let grammarDowngraded = MLXModelService.shouldDowngradeGrammarConstraints(
121-
responseFormat: chatRequest.responseFormat,
124+
responseFormat: effectiveResponseFormat,
122125
tools: chatRequest.tools,
123126
supportsStrictToolGrammar: service.supportsStrictToolGrammar,
124127
enableGrammarConstraints: service.enableGrammarConstraints
@@ -201,7 +204,7 @@ struct MLXChatCompletionsController: RouteCollection {
201204
let extractThinking = !rawOutput || isWebUI
202205

203206
if chatRequest.stream == true && streamingEnabled {
204-
return try await createStreamingResponse(req: req, chatRequest: chatRequest, extractThinking: extractThinking, grammarDowngraded: grammarDowngraded, requestId: reqId)
207+
return try await createStreamingResponse(req: req, chatRequest: chatRequest, extractThinking: extractThinking, effectiveResponseFormat: effectiveResponseFormat, grammarDowngraded: grammarDowngraded, requestId: reqId)
205208
}
206209

207210
// In concurrent mode, non-streaming requests currently bypass the
@@ -259,7 +262,7 @@ struct MLXChatCompletionsController: RouteCollection {
259262
topLogprobs: chatRequest.topLogprobs,
260263
tools: effectiveTools,
261264
stop: effectiveStop,
262-
responseFormat: chatRequest.responseFormat,
265+
responseFormat: effectiveResponseFormat,
263266
chatTemplateKwargs: chatRequest.chatTemplateKwargs,
264267
requestId: reqId
265268
)
@@ -369,7 +372,7 @@ struct MLXChatCompletionsController: RouteCollection {
369372
topLogprobs: chatRequest.topLogprobs,
370373
tools: effectiveTools,
371374
stop: effectiveStop,
372-
responseFormat: chatRequest.responseFormat,
375+
responseFormat: effectiveResponseFormat,
373376
chatTemplateKwargs: chatRequest.chatTemplateKwargs
374377
)
375378
}
@@ -381,7 +384,7 @@ struct MLXChatCompletionsController: RouteCollection {
381384
let sanitizeContent: (String) -> String = {
382385
Self.sanitizeStructuredOutput(
383386
self.sanitizeDegenerateTail($0),
384-
responseFormat: chatRequest.responseFormat
387+
responseFormat: effectiveResponseFormat
385388
)
386389
}
387390
let finalizedTurn = Self.finalizeAssistantTurn(
@@ -496,7 +499,7 @@ struct MLXChatCompletionsController: RouteCollection {
496499
}
497500
}
498501

499-
private func createStreamingResponse(req: Request, chatRequest: ChatCompletionRequest, extractThinking: Bool, grammarDowngraded: Bool = false, requestId: String = "") async throws -> Response {
502+
private func createStreamingResponse(req: Request, chatRequest: ChatCompletionRequest, extractThinking: Bool, effectiveResponseFormat: ResponseFormat?, grammarDowngraded: Bool = false, requestId: String = "") async throws -> Response {
500503
let httpResponse = Response(status: .ok)
501504
httpResponse.headers.add(name: .contentType, value: "text/event-stream")
502505
httpResponse.headers.add(name: .cacheControl, value: "no-cache")
@@ -530,11 +533,11 @@ struct MLXChatCompletionsController: RouteCollection {
530533
let effectivePresencePenalty = chatRequest.presencePenalty ?? self.presencePenalty
531534
let effectiveSeed = chatRequest.seed ?? self.seed
532535
let effectiveStop = self.mergeStopSequences(cliStop: self.stop, apiStop: chatRequest.stop)
533-
let deferStructuredOutputContent = Self.requiresStructuredOutputSanitization(chatRequest.responseFormat)
536+
let deferStructuredOutputContent = Self.requiresStructuredOutputSanitization(effectiveResponseFormat)
534537
let sanitizeContent: (String) -> String = {
535538
Self.sanitizeStructuredOutput(
536539
self.sanitizeDegenerateTail($0),
537-
responseFormat: chatRequest.responseFormat
540+
responseFormat: effectiveResponseFormat
538541
)
539542
}
540543

@@ -565,7 +568,7 @@ struct MLXChatCompletionsController: RouteCollection {
565568
topLogprobs: chatRequest.topLogprobs,
566569
tools: effectiveTools,
567570
stop: effectiveStop,
568-
responseFormat: chatRequest.responseFormat,
571+
responseFormat: effectiveResponseFormat,
569572
chatTemplateKwargs: chatRequest.chatTemplateKwargs,
570573
requestId: streamReqId
571574
)

Sources/MacLocalAPI/Controllers/MLXChatServing.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ protocol MLXChatServing {
3131
var thinkEndTag: String? { get }
3232
var fixToolArgs: Bool { get }
3333
var enableGrammarConstraints: Bool { get }
34+
var defaultGuidedJsonSchema: ResponseFormat? { get }
35+
36+
/// Resolve effective response format: per-request format wins, falls back to server default.
37+
func effectiveResponseFormat(requestFormat: ResponseFormat?) -> ResponseFormat?
3438

3539
func normalizeModel(_ raw: String) -> String
3640
func resolvedToolCallParser(logBypass: Bool) -> String?

Sources/MacLocalAPI/Models/MLXModelService.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,17 @@ final class MLXModelService: @unchecked Sendable {
129129
var kvBits: Int?
130130
var kvEvictionPolicy: String = "none" // "none" or "streaming"
131131
var enablePrefixCaching: Bool = false
132+
/// Server-level default JSON schema from `--guided-json` CLI flag.
133+
/// Applied to requests that don't specify their own response_format. (#97)
134+
var defaultGuidedJsonSchema: ResponseFormat?
135+
136+
/// Resolve the effective response format for an incoming request.
137+
/// Per-request `response_format` takes precedence over the server-level
138+
/// `--guided-json` default. (#97)
139+
func effectiveResponseFormat(requestFormat: ResponseFormat?) -> ResponseFormat? {
140+
requestFormat ?? defaultGuidedJsonSchema
141+
}
142+
132143
var enableGrammarConstraints: Bool = false { didSet { grammarConstraintsActive = enableGrammarConstraints } }
133144
var trace: Bool = false { didSet { traceLogging = trace } }
134145
var supportsStrictToolGrammar: Bool {

Sources/MacLocalAPI/main.swift

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,8 @@ struct MlxCommand: ParsableCommand {
507507
}
508508

509509
if let guidedJson {
510-
_ = try parseGuidedJsonSchema(guidedJson)
510+
let schema = try parseGuidedJsonSchema(guidedJson)
511+
service.defaultGuidedJsonSchema = ResponseFormat(type: "json_schema", jsonSchema: schema)
511512
}
512513

513514
_ = try service.revalidateRegistry()

0 commit comments

Comments
 (0)