Skip to content

Commit 628c2bb

Browse files
scouzi1966claude
and authored
fix: --guided-json server flag and per-test spec extraction (#97, #98)
* fix: --guided-json server flag now applied to incoming requests (#97) The --guided-json CLI flag was parsed and validated but its result was discarded — server requests never received the schema. The flag only worked in single-prompt CLI mode (afm mlx -s). Fix: store the parsed schema in MLXModelService.defaultGuidedJsonSchema and apply it as a fallback in both chat completions and batch completions controllers when the request body omits response_format. Per-request response_format still wins. Verified: `afm mlx --guided-json '{...}'` now produces valid JSON output matching the schema for all test cases (color/hex, person, etc.). Closes #97 Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]> * fix: per-test spec extraction for [@ label] template format The regex for parsing test section headers was failing on the [@ label] template format used throughout test-llm-comprehensive.txt. The match captured "@ label" as group 1 and None as group 2, then took label from the (empty) group 2. Result: 0 specs extracted, all per-test scoring runs received empty test context. The judge then scored entries based only on the JSONL content, with no knowledge of the test intent — leading to nonsensical reasoning where codex would invent expectations from neighboring tests. Fix: separate regexes for [@ label] (template) and [model @ label] (named variant). Verified: 91/91 labels now extracted correctly. Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]> * refactor: address PR #98 review feedback - Centralize effectiveResponseFormat logic in MLXModelService.effectiveResponseFormat() helper, called from both chat and batch controllers - Make effectiveResponseFormat parameter required (no default) in createStreamingResponse to prevent future callers from accidentally skipping the guided-json fallback Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]> --------- Co-authored-by: Claude Opus 4.6 (1M context) <[email protected]>
1 parent 6b36714 commit 628c2bb

6 files changed

Lines changed: 49 additions & 23 deletions

File tree

Scripts/mlx-model-test.sh

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1653,14 +1653,18 @@ for line in lines:
16531653
if stripped.startswith('#'):
16541654
comment_buf.append(stripped)
16551655
elif re.match(r'^\[.+\]$', stripped):
1656-
# Extract label from [org/model @ label]
1657-
m = re.match(r'^\[(.+?)(?:\s*@\s*(.+?))?\]$', stripped)
1658-
if m:
1659-
label = (m.group(2) or '').strip()
1660-
if label:
1661-
# Keep the AI: comments + section header
1662-
spec_lines = [c for c in comment_buf if '# AI:' in c or c.startswith('# ---')]
1663-
specs[label] = '\n'.join(spec_lines) if spec_lines else ''
1656+
# Extract label from [org/model @ label] OR [@ label] (template mode)
1657+
m_template = re.match(r'^\[@\s*(.+?)\s*\]$', stripped)
1658+
m_named = re.match(r'^\[(.+?)\s*@\s*(.+?)\]$', stripped)
1659+
label = ''
1660+
if m_template:
1661+
label = m_template.group(1).strip()
1662+
elif m_named:
1663+
label = m_named.group(2).strip()
1664+
if label:
1665+
# Keep the AI: comments + section header
1666+
spec_lines = [c for c in comment_buf if '# AI:' in c or c.startswith('# ---')]
1667+
specs[label] = '\n'.join(spec_lines) if spec_lines else ''
16641668
comment_buf = []
16651669
else:
16661670
if not stripped.startswith(('temperature:', 'max_tokens:', 'stop:', 'afm:', 'system:',

Sources/MacLocalAPI/Controllers/BatchCompletionsController.swift

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,10 @@ struct BatchCompletionsController: RouteCollection {
7878
response.headers.add(name: .accessControlAllowOrigin, value: "*")
7979

8080
// Grammar constraint header: check if any request has strict tools/schema
81+
// Apply server-level --guided-json default for requests without response_format (#97)
8182
let anyStrict = batchReq.requests.contains { item in
8283
MLXModelService.shouldDowngradeGrammarConstraints(
83-
responseFormat: item.body.responseFormat,
84+
responseFormat: service.effectiveResponseFormat(requestFormat: item.body.responseFormat),
8485
tools: item.body.tools,
8586
supportsStrictToolGrammar: service.supportsStrictToolGrammar,
8687
enableGrammarConstraints: service.enableGrammarConstraints
@@ -182,6 +183,8 @@ struct BatchCompletionsController: RouteCollection {
182183

183184
let effectiveModel = service.normalizeModel(chatReq.model ?? modelID)
184185
let effectiveMaxTokens = chatReq.effectiveMaxTokens ?? maxTokens ?? Int.max
186+
// Apply server-level --guided-json default when request omits response_format (#97)
187+
let effectiveResponseFormat = service.effectiveResponseFormat(requestFormat: chatReq.responseFormat)
185188

186189
let streamResult = try await service.generateStreaming(
187190
model: effectiveModel,
@@ -198,7 +201,7 @@ struct BatchCompletionsController: RouteCollection {
198201
topLogprobs: chatReq.topLogprobs,
199202
tools: chatReq.tools,
200203
stop: chatReq.stop,
201-
responseFormat: chatReq.responseFormat,
204+
responseFormat: effectiveResponseFormat,
202205
chatTemplateKwargs: chatReq.chatTemplateKwargs
203206
)
204207

@@ -215,7 +218,7 @@ struct BatchCompletionsController: RouteCollection {
215218
var hasToolCalls = false
216219
var fullText = ""
217220
let deferStructuredOutputContent =
218-
MLXChatCompletionsController.requiresStructuredOutputSanitization(chatReq.responseFormat)
221+
MLXChatCompletionsController.requiresStructuredOutputSanitization(effectiveResponseFormat)
219222

220223
// Think extraction state
221224
var thinkBuffer = ""
@@ -313,7 +316,7 @@ struct BatchCompletionsController: RouteCollection {
313316
if deferStructuredOutputContent && !hasToolCalls && chunk.toolCalls == nil {
314317
let sanitized = MLXChatCompletionsController.sanitizeStructuredOutput(
315318
fullText,
316-
responseFormat: chatReq.responseFormat
319+
responseFormat: effectiveResponseFormat
317320
)
318321
if !sanitized.isEmpty {
319322
delta["content"] = sanitized
@@ -365,7 +368,7 @@ struct BatchCompletionsController: RouteCollection {
365368
} else {
366369
message["content"] = MLXChatCompletionsController.sanitizeStructuredOutput(
367370
collected.content ?? "",
368-
responseFormat: chatReq.responseFormat
371+
responseFormat: effectiveResponseFormat
369372
)
370373
if let reasoning = collected.reasoningContent {
371374
message["reasoning_content"] = reasoning

Sources/MacLocalAPI/Controllers/MLXChatCompletionsController.swift

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,12 @@ struct MLXChatCompletionsController: RouteCollection {
116116
print("\(Self.red)[\(Self.timestamp())] RECV MLX user prompt:\n \(truncated)\(Self.reset)"); fflush(stdout)
117117
}
118118
}
119+
// Apply server-level --guided-json default when request omits response_format (#97)
120+
let effectiveResponseFormat = service.effectiveResponseFormat(requestFormat: chatRequest.responseFormat)
121+
119122
// Detect strict-mode downgrade: user requested grammar enforcement but admin didn't enable the engine
120123
let grammarDowngraded = MLXModelService.shouldDowngradeGrammarConstraints(
121-
responseFormat: chatRequest.responseFormat,
124+
responseFormat: effectiveResponseFormat,
122125
tools: chatRequest.tools,
123126
supportsStrictToolGrammar: service.supportsStrictToolGrammar,
124127
enableGrammarConstraints: service.enableGrammarConstraints
@@ -201,7 +204,7 @@ struct MLXChatCompletionsController: RouteCollection {
201204
let extractThinking = !rawOutput || isWebUI
202205

203206
if chatRequest.stream == true && streamingEnabled {
204-
return try await createStreamingResponse(req: req, chatRequest: chatRequest, extractThinking: extractThinking, grammarDowngraded: grammarDowngraded, requestId: reqId)
207+
return try await createStreamingResponse(req: req, chatRequest: chatRequest, extractThinking: extractThinking, effectiveResponseFormat: effectiveResponseFormat, grammarDowngraded: grammarDowngraded, requestId: reqId)
205208
}
206209

207210
// In concurrent mode, non-streaming requests currently bypass the
@@ -259,7 +262,7 @@ struct MLXChatCompletionsController: RouteCollection {
259262
topLogprobs: chatRequest.topLogprobs,
260263
tools: effectiveTools,
261264
stop: effectiveStop,
262-
responseFormat: chatRequest.responseFormat,
265+
responseFormat: effectiveResponseFormat,
263266
chatTemplateKwargs: chatRequest.chatTemplateKwargs,
264267
requestId: reqId
265268
)
@@ -369,7 +372,7 @@ struct MLXChatCompletionsController: RouteCollection {
369372
topLogprobs: chatRequest.topLogprobs,
370373
tools: effectiveTools,
371374
stop: effectiveStop,
372-
responseFormat: chatRequest.responseFormat,
375+
responseFormat: effectiveResponseFormat,
373376
chatTemplateKwargs: chatRequest.chatTemplateKwargs
374377
)
375378
}
@@ -381,7 +384,7 @@ struct MLXChatCompletionsController: RouteCollection {
381384
let sanitizeContent: (String) -> String = {
382385
Self.sanitizeStructuredOutput(
383386
self.sanitizeDegenerateTail($0),
384-
responseFormat: chatRequest.responseFormat
387+
responseFormat: effectiveResponseFormat
385388
)
386389
}
387390
let finalizedTurn = Self.finalizeAssistantTurn(
@@ -496,7 +499,7 @@ struct MLXChatCompletionsController: RouteCollection {
496499
}
497500
}
498501

499-
private func createStreamingResponse(req: Request, chatRequest: ChatCompletionRequest, extractThinking: Bool, grammarDowngraded: Bool = false, requestId: String = "") async throws -> Response {
502+
private func createStreamingResponse(req: Request, chatRequest: ChatCompletionRequest, extractThinking: Bool, effectiveResponseFormat: ResponseFormat?, grammarDowngraded: Bool = false, requestId: String = "") async throws -> Response {
500503
let httpResponse = Response(status: .ok)
501504
httpResponse.headers.add(name: .contentType, value: "text/event-stream")
502505
httpResponse.headers.add(name: .cacheControl, value: "no-cache")
@@ -530,11 +533,11 @@ struct MLXChatCompletionsController: RouteCollection {
530533
let effectivePresencePenalty = chatRequest.presencePenalty ?? self.presencePenalty
531534
let effectiveSeed = chatRequest.seed ?? self.seed
532535
let effectiveStop = self.mergeStopSequences(cliStop: self.stop, apiStop: chatRequest.stop)
533-
let deferStructuredOutputContent = Self.requiresStructuredOutputSanitization(chatRequest.responseFormat)
536+
let deferStructuredOutputContent = Self.requiresStructuredOutputSanitization(effectiveResponseFormat)
534537
let sanitizeContent: (String) -> String = {
535538
Self.sanitizeStructuredOutput(
536539
self.sanitizeDegenerateTail($0),
537-
responseFormat: chatRequest.responseFormat
540+
responseFormat: effectiveResponseFormat
538541
)
539542
}
540543

@@ -565,7 +568,7 @@ struct MLXChatCompletionsController: RouteCollection {
565568
topLogprobs: chatRequest.topLogprobs,
566569
tools: effectiveTools,
567570
stop: effectiveStop,
568-
responseFormat: chatRequest.responseFormat,
571+
responseFormat: effectiveResponseFormat,
569572
chatTemplateKwargs: chatRequest.chatTemplateKwargs,
570573
requestId: streamReqId
571574
)

Sources/MacLocalAPI/Controllers/MLXChatServing.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ protocol MLXChatServing {
3131
var thinkEndTag: String? { get }
3232
var fixToolArgs: Bool { get }
3333
var enableGrammarConstraints: Bool { get }
34+
var defaultGuidedJsonSchema: ResponseFormat? { get }
35+
36+
/// Resolve effective response format: per-request format wins, falls back to server default.
37+
func effectiveResponseFormat(requestFormat: ResponseFormat?) -> ResponseFormat?
3438

3539
func normalizeModel(_ raw: String) -> String
3640
func resolvedToolCallParser(logBypass: Bool) -> String?

Sources/MacLocalAPI/Models/MLXModelService.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,17 @@ final class MLXModelService: @unchecked Sendable {
129129
var kvBits: Int?
130130
var kvEvictionPolicy: String = "none" // "none" or "streaming"
131131
var enablePrefixCaching: Bool = false
132+
/// Server-level default JSON schema from `--guided-json` CLI flag.
133+
/// Applied to requests that don't specify their own response_format. (#97)
134+
var defaultGuidedJsonSchema: ResponseFormat?
135+
136+
/// Resolve the effective response format for an incoming request.
137+
/// Per-request `response_format` takes precedence over the server-level
138+
/// `--guided-json` default. (#97)
139+
func effectiveResponseFormat(requestFormat: ResponseFormat?) -> ResponseFormat? {
140+
requestFormat ?? defaultGuidedJsonSchema
141+
}
142+
132143
var enableGrammarConstraints: Bool = false { didSet { grammarConstraintsActive = enableGrammarConstraints } }
133144
var trace: Bool = false { didSet { traceLogging = trace } }
134145
var supportsStrictToolGrammar: Bool {

Sources/MacLocalAPI/main.swift

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,8 @@ struct MlxCommand: ParsableCommand {
507507
}
508508

509509
if let guidedJson {
510-
_ = try parseGuidedJsonSchema(guidedJson)
510+
let schema = try parseGuidedJsonSchema(guidedJson)
511+
service.defaultGuidedJsonSchema = ResponseFormat(type: "json_schema", jsonSchema: schema)
511512
}
512513

513514
_ = try service.revalidateRegistry()

0 commit comments

Comments
 (0)