Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 39 additions & 53 deletions packages/@n8n/instance-ai/evaluations/checklist/verifier.ts
Original file line number Diff line number Diff line change
@@ -1,39 +1,25 @@
import { createEvalAgent, extractText } from '../../src/utils/eval-agents';
import { z } from 'zod';

import { createEvalAgent } from '../../src/utils/eval-agents';
import type { WorkflowResponse } from '../clients/n8n-client';
import { MOCK_EXECUTION_VERIFY_PROMPT } from '../system-prompts/mock-execution-verify';
import type { ChecklistItem, ChecklistResult } from '../types';

// ---------------------------------------------------------------------------
// JSON parsing helpers
// Structured output schema
// ---------------------------------------------------------------------------

/**
 * Best-effort extraction of a JSON array from raw LLM output.
 *
 * Resolution order:
 *   1. If the text contains a fenced code block (``` or ```json), parse its
 *      contents; otherwise parse the whole trimmed text.
 *   2. If that parse throws, look for the widest `[...]` span anywhere in the
 *      candidate text and try parsing that slice.
 *   3. On any remaining failure (including a successful parse that yields a
 *      non-array value), log a warning and return an empty array — callers
 *      surface this as "No verification result".
 *
 * @param text - Raw text returned by the LLM.
 * @returns The parsed array, or `[]` when no JSON array could be recovered.
 */
function parseJsonArray(text: string): unknown[] {
  // Prefer the contents of a fenced code block when one is present.
  const fence = /```(?:json)?\s*\n?([\s\S]*?)```/.exec(text);
  const candidate = fence ? fence[1].trim() : text.trim();

  try {
    const direct: unknown = JSON.parse(candidate);
    if (Array.isArray(direct)) return direct;
  } catch {
    // Direct parse failed — salvage the widest bracketed span, if any.
    const embedded = /\[[\s\S]*\]/.exec(candidate);
    if (embedded) {
      try {
        const salvaged: unknown = JSON.parse(embedded[0]);
        if (Array.isArray(salvaged)) return salvaged;
      } catch {
        // Unsalvageable — drop through to the warning below.
      }
    }
  }

  // Log failure for debugging — this causes "No verification result"
  console.warn(
    `[verifier] Failed to parse JSON array from LLM response (${text.length} chars). First 500 chars: ${text.slice(0, 500)}`,
  );
  return [];
}
// Zod schema for the verifier's structured LLM output. Each entry pairs a
// checklist item id with a pass/fail verdict and free-text reasoning;
// failureCategory and rootCause are optional because the system prompt only
// asks for them when an item fails. Malformed LLM output fails validation
// instead of flowing downstream untyped.
const checklistResultSchema = z.object({
results: z.array(
z.object({
id: z.number(),
pass: z.boolean(),
reasoning: z.string(),
failureCategory: z.string().optional(),
rootCause: z.string().optional(),
}),
),
});

// ---------------------------------------------------------------------------
// Public API
Expand Down Expand Up @@ -61,34 +47,34 @@ Verify each checklist item against the artifact above.`;
const agent = createEvalAgent('eval-checklist-verifier', {
instructions: MOCK_EXECUTION_VERIFY_PROMPT,
cache: true,
});

const result = await agent.generate(userMessage, {
providerOptions: { anthropic: { maxTokens: 16_384 } },
});

const content = extractText(result);
}).structuredOutput(checklistResultSchema);

const rawResults = parseJsonArray(content);
const result = await agent.generate(userMessage);

const validIds = new Set(llmItems.map((i) => i.id));
for (const raw of rawResults) {
const entry = raw as Record<string, unknown>;
if (
typeof entry.id === 'number' &&
typeof entry.pass === 'boolean' &&
validIds.has(entry.id)
) {
results.push({
id: entry.id,
pass: entry.pass,
reasoning: typeof entry.reasoning === 'string' ? entry.reasoning : '',
strategy: 'llm',
failureCategory:
typeof entry.failureCategory === 'string' ? entry.failureCategory : undefined,
rootCause: typeof entry.rootCause === 'string' ? entry.rootCause : undefined,
});
const parsed = result.structuredOutput as z.infer<typeof checklistResultSchema> | undefined;

if (parsed?.results) {
for (const entry of parsed.results) {
if (
typeof entry.id === 'number' &&
typeof entry.pass === 'boolean' &&
validIds.has(entry.id)
) {
results.push({
id: entry.id,
pass: entry.pass,
reasoning: entry.reasoning ?? '',
strategy: 'llm',
failureCategory: entry.failureCategory,
rootCause: entry.rootCause,
});
}
}
} else {
console.warn(
'[verifier] structuredOutput returned null — LLM did not produce parseable results',
);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
{
"name": "happy-path",
"description": "GitHub returns issues, each is synced to Notion",
"dataSetup": "The GitHub API returns 3 open bug issues. Issue 1: title='Login timeout on mobile', created_at='2026-03-15T10:00:00Z', assignee.login='alice', html_url='https://github.com/acme-corp/backend/issues/142'. Issue 2: title='API rate limit not enforced', created_at='2026-03-20T14:30:00Z', assignee=null, html_url='https://github.com/acme-corp/backend/issues/155'. Issue 3: title='Memory leak in worker pool', created_at='2026-03-22T09:00:00Z', assignee.login='bob', html_url='https://github.com/acme-corp/backend/issues/158'. Each Notion create-page call returns a success response.",
"dataSetup": "The GitHub API returns 3 open bug issues. Issue 1: title='Login timeout on mobile', created_at='2026-03-15T10:00:00Z', assignee.login='alice', html_url='https://github.com/acme-corp/backend/issues/142'. Issue 2: title='API rate limit not enforced', created_at='2026-03-20T14:30:00Z', assignee=null, html_url='https://github.com/acme-corp/backend/issues/155'. Issue 3: title='Memory leak in worker pool', created_at='2026-03-22T09:00:00Z', assignee.login='bob', html_url='https://github.com/acme-corp/backend/issues/158'. Each Notion create-page call returns a minimal success response with just {object: 'page', id: '<uuid>'}.",
"successCriteria": "The workflow executes without errors. All 3 issues are fetched from GitHub. 3 pages are created in Notion with the correct titles, URLs, dates, and assignees. The unassigned issue (Issue 2) has 'Unassigned' as the assignee value."
},
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
"prompt": "Fetch the latest 10 posts from the JSONPlaceholder API (GET https://jsonplaceholder.typicode.com/posts with query parameter _limit=10). Filter out any posts where the title contains the word 'qui'. Then post a summary message to a Slack channel called #api-digest that says how many posts remain and lists their titles. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"prompt": "Fetch the latest posts from the JSONPlaceholder API (GET https://jsonplaceholder.typicode.com/posts). Filter out any posts where the title contains the word 'qui'. Then post a summary message to a Slack channel called #api-digest that says how many posts remain and lists their titles. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "medium",
"tags": ["build", "http-request", "slack", "data-transformation", "schedule"],
"triggerType": "schedule",
"scenarios": [
{
"name": "happy-path",
"description": "API returns 10 posts, some get filtered, summary posted to Slack",
"dataSetup": "The HTTP Request node returns 10 JSON objects with fields: id, userId, title, body. Some of them should have 'qui' in the title. The Slack node returns a success response.",
"successCriteria": "The workflow executes without errors. The HTTP Request fetches data successfully. Posts containing 'qui' in the title are filtered out — fewer posts remain than the original 10. The Slack message is posted to #api-digest with the count and titles of the remaining posts. None of the titles in the Slack message should contain the word 'qui'."
"description": "API returns posts, some contain 'qui' and get filtered, summary posted to Slack",
"dataSetup": "The HTTP Request node returns JSON objects with fields: id, userId, title, body. Some of them should have 'qui' in the title (e.g. 'qui est esse', 'nesciunt quid non qui'). Others should NOT contain 'qui'. The Slack node returns a success response.",
"successCriteria": "The workflow executes without errors. Posts containing 'qui' in the title are filtered out. The Slack message is posted to #api-digest with the titles of the remaining posts. None of the titles in the Slack message should contain the word 'qui'."
},
{
"name": "empty-response",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ The verification artifact contains:
## Failure categories

When a checklist item fails, categorize the root cause:
- **builder_issue**: The AI agent that built the workflow misconfigured a node (missing parameters, wrong settings, incomplete config, wrong routing logic, missing nodes). Evidence: configIssues flags, nodes crashing before making HTTP requests, Switch/IF nodes missing required options, workflow structure doesn't match what the prompt asked for.
- **mock_issue**: The LLM mock handler returned incorrect or missing data. Evidence: _evalMockError in responses, mock response shape doesn't match what the node expects, identical responses for different requests, mock data missing fields that downstream nodes reference.
- **builder_issue**: The AI agent that built the workflow misconfigured a node (missing parameters, wrong settings, incomplete config, wrong routing logic, missing nodes). Evidence: configIssues flags, nodes crashing before making HTTP requests, Switch/IF nodes missing required options, workflow structure doesn't match what the prompt asked for. Also applies when a filter/code node receives correct input data but produces wrong output — this means the node logic is wrong, not the mock data.
- **mock_issue**: The LLM mock handler returned incorrect or missing data. Evidence: _evalMockError in responses, mock response shape doesn't match what the node expects, mock data missing fields that downstream nodes reference. IMPORTANT: Trace the data flow carefully — if the mock returned correct data but a downstream filter or code node transformed it incorrectly, that is a builder_issue, not a mock_issue.
- **legitimate_failure**: The workflow genuinely doesn't meet the success criteria and neither the builder nor mock is at fault. The test is working as designed — for example, the workflow lacks error handling that the scenario tests for.
- **framework_issue**: The evaluation framework itself failed — Phase 1 returned an error or empty trigger content, causing cascading failures. Evidence: pre-analysis flags starting with "FRAMEWORK ISSUE", empty trigger node output (empty JSON object), "Phase 1 error" warnings. When this happens, downstream node crashes are a consequence of the empty input, NOT a builder or mock problem.
- **verification_gap**: You don't have enough information in the artifact to make a determination.
Expand Down
48 changes: 31 additions & 17 deletions packages/cli/src/modules/instance-ai/eval/mock-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,17 @@ Respond with ONLY a JSON object. No explanation, no markdown, no prose.
// Types
// ---------------------------------------------------------------------------

// Retries after the first failed mock-generation attempt (so 2 LLM calls total
// by default) — see the retry loop in generateMockResponse.
const DEFAULT_MAX_RETRIES = 1;

// Configuration for the LLM-backed mock handler that fabricates HTTP
// responses during workflow evaluation runs. All fields are optional;
// omitted hints simply leave the LLM unguided for that aspect.
interface MockHandlerOptions {
/** Optional scenario description — steers the LLM toward specific behavior (errors, edge cases) */
scenarioHints?: string;
/** Pre-generated consistent data context from Phase 1 (generateMockHints) */
globalContext?: string;
/** Per-node data hints from Phase 1, keyed by node name */
nodeHints?: Record<string, string>;
/** Max retries on mock generation failure (default: 1) */
maxRetries?: number;
}

/** Structured response spec returned by the LLM */
Expand Down Expand Up @@ -114,6 +118,7 @@ export function createLlmMockHandler(options?: MockHandlerOptions): EvalLlmMockH
globalContext: options?.globalContext,
nodeHint: options?.nodeHints?.[node.name],
nodeConfig: nodeConfigCache.get(node.name) ?? '',
maxRetries: options?.maxRetries ?? DEFAULT_MAX_RETRIES,
});
};
}
Expand All @@ -127,6 +132,7 @@ interface MockResponseContext {
globalContext?: string;
nodeHint?: string;
nodeConfig: string;
maxRetries: number;
}

async function generateMockResponse(
Expand Down Expand Up @@ -191,21 +197,31 @@ async function generateMockResponse(

const userPrompt = sections.join('\n');

try {
const spec = await callLlm(userPrompt, context.nodeConfig);
return materializeSpec(spec);
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error);
const safeUrl = extractEndpoint(request.url);
Container.get(Logger).error(
`[EvalMock] Mock generation failed for ${request.method ?? 'GET'} ${safeUrl}: ${errorMsg}`,
);
return {
body: { _evalMockError: true, message: `Mock generation failed: ${errorMsg}` },
headers: { 'content-type': 'application/json' },
statusCode: 200,
};
const safeUrl = extractEndpoint(request.url);
let lastError = '';

for (let attempt = 0; attempt <= context.maxRetries; attempt++) {
try {
const spec = await callLlm(userPrompt, context.nodeConfig);
return materializeSpec(spec);
} catch (error) {
lastError = error instanceof Error ? error.message : String(error);
if (attempt < context.maxRetries) {
Container.get(Logger).warn(
`[EvalMock] Mock generation failed for ${request.method ?? 'GET'} ${safeUrl}, retrying (${attempt + 1}/${context.maxRetries}): ${lastError}`,
);
}
}
}

Container.get(Logger).error(
`[EvalMock] Mock generation failed for ${request.method ?? 'GET'} ${safeUrl} after ${context.maxRetries + 1} attempts: ${lastError}`,
);
return {
body: { _evalMockError: true, message: `Mock generation failed: ${lastError}` },
headers: { 'content-type': 'application/json' },
statusCode: 200,
};
}

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -252,9 +268,7 @@ async function callLlm(userPrompt: string, nodeConfig: string): Promise<MockResp
.tool(apiDocsTool)
.tool(createNodeConfigTool(nodeConfig));

const result = await agent.generate(userPrompt, {
providerOptions: { anthropic: { maxTokens: 4096 } },
});
const result = await agent.generate(userPrompt);

const text: string = extractText(result);
return parseResponseText(text);
Expand Down
Loading