Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 39 additions & 53 deletions packages/@n8n/instance-ai/evaluations/checklist/verifier.ts
Original file line number Diff line number Diff line change
@@ -1,39 +1,25 @@
import { createEvalAgent, extractText } from '../../src/utils/eval-agents';
import { z } from 'zod';

import { createEvalAgent } from '../../src/utils/eval-agents';
import type { WorkflowResponse } from '../clients/n8n-client';
import { MOCK_EXECUTION_VERIFY_PROMPT } from '../system-prompts/mock-execution-verify';
import type { ChecklistItem, ChecklistResult } from '../types';

// ---------------------------------------------------------------------------
// JSON parsing helpers
// Structured output schema
// ---------------------------------------------------------------------------

/**
 * Best-effort extraction of a JSON array from raw LLM output.
 *
 * Resolution order:
 *   1. If the text contains a fenced code block (``` or ```json), parse its
 *      contents; otherwise parse the whole trimmed text.
 *   2. If that parse throws, look for the widest `[...]` span anywhere in the
 *      candidate text and try parsing that slice.
 *   3. On any remaining failure (including a successful parse that yields a
 *      non-array value), log a warning and return an empty array — callers
 *      surface this as "No verification result".
 *
 * @param text - Raw text returned by the LLM.
 * @returns The parsed array, or `[]` when no JSON array could be recovered.
 */
function parseJsonArray(text: string): unknown[] {
  // Prefer the contents of a fenced code block when one is present.
  const fence = /```(?:json)?\s*\n?([\s\S]*?)```/.exec(text);
  const candidate = fence ? fence[1].trim() : text.trim();

  try {
    const direct: unknown = JSON.parse(candidate);
    if (Array.isArray(direct)) return direct;
  } catch {
    // Direct parse failed — salvage the widest bracketed span, if any.
    const embedded = /\[[\s\S]*\]/.exec(candidate);
    if (embedded) {
      try {
        const salvaged: unknown = JSON.parse(embedded[0]);
        if (Array.isArray(salvaged)) return salvaged;
      } catch {
        // Unsalvageable — drop through to the warning below.
      }
    }
  }

  // Log failure for debugging — this causes "No verification result"
  console.warn(
    `[verifier] Failed to parse JSON array from LLM response (${text.length} chars). First 500 chars: ${text.slice(0, 500)}`,
  );
  return [];
}
// Zod schema for the verifier's structured LLM output. Each entry pairs a
// checklist item id with a pass/fail verdict and free-text reasoning;
// failureCategory and rootCause are optional because the system prompt only
// asks for them when an item fails. Malformed LLM output fails validation
// instead of flowing downstream untyped.
const checklistResultSchema = z.object({
results: z.array(
z.object({
id: z.number(),
pass: z.boolean(),
reasoning: z.string(),
failureCategory: z.string().optional(),
rootCause: z.string().optional(),
}),
),
});

// ---------------------------------------------------------------------------
// Public API
Expand Down Expand Up @@ -61,34 +47,34 @@ Verify each checklist item against the artifact above.`;
const agent = createEvalAgent('eval-checklist-verifier', {
instructions: MOCK_EXECUTION_VERIFY_PROMPT,
cache: true,
});

const result = await agent.generate(userMessage, {
providerOptions: { anthropic: { maxTokens: 16_384 } },
});

const content = extractText(result);
}).structuredOutput(checklistResultSchema);

const rawResults = parseJsonArray(content);
const result = await agent.generate(userMessage);

const validIds = new Set(llmItems.map((i) => i.id));
for (const raw of rawResults) {
const entry = raw as Record<string, unknown>;
if (
typeof entry.id === 'number' &&
typeof entry.pass === 'boolean' &&
validIds.has(entry.id)
) {
results.push({
id: entry.id,
pass: entry.pass,
reasoning: typeof entry.reasoning === 'string' ? entry.reasoning : '',
strategy: 'llm',
failureCategory:
typeof entry.failureCategory === 'string' ? entry.failureCategory : undefined,
rootCause: typeof entry.rootCause === 'string' ? entry.rootCause : undefined,
});
const parsed = result.structuredOutput as z.infer<typeof checklistResultSchema> | undefined;

if (parsed?.results) {
for (const entry of parsed.results) {
if (
typeof entry.id === 'number' &&
typeof entry.pass === 'boolean' &&
validIds.has(entry.id)
) {
results.push({
id: entry.id,
pass: entry.pass,
reasoning: entry.reasoning ?? '',
strategy: 'llm',
failureCategory: entry.failureCategory,
rootCause: entry.rootCause,
});
}
}
} else {
console.warn(
'[verifier] structuredOutput returned null — LLM did not produce parseable results',
);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
{
"name": "happy-path",
"description": "GitHub returns issues, each is synced to Notion",
"dataSetup": "The GitHub API returns 3 open bug issues. Issue 1: title='Login timeout on mobile', created_at='2026-03-15T10:00:00Z', assignee.login='alice', html_url='https://github.com/acme-corp/backend/issues/142'. Issue 2: title='API rate limit not enforced', created_at='2026-03-20T14:30:00Z', assignee=null, html_url='https://github.com/acme-corp/backend/issues/155'. Issue 3: title='Memory leak in worker pool', created_at='2026-03-22T09:00:00Z', assignee.login='bob', html_url='https://github.com/acme-corp/backend/issues/158'. Each Notion create-page call returns a success response.",
"dataSetup": "The GitHub API returns 3 open bug issues. Issue 1: title='Login timeout on mobile', created_at='2026-03-15T10:00:00Z', assignee.login='alice', html_url='https://github.com/acme-corp/backend/issues/142'. Issue 2: title='API rate limit not enforced', created_at='2026-03-20T14:30:00Z', assignee=null, html_url='https://github.com/acme-corp/backend/issues/155'. Issue 3: title='Memory leak in worker pool', created_at='2026-03-22T09:00:00Z', assignee.login='bob', html_url='https://github.com/acme-corp/backend/issues/158'. Each Notion create-page call returns a minimal success response with just {object: 'page', id: '<uuid>'}.",
"successCriteria": "The workflow executes without errors. All 3 issues are fetched from GitHub. 3 pages are created in Notion with the correct titles, URLs, dates, and assignees. The unassigned issue (Issue 2) has 'Unassigned' as the assignee value."
},
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
"prompt": "Fetch the latest 10 posts from the JSONPlaceholder API (GET https://jsonplaceholder.typicode.com/posts with query parameter _limit=10). Filter out any posts where the title contains the word 'qui'. Then post a summary message to a Slack channel called #api-digest that says how many posts remain and lists their titles. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"prompt": "Fetch the latest posts from the JSONPlaceholder API (GET https://jsonplaceholder.typicode.com/posts). Filter out any posts where the title contains the word 'qui'. Then post a summary message to a Slack channel called #api-digest that says how many posts remain and lists their titles. Configure all nodes as completely as possible and don't ask me for credentials, I'll set them up later.",
"complexity": "medium",
"tags": ["build", "http-request", "slack", "data-transformation", "schedule"],
"triggerType": "schedule",
"scenarios": [
{
"name": "happy-path",
"description": "API returns 10 posts, some get filtered, summary posted to Slack",
"dataSetup": "The HTTP Request node returns 10 JSON objects with fields: id, userId, title, body. Some of them should have 'qui' in the title. The Slack node returns a success response.",
"successCriteria": "The workflow executes without errors. The HTTP Request fetches data successfully. Posts containing 'qui' in the title are filtered out — fewer posts remain than the original 10. The Slack message is posted to #api-digest with the count and titles of the remaining posts. None of the titles in the Slack message should contain the word 'qui'."
"description": "API returns posts, some contain 'qui' and get filtered, summary posted to Slack",
"dataSetup": "The HTTP Request node returns JSON objects with fields: id, userId, title, body. Some of them should have 'qui' in the title (e.g. 'qui est esse', 'nesciunt quid non qui'). Others should NOT contain 'qui'. The Slack node returns a success response.",
"successCriteria": "The workflow executes without errors. Posts containing 'qui' in the title are filtered out. The Slack message is posted to #api-digest with the titles of the remaining posts. None of the titles in the Slack message should contain the word 'qui'."
},
{
"name": "empty-response",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ The verification artifact contains:
## Failure categories

When a checklist item fails, categorize the root cause:
- **builder_issue**: The AI agent that built the workflow misconfigured a node (missing parameters, wrong settings, incomplete config, wrong routing logic, missing nodes). Evidence: configIssues flags, nodes crashing before making HTTP requests, Switch/IF nodes missing required options, workflow structure doesn't match what the prompt asked for.
- **mock_issue**: The LLM mock handler returned incorrect or missing data. Evidence: _evalMockError in responses, mock response shape doesn't match what the node expects, identical responses for different requests, mock data missing fields that downstream nodes reference.
- **builder_issue**: The AI agent that built the workflow misconfigured a node (missing parameters, wrong settings, incomplete config, wrong routing logic, missing nodes). Evidence: configIssues flags, nodes crashing before making HTTP requests, Switch/IF nodes missing required options, workflow structure doesn't match what the prompt asked for. Also applies when a filter/code node receives correct input data but produces wrong output — this means the node logic is wrong, not the mock data.
- **mock_issue**: The LLM mock handler returned incorrect or missing data. Evidence: _evalMockError in responses, mock response shape doesn't match what the node expects, mock data missing fields that downstream nodes reference. IMPORTANT: Trace the data flow carefully — if the mock returned correct data but a downstream filter or code node transformed it incorrectly, that is a builder_issue, not a mock_issue.
- **legitimate_failure**: The workflow genuinely doesn't meet the success criteria and neither the builder nor mock is at fault. The test is working as designed — for example, the workflow lacks error handling that the scenario tests for.
- **framework_issue**: The evaluation framework itself failed — Phase 1 returned an error or empty trigger content, causing cascading failures. Evidence: pre-analysis flags starting with "FRAMEWORK ISSUE", empty trigger node output (empty JSON object), "Phase 1 error" warnings. When this happens, downstream node crashes are a consequence of the empty input, NOT a builder or mock problem.
- **verification_gap**: You don't have enough information in the artifact to make a determination.
Expand Down
48 changes: 31 additions & 17 deletions packages/cli/src/modules/instance-ai/eval/mock-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,17 @@ Respond with ONLY a JSON object. No explanation, no markdown, no prose.
// Types
// ---------------------------------------------------------------------------

// Retries after the first failed mock-generation attempt (so 2 LLM calls total
// by default) — see the retry loop in generateMockResponse.
const DEFAULT_MAX_RETRIES = 1;

// Configuration for the LLM-backed mock handler that fabricates HTTP
// responses during workflow evaluation runs. All fields are optional;
// omitted hints simply leave the LLM unguided for that aspect.
interface MockHandlerOptions {
/** Optional scenario description — steers the LLM toward specific behavior (errors, edge cases) */
scenarioHints?: string;
/** Pre-generated consistent data context from Phase 1 (generateMockHints) */
globalContext?: string;
/** Per-node data hints from Phase 1, keyed by node name */
nodeHints?: Record<string, string>;
/** Max retries on mock generation failure (default: 1) */
maxRetries?: number;
}

/** Structured response spec returned by the LLM */
Expand Down Expand Up @@ -114,6 +118,7 @@ export function createLlmMockHandler(options?: MockHandlerOptions): EvalLlmMockH
globalContext: options?.globalContext,
nodeHint: options?.nodeHints?.[node.name],
nodeConfig: nodeConfigCache.get(node.name) ?? '',
maxRetries: options?.maxRetries ?? DEFAULT_MAX_RETRIES,
});
};
}
Expand All @@ -127,6 +132,7 @@ interface MockResponseContext {
globalContext?: string;
nodeHint?: string;
nodeConfig: string;
maxRetries: number;
}

async function generateMockResponse(
Expand Down Expand Up @@ -191,21 +197,31 @@ async function generateMockResponse(

const userPrompt = sections.join('\n');

try {
const spec = await callLlm(userPrompt, context.nodeConfig);
return materializeSpec(spec);
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error);
const safeUrl = extractEndpoint(request.url);
Container.get(Logger).error(
`[EvalMock] Mock generation failed for ${request.method ?? 'GET'} ${safeUrl}: ${errorMsg}`,
);
return {
body: { _evalMockError: true, message: `Mock generation failed: ${errorMsg}` },
headers: { 'content-type': 'application/json' },
statusCode: 200,
};
const safeUrl = extractEndpoint(request.url);
let lastError = '';

for (let attempt = 0; attempt <= context.maxRetries; attempt++) {
try {
const spec = await callLlm(userPrompt, context.nodeConfig);
return materializeSpec(spec);
} catch (error) {
lastError = error instanceof Error ? error.message : String(error);
if (attempt < context.maxRetries) {
Container.get(Logger).warn(
`[EvalMock] Mock generation failed for ${request.method ?? 'GET'} ${safeUrl}, retrying (${attempt + 1}/${context.maxRetries}): ${lastError}`,
);
}
}
}

Container.get(Logger).error(
`[EvalMock] Mock generation failed for ${request.method ?? 'GET'} ${safeUrl} after ${context.maxRetries + 1} attempts: ${lastError}`,
);
return {
body: { _evalMockError: true, message: `Mock generation failed: ${lastError}` },
headers: { 'content-type': 'application/json' },
statusCode: 200,
};
}

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -252,9 +268,7 @@ async function callLlm(userPrompt: string, nodeConfig: string): Promise<MockResp
.tool(apiDocsTool)
.tool(createNodeConfigTool(nodeConfig));

const result = await agent.generate(userPrompt, {
providerOptions: { anthropic: { maxTokens: 4096 } },
});
const result = await agent.generate(userPrompt);

const text: string = extractText(result);
return parseResponseText(text);
Expand Down
Loading