Skip to content

Commit 5e5838d

Browse files
committed
Add dist for quick local usage
1 parent b798515 commit 5e5838d

42 files changed

Lines changed: 5835 additions & 0 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

dist/assertion.d.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import { AssertionOptions } from "./types";
2+
/**
 * Multi-model consensus assertion engine.
 * Runs Claude and Gemini in parallel; if they disagree, a third model (arbiter) makes the final call.
 * An assertion passes only if both models agree (or the arbiter decides).
 * Automatically retries failed assertions with a fresh page snapshot, up to `maxRetries` times.
 *
 * @param options - Assertion configuration
 * @param options.page - The Playwright page instance to take snapshots from
 * @param options.assertion - Natural language assertion to validate (e.g. "The cart shows 3 items")
 * @param options.expect - Playwright expect function, used to fail the test on assertion failure
 * @param options.effort - "low" (default) or "high" — high enables thinking mode for deeper analysis
 * @param options.images - Optional base64 screenshot images to provide to the models
 * @param options.failSilently - When true, returns the result without failing the test
 * @param options.test - Playwright test instance for attaching metadata
 * @param options.maxRetries - Maximum number of re-runs after a failed verdict (the implementation in dist/assertion.js defaults this to 1)
 * @param options.onRetry - Callback invoked before each re-run with the zero-based retry index and the previous result
 * @returns A string summary of the assertion result
 * @throws Fails the Playwright test via expect when assertion fails (unless failSilently is true)
 *
 * @example
 * ```typescript
 * await assert({
 *   page,
 *   assertion: "The dashboard shows 3 active projects",
 *   expect,
 *   effort: "high",
 * });
 * ```
 */
29+
export declare const assert: ({ page, assertion, test, expect, effort, images, failSilently, maxRetries, onRetry, }: AssertionOptions) => Promise<string>;

dist/assertion.js

Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
"use strict";
2+
Object.defineProperty(exports, "__esModule", { value: true });
3+
exports.assert = void 0;
4+
const ai_1 = require("ai");
5+
const zod_1 = require("zod");
6+
const config_1 = require("./config");
7+
const constants_1 = require("./constants");
8+
const logger_1 = require("./logger");
9+
const models_1 = require("./models");
10+
const utils_1 = require("./utils");
11+
// Shape of the structured verdict that every model call must produce:
// a pass/fail flag, a confidence score and a short justification.
const assertionSchema = (() => {
    const { z } = zod_1;
    const passedFlag = z
        .boolean()
        .describe("Indicates whether the assertion passed or not.");
    const confidence = z
        .number()
        .describe("Confidence score of the assertion, between 0 and 100.");
    const explanation = z
        .string()
        .describe("Brief explanation of the reasoning behind your decision - explain why the assertion passed or failed.");
    return z.object({
        assertionPassed: passedFlag,
        confidenceScore: confidence,
        reasoning: explanation,
    });
})();
20+
/**
 * Multi-model consensus assertion engine.
 *
 * Runs the configured primary and secondary models in parallel against the same
 * evidence. If their verdicts differ, the configured arbiter model (always run
 * with a thinking budget) makes the final call; otherwise the shared verdict is
 * used, the confidence is the rounded average, and the secondary model's
 * reasoning is reported.
 *
 * Two independent retry mechanisms exist:
 * - a model call that throws or times out is retried once with the same
 *   evidence, after which the error is logged and rethrown;
 * - a *failed verdict* is re-evaluated with freshly captured evidence up to
 *   `maxRetries` times.
 *
 * @param options - Assertion configuration
 * @param options.page - The Playwright page instance to take snapshots from
 * @param options.assertion - Natural language assertion to validate (e.g. "The cart shows 3 items")
 * @param options.expect - Playwright expect function, used to fail the test on assertion failure
 * @param options.effort - "low" (default) or "high" — high enables thinking mode for the primary and secondary models
 * @param options.images - Optional base64 screenshot images to provide to the models.
 *   When omitted or empty, a viewport screenshot and the accessibility snapshot of the page are used instead.
 * @param options.failSilently - When true, returns the result without failing the test
 * @param options.test - Playwright test instance; the final reasoning is attached as an "AI Summary" annotation
 * @param options.maxRetries - Maximum number of re-runs after a failed verdict (default 1)
 * @param options.onRetry - Called before each re-run with the zero-based retry index and the previous result (default: no-op)
 * @returns A string summary of the assertion result
 * @throws Fails the Playwright test via expect when assertion fails (unless failSilently is true);
 *   rethrows the underlying error when the model calls keep failing after the error retry
 *
 * @example
 * ```typescript
 * await assert({
 *   page,
 *   assertion: "The dashboard shows 3 active projects",
 *   expect,
 *   effort: "high",
 * });
 * ```
 */
const assert = async ({ page, assertion, test, expect, effort = "low", images, failSilently, maxRetries = 1, onRetry = (retryCount, previousResult) => { }, }) => {
    const thinkingEnabled = effort === "high";
    // An empty array is truthy; treating it as "images supplied" would send the
    // models neither a screenshot nor a snapshot. Only a non-empty array counts.
    const hasImages = Array.isArray(images) && images.length > 0;
    // Number of extra attempts when a model call throws or times out.
    const MAX_ERROR_RETRIES = 1;
    // Evaluation rules shared verbatim by the base prompt and the arbiter prompt.
    const sharedRules = [
        "- First use the attached screenshot(s) to visually inspect the page and try to verify the assertion.",
        "- Only if the screenshot is not sufficient, use the accessibility snapshot (if supplied) to verify the assertion.",
        "- Don't create additional assertion conditions on your own - only consider the exact assertion provided above.",
        "- The assertion should pass if either the screenshot or the accessibility snapshot supports it.",
        "- Don't be overly strict or pedantic about exact wording. Focus on the intent and objective of the assertion rather than literal text matching.",
        "- Think like a practical QA tester - if the core functionality or state being asserted is present, the assertion should pass even if minor details differ.",
    ].join("\n");
    // One complete evaluation: capture evidence, query the models, reconcile.
    const runFullAssertion = async () => {
        const snapshot = await (0, utils_1.safeSnapshot)(page);
        const imageContent = hasImages
            ? images.map((image) => ({ type: "image", image }))
            : [
                {
                    type: "image",
                    image: (await (0, utils_1.resolvePage)(page).screenshot({ fullPage: false })).toString("base64"),
                },
            ];
        // Description of the evidence the models are given.
        const evidenceList = hasImages
            ? "- Screenshots from various stages of the user flow"
            : `
- An accessibility snapshot of the current page, which provides a detailed structure of the DOM
- A screenshot of the current page`;
        // The accessibility snapshot is only included for live-page evaluation.
        const snapshotSection = hasImages
            ? ""
            : `
<Snapshot>
${snapshot}
</Snapshot>
`;
        const basePrompt = `
You are an AI-powered QA Agent designed to test web applications.

You have access to the following information. Based on this information, you'll tell us whether the assertion provided below should pass or not.
${evidenceList}

${snapshotSection}

<Assertion>
${assertion}
</Assertion>

<Rules>
${sharedRules}
</Rules>

<OutputFormat>
The output should contain the following information:
- \`assertionPassed\`: A boolean indicating whether the assertion passed or not.
- \`confidenceScore\`: A number between 0 and 100 indicating the confidence score of the assertion.
- \`reasoning\`: A brief string explaining the reasoning behind the assertion.
</OutputFormat>

Never hallucinate. Be truthful and if you are not sure, use a low confidence score.
`;
        const messages = [
            {
                role: "user",
                content: [
                    {
                        type: "text",
                        text: basePrompt,
                    },
                    ...imageContent,
                ],
            },
        ];
        // Primary model ("assertionPrimary"): free-text answer first, then a
        // second call to the same model converts it to the structured schema.
        const getClaudeAssertion = async () => {
            const { text } = await (0, ai_1.generateText)({
                model: (0, models_1.resolveModel)((0, config_1.getModelId)("assertionPrimary")),
                temperature: 0,
                providerOptions: thinkingEnabled
                    ? {
                        anthropic: {
                            thinking: { type: "enabled", budgetTokens: constants_1.THINKING_BUDGET_DEFAULT },
                        },
                        openrouter: {
                            reasoning: { max_tokens: constants_1.THINKING_BUDGET_DEFAULT },
                        },
                    }
                    : undefined,
                messages,
            });
            const { output } = await (0, ai_1.generateText)({
                model: (0, models_1.resolveModel)((0, config_1.getModelId)("assertionPrimary")),
                temperature: 0.1,
                prompt: `Convert the following text output into a valid JSON object with the specified properties:\n\n${text}`,
                output: ai_1.Output.object({ schema: assertionSchema }),
            });
            return output;
        };
        // Secondary model ("assertionSecondary"): structured output in one call.
        const getGeminiAssertion = async () => {
            const { output } = await (0, ai_1.generateText)({
                model: (0, models_1.resolveModel)((0, config_1.getModelId)("assertionSecondary")),
                temperature: 0,
                providerOptions: thinkingEnabled
                    ? {
                        google: {
                            thinkingConfig: {
                                thinkingBudget: constants_1.THINKING_BUDGET_DEFAULT,
                            },
                        },
                        openrouter: {
                            reasoning: { max_tokens: constants_1.THINKING_BUDGET_DEFAULT },
                        },
                    }
                    : undefined,
                messages,
                output: ai_1.Output.object({ schema: assertionSchema }),
            });
            return output;
        };
        // Arbiter model ("assertionArbiter"): consulted only on disagreement and
        // always given a thinking budget, regardless of `effort`.
        const getArbiterDecision = async (claudeResult, geminiResult) => {
            const arbiterPrompt = `
You are an AI arbiter tasked with resolving a disagreement between two AI models about an assertion.

Claude's Assessment:
- Assertion Passed: ${claudeResult.assertionPassed}
- Confidence: ${claudeResult.confidenceScore}%
- Reasoning: ${claudeResult.reasoning}

Gemini's Assessment:
- Assertion Passed: ${geminiResult.assertionPassed}
- Confidence: ${geminiResult.confidenceScore}%
- Reasoning: ${geminiResult.reasoning}

${snapshotSection}

<Assertion>
${assertion}
</Assertion>

Please carefully review the evidence (screenshot and accessibility snapshot (when provided)) and make the final determination. Consider both models' reasoning but make your own independent assessment.

<Rules>
- Make your own independent evaluation based on the evidence
- Don't simply pick one model's answer - analyze the situation yourself
- Provide clear reasoning for your decision
- Be decisive - this is the final answer
${sharedRules}
</Rules>
`;
            const arbiterMessages = [
                {
                    role: "user",
                    content: [
                        {
                            type: "text",
                            text: arbiterPrompt,
                        },
                        ...imageContent,
                    ],
                },
            ];
            const { output } = await (0, ai_1.generateText)({
                model: (0, models_1.resolveModel)((0, config_1.getModelId)("assertionArbiter")),
                temperature: 0,
                providerOptions: {
                    google: {
                        thinkingConfig: {
                            thinkingBudget: constants_1.THINKING_BUDGET_DEFAULT,
                        },
                    },
                    openrouter: {
                        reasoning: { max_tokens: constants_1.THINKING_BUDGET_DEFAULT },
                    },
                },
                messages: arbiterMessages,
                output: ai_1.Output.object({ schema: assertionSchema }),
            });
            return output;
        };
        // Query both models once and reconcile their verdicts.
        const evaluateOnce = async () => {
            // Run both models in parallel for speed optimization
            const [claudeResult, geminiResult] = await Promise.all([
                (0, utils_1.withTimeout)(getClaudeAssertion(), constants_1.ASSERTION_MODEL_TIMEOUT),
                (0, utils_1.withTimeout)(getGeminiAssertion(), constants_1.ASSERTION_MODEL_TIMEOUT),
            ]);
            if (claudeResult.assertionPassed !== geminiResult.assertionPassed) {
                logger_1.logger.debug("Models disagree on assertion result, consulting arbiter...");
                const arbiterResult = await (0, utils_1.withTimeout)(getArbiterDecision(claudeResult, geminiResult), constants_1.ASSERTION_MODEL_TIMEOUT);
                return {
                    assertionPassed: arbiterResult.assertionPassed,
                    confidenceScore: arbiterResult.confidenceScore,
                    reasoning: arbiterResult.reasoning,
                };
            }
            // Both verdicts are equal here, so this is simply the shared verdict.
            const assertionPassed = claudeResult.assertionPassed && geminiResult.assertionPassed;
            const averageConfidence = (claudeResult.confidenceScore + geminiResult.confidenceScore) / 2;
            return {
                assertionPassed,
                confidenceScore: Math.round(averageConfidence),
                // For now take the secondary model's reasoning for simplicity
                reasoning: geminiResult.reasoning,
            };
        };
        // Retry the model round-trip when it throws (API error or timeout).
        for (let attempt = 0;; attempt++) {
            try {
                return await evaluateOnce();
            }
            catch (error) {
                if (attempt < MAX_ERROR_RETRIES) {
                    logger_1.logger.debug("Retrying assertion due to error...");
                    continue;
                }
                logger_1.logger.error({ err: error }, "Error running assertions after multiple retries");
                throw error;
            }
        }
    };
    // Re-evaluate a failed verdict against freshly captured evidence.
    let result = await runFullAssertion();
    for (let retry = 0; retry < maxRetries && !result.assertionPassed; retry++) {
        logger_1.logger.debug("Assertion failed, retrying with fresh snapshot and screenshot...");
        onRetry(retry, result);
        result = await runFullAssertion();
    }
    const { assertionPassed, reasoning } = result;
    test?.info().annotations.push({
        type: "AI Summary",
        description: reasoning,
    });
    const expectStatus = assertionPassed ? "✅ passed" : "❌ failed";
    if (!failSilently) {
        expect(assertionPassed, reasoning).toBe(true);
    }
    return `${reasoning}\n\n[Assertion ${expectStatus}]`;
};
290+
exports.assert = assert;

dist/cache.d.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
/**
 * Interface for a hash-based cache store.
 * Implementations must support hash get/set and key expiration.
 * Every operation is asynchronous and returns a promise.
 */
5+
export interface CacheStore {
6+
/**
 * Resolves with the string-to-string field map stored under `key`.
 * NOTE(review): the result for a missing key is not specified here
 * (presumably an empty object) — confirm per implementation.
 */
hgetall(key: string): Promise<Record<string, string>>;
7+
/**
 * Writes the given string fields under `key`.
 * NOTE(review): presumably merges into any existing hash rather than replacing it — confirm.
 */
hset(key: string, values: Record<string, string>): Promise<void>;
8+
/** Sets an expiration on `key`, expressed in seconds. */
expire(key: string, seconds: number): Promise<void>;
9+
}
10+
/**
 * Module-level cache store instance. The declared type is `CacheStore | null`,
 * so callers must handle the `null` case before using it.
 * NOTE(review): presumably `null` means no cache backend is configured — confirm against the implementation.
 */
export declare const cache: CacheStore | null;

0 commit comments

Comments
 (0)