diff --git a/evals-cli/.gitignore b/evals-cli/.gitignore index 3bdd52e..b3768b2 100644 --- a/evals-cli/.gitignore +++ b/evals-cli/.gitignore @@ -1,3 +1,5 @@ node_modules/ dist/ .DS_Store +.env +report.html diff --git a/evals-cli/README.md b/evals-cli/README.md index 26ca875..3fe4406 100644 --- a/evals-cli/README.md +++ b/evals-cli/README.md @@ -62,6 +62,38 @@ node dist/bin/runevals.js --model=gemini-2.5-flash --tools=examples/travel/tools ```bash node dist/bin/runevals.js --model=qwen3:8b --backend=ollama --tools=examples/travel/tools_schema.json --evals=examples/travel/evals.json ``` + +## Argument Constraints + +You can use constraint operators to match argument values flexibly. A constraint object is identified when **all** its keys start with `$`. + +### Supported Operators + +| Operator | Description | Example | +|---|---|---| +| **`$pattern`** | Regex match | `{"$pattern": "^2026-\\d{2}$"}` | +| **`$contains`** | Substring match | `{"$contains": "York"}` | +| **`$gt`**, **`$gte`** | Greater than (or equal) | `{"$gte": 1}` | +| **`$lt`**, **`$lte`** | Less than (or equal) | `{"$lt": 100}` | +| **`$type`** | Type check | `{"$type": "string"}` | +| **`$any`** | Presence check | `{"$any": true}` | + +### Example + +```json +{ + "expectedCall": { + "functionName": "searchFlights", + "arguments": { + "destination": "NYC", + "outboundDate": { "$pattern": "^2026-01-\\d{2}$" }, + "passengers": { "$gte": 1 }, + "preferences": { "$any": true } + } + } +} +``` + ## License Apache-2.0 diff --git a/evals-cli/examples/events/evals.json b/evals-cli/examples/events/evals.json index 21bc38a..2d98ab6 100644 --- a/evals-cli/examples/events/evals.json +++ b/evals-cli/examples/events/evals.json @@ -58,8 +58,12 @@ "arguments": { "date": "2026-01-20", "time": "19:00", - "location": "Ginno's Pizza", - "description": "Restaurant reservation for 3 at Ginno's pizza" + "location": { + "$contains": "Ginno's" + }, + "description": { + "$contains": "Ginno's" + } } } } diff --git 
/**
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

/** Numeric comparison operators supported inside a constraint object. */
type ComparisonOperator = "$gt" | "$gte" | "$lt" | "$lte";

/**
 * Checks whether `actual` matches `expected`, supporting both exact matching
 * and constraint operators.
 *
 * If `expected` is a constraint object (non-empty, every key starts with `$`)
 * the constraints are evaluated against `actual`. Otherwise a recursive deep
 * equality check is performed, and every nested value is matched through this
 * same function so constraints may appear at any depth.
 *
 * @param expected The expected value or constraint object.
 * @param actual The actual value to check.
 * @returns True if `actual` equals `expected` or satisfies its constraints.
 */
export function matchesArgument(expected: unknown, actual: unknown): boolean {
  return isConstraintObject(expected)
    ? matchesConstraint(expected, actual)
    : matchesRecursive(expected, actual);
}

/**
 * Evaluates every operator of a constraint object against `actual`.
 *
 * Supported operators:
 * - `$pattern`: regex match; operand and actual must both be strings
 * - `$contains`: substring match; operand and actual must both be strings
 * - `$gt`, `$gte`, `$lt`, `$lte`: numeric comparison; operand and actual must
 *   both be numbers
 * - `$type`: type assertion ("array", "null", "object", or any `typeof` name)
 * - `$any`: always satisfied; the operand value is ignored
 *
 * @param constraint The constraint object (e.g. `{ "$gt": 10 }`).
 * @param actual The value to test.
 * @returns True only if every operator in the object is satisfied.
 */
function matchesConstraint(
  constraint: Record<string, unknown>,
  actual: unknown,
): boolean {
  return Object.entries(constraint).every(([operator, operand]) =>
    satisfiesOperator(operator, operand, actual),
  );
}

/** Evaluates a single `$operator: operand` pair against `actual`. */
function satisfiesOperator(
  operator: string,
  operand: unknown,
  actual: unknown,
): boolean {
  switch (operator) {
    case "$pattern":
      return (
        typeof actual === "string" &&
        typeof operand === "string" &&
        testPattern(operand, actual)
      );
    case "$contains":
      // A non-string operand would be coerced (or throw for a RegExp), so it
      // is rejected rather than matched by accident.
      return (
        typeof actual === "string" &&
        typeof operand === "string" &&
        actual.includes(operand)
      );
    case "$gt":
    case "$gte":
    case "$lt":
    case "$lte":
      return compareNumbers(operator, actual, operand);
    case "$type":
      return matchesType(operand, actual);
    case "$any":
      return true;
    default:
      // An unknown operator (typically a typo such as "$containz") must not
      // silently turn the expectation into "matches anything".
      return false;
  }
}

/** Tests `text` against the regex `source`; an invalid regex never matches. */
function testPattern(source: string, text: string): boolean {
  try {
    return new RegExp(source).test(text);
  } catch {
    // `new RegExp` throws a SyntaxError for a malformed pattern; treat it as a
    // failed match instead of aborting the caller.
    return false;
  }
}

/** Applies a numeric comparison; both sides must be numbers (no coercion). */
function compareNumbers(
  operator: ComparisonOperator,
  actual: unknown,
  bound: unknown,
): boolean {
  if (typeof actual !== "number" || typeof bound !== "number") {
    return false;
  }
  switch (operator) {
    case "$gt":
      return actual > bound;
    case "$gte":
      return actual >= bound;
    case "$lt":
      return actual < bound;
    case "$lte":
      return actual <= bound;
  }
}

/** Implements `$type`: "array", "null" and "object" are special-cased, any other name is compared with `typeof`. */
function matchesType(type: unknown, actual: unknown): boolean {
  switch (type) {
    case "array":
      return Array.isArray(actual);
    case "null":
      return actual === null;
    case "object":
      // `typeof` reports "object" for null and arrays too; exclude both.
      return (
        typeof actual === "object" && actual !== null && !Array.isArray(actual)
      );
    default:
      return typeof actual === type;
  }
}

/**
 * Determines whether a value is a constraint object: a non-null object with
 * at least one key where ALL keys start with `$`.
 *
 * @param obj The value to check.
 * @returns True if `obj` is strictly a constraint object.
 */
function isConstraintObject(obj: unknown): obj is Record<string, unknown> {
  if (typeof obj !== "object" || obj === null) {
    return false;
  }
  const keys = Object.keys(obj);
  return keys.length > 0 && keys.every((key) => key.startsWith("$"));
}

/** Narrows a value to a non-null object (plain object or array). */
function isObjectLike(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/**
 * Recursively checks equality between two values.
 *
 * Primitives are compared with `===`. Objects and arrays must have the same
 * kind (array vs. non-array) and the same set of own enumerable keys; each
 * child is compared through `matchesArgument`, which enables nested
 * constraints.
 *
 * @param expected The expected structure.
 * @param actual The actual structure.
 * @returns True if the structures match recursively.
 */
function matchesRecursive(expected: unknown, actual: unknown): boolean {
  if (expected === actual) {
    return true;
  }
  if (!isObjectLike(expected) || !isObjectLike(actual)) {
    return false;
  }
  // Without this check `[]` would equal `{}` and `[1]` would equal `{"0": 1}`,
  // because both sides expose the same keys.
  if (Array.isArray(expected) !== Array.isArray(actual)) {
    return false;
  }

  const expectedKeys = Object.keys(expected);
  // A Set keeps the per-key membership test constant-time.
  const actualKeys = new Set(Object.keys(actual));
  if (expectedKeys.length !== actualKeys.size) {
    return false;
  }

  return expectedKeys.every(
    (key) =>
      actualKeys.has(key) && matchesArgument(expected[key], actual[key]),
  );
}
/** One check: matching `actual` against `expected` must yield `outcome`. */
type MatchCase = readonly [expected: unknown, actual: unknown, outcome: boolean];

/** Runs every case through `matchesArgument` and asserts the stated outcome. */
function assertCases(cases: readonly MatchCase[]): void {
  for (const [expected, actual, outcome] of cases) {
    assert.strictEqual(matchesArgument(expected, actual), outcome);
  }
}

describe("matcher", () => {
  describe("exact matching", () => {
    it("matches primitive values", () => {
      assertCases([
        [1, 1, true],
        ["hello", "hello", true],
        [true, true, true],
        [null, null, true],
        [1, 2, false],
        ["hello", "world", false],
        [true, false, false],
        [null, undefined, false],
      ]);
    });

    it("matches objects deeply", () => {
      assertCases([
        [{ a: 1 }, { a: 1 }, true],
        [{ a: { b: 2 } }, { a: { b: 2 } }, true],
        [{ a: 1 }, { a: 2 }, false],
        [{ a: 1 }, { b: 1 }, false],
        [{ a: { b: 2 } }, { a: { b: 3 } }, false],
      ]);
    });

    it("matches arrays deeply", () => {
      assertCases([
        [[1, 2], [1, 2], true],
        [[1, [2]], [1, [2]], true],
        [[1, 2], [1, 3], false],
        [[1, 2], [1, 2, 3], false],
      ]);
    });
  });

  describe("constraints", () => {
    describe("$pattern", () => {
      it("matches strings against regex", () => {
        assertCases([
          [{ $pattern: "^2026-\\d{2}$" }, "2026-01", true],
          [{ $pattern: "foo" }, "foobar", true],
          [{ $pattern: "^2026-\\d{2}$" }, "2025-01", false],
          [{ $pattern: "^foo$" }, "foobar", false],
        ]);
      });

      it("fails if actual is not a string", () => {
        assertCases([
          [{ $pattern: ".*" }, 123, false],
          [{ $pattern: ".*" }, null, false],
        ]);
      });
    });

    describe("$contains", () => {
      it("matches strings containing substring", () => {
        assertCases([
          [{ $contains: "bar" }, "foobar", true],
          [{ $contains: "foo" }, "foo", true],
          [{ $contains: "baz" }, "foobar", false],
        ]);
      });

      it("fails if actual is not a string", () => {
        assertCases([
          [{ $contains: "foo" }, 123, false],
          [{ $contains: "foo" }, null, false],
        ]);
      });
    });

    describe("numeric comparisons", () => {
      it("$gt", () => {
        assertCases([
          [{ $gt: 10 }, 11, true],
          [{ $gt: 10 }, 10, false],
        ]);
      });
      it("$gte", () => {
        assertCases([
          [{ $gte: 10 }, 10, true],
          [{ $gte: 10 }, 9, false],
        ]);
      });
      it("$lt", () => {
        assertCases([
          [{ $lt: 10 }, 9, true],
          [{ $lt: 10 }, 10, false],
        ]);
      });
      it("$lte", () => {
        assertCases([
          [{ $lte: 10 }, 10, true],
          [{ $lte: 10 }, 11, false],
        ]);
      });
      it("fails if actual is not a number", () => {
        assertCases([
          [{ $gt: 10 }, "11", false],
          [{ $gt: 10 }, null, false],
        ]);
      });
    });

    describe("$type", () => {
      it("matches specific types", () => {
        assertCases([
          [{ $type: "string" }, "foo", true],
          [{ $type: "number" }, 123, true],
          [{ $type: "boolean" }, true, true],
          [{ $type: "object" }, {}, true],
          [{ $type: "array" }, [], true],
          [{ $type: "null" }, null, true],
          [{ $type: "string" }, 123, false],
          [{ $type: "number" }, "123", false],
          [{ $type: "array" }, {}, false],
          [{ $type: "object" }, [], false],
        ]);
      });
    });

    describe("$any", () => {
      it("matches anything", () => {
        assertCases([
          [{ $any: true }, "foo", true],
          [{ $any: true }, null, true],
          // undefined usually shouldn't appear in JSON arguments, but it is
          // worth pinning down.
          [{ $any: true }, undefined, true],
        ]);
      });
    });

    describe("recursive constraints", () => {
      it("matches nested constraints", () => {
        const schema = {
          a: { $gt: 10 },
          b: { c: { $contains: "hello" } },
        };
        assertCases([
          [schema, { a: 11, b: { c: "hello world" } }, true],
          [schema, { a: 10, b: { c: "hello world" } }, false],
          [schema, { a: 11, b: { c: "bye world" } }, false],
        ]);
      });

      it("matches array elements with constraints", () => {
        const schema = {
          list: [{ $gt: 10 }, { $type: "string" }],
        };
        assertCases([
          [schema, { list: [11, "foo"] }, true],
          [schema, { list: [10, "foo"] }, false],
          [schema, { list: [11, 123] }, false],
        ]);
      });
    });
  });
});
/**
 * Grades a model's tool call against the expected call.
 *
 * The outcome is "pass" when both calls are null, or when the function names
 * are identical and the actual arguments satisfy the expected arguments
 * according to `matchesArgument`. Every other combination is "fail".
 *
 * @param expected The call the eval expects, or null.
 * @param actual The call that was produced, or null.
 * @returns "pass" or "fail".
 */
export function functionCallOutcome(
  expected: FunctionCall | null,
  actual: ToolCall | null,
): "pass" | "fail" {
  const neitherCallPresent = expected === null && actual === null;
  if (neitherCallPresent) {
    return "pass";
  }

  // Arguments are only compared once the function names agree.
  const sameFunction = expected?.functionName === actual?.functionName;
  return sameFunction && matchesArgument(expected?.arguments, actual?.args)
    ? "pass"
    : "fail";
}