|
6 | 6 |
|
7 | 7 | import { describe, expect } from 'vitest'; |
8 | 8 | import { evalTest } from './test-helper.js'; |
9 | | -import { validateModelOutput } from '../integration-tests/test-helper.js'; |
| 9 | +import { |
| 10 | + assertModelHasOutput, |
| 11 | + checkModelOutputContent, |
| 12 | +} from '../integration-tests/test-helper.js'; |
10 | 13 |
|
11 | 14 | describe('save_memory', () => { |
| 15 | + const TEST_PREFIX = 'Save memory test: '; |
| 16 | + const rememberingFavoriteColor = "Agent remembers user's favorite color"; |
12 | 17 | evalTest('ALWAYS_PASSES', { |
13 | | - name: 'should be able to save to memory', |
| 18 | + name: rememberingFavoriteColor, |
14 | 19 | params: { |
15 | 20 | settings: { tools: { core: ['save_memory'] } }, |
16 | 21 | }, |
17 | 22 | prompt: `remember that my favorite color is blue. |
18 | 23 | |
19 | 24 | what is my favorite color? tell me that and surround it with $ symbol`, |
20 | 25 | assert: async (rig, result) => { |
21 | | - const foundToolCall = await rig.waitForToolCall('save_memory'); |
| 26 | + const wasToolCalled = await rig.waitForToolCall('save_memory'); |
| 27 | + expect(wasToolCalled, 'Expected save_memory tool to be called').toBe( |
| 28 | + true, |
| 29 | + ); |
| 30 | + |
| 31 | + assertModelHasOutput(result); |
| 32 | + checkModelOutputContent(result, { |
| 33 | + expectedContent: 'blue', |
| 34 | + testName: `${TEST_PREFIX}${rememberingFavoriteColor}`, |
| 35 | + }); |
| 36 | + }, |
| 37 | + }); |
| 38 | + const rememberingCommandRestrictions = 'Agent remembers command restrictions'; |
| 39 | + evalTest('ALWAYS_PASSES', { |
| 40 | + name: rememberingCommandRestrictions, |
| 41 | + params: { |
| 42 | + settings: { tools: { core: ['save_memory'] } }, |
| 43 | + }, |
| 44 | + prompt: `I don't want you to ever run npm commands.`, |
| 45 | + assert: async (rig, result) => { |
| 46 | + const wasToolCalled = await rig.waitForToolCall('save_memory'); |
| 47 | + expect(wasToolCalled, 'Expected save_memory tool to be called').toBe( |
| 48 | + true, |
| 49 | + ); |
| 50 | + |
| 51 | + assertModelHasOutput(result); |
| 52 | + checkModelOutputContent(result, { |
| 53 | + expectedContent: [/not run npm commands|remember|ok/i], |
| 54 | + testName: `${TEST_PREFIX}${rememberingCommandRestrictions}`, |
| 55 | + }); |
| 56 | + }, |
| 57 | + }); |
| 58 | + |
| 59 | + const rememberingWorkflow = 'Agent remembers workflow preferences'; |
| 60 | + evalTest('ALWAYS_PASSES', { |
| 61 | + name: rememberingWorkflow, |
| 62 | + params: { |
| 63 | + settings: { tools: { core: ['save_memory'] } }, |
| 64 | + }, |
| 65 | + prompt: `I want you to always lint after building.`, |
| 66 | + assert: async (rig, result) => { |
| 67 | + const wasToolCalled = await rig.waitForToolCall('save_memory'); |
| 68 | + expect(wasToolCalled, 'Expected save_memory tool to be called').toBe( |
| 69 | + true, |
| 70 | + ); |
| 71 | + |
| 72 | + assertModelHasOutput(result); |
| 73 | + checkModelOutputContent(result, { |
| 74 | + expectedContent: [/always|ok|remember|will do/i], |
| 75 | + testName: `${TEST_PREFIX}${rememberingWorkflow}`, |
| 76 | + }); |
| 77 | + }, |
| 78 | + }); |
| 79 | + |
| 80 | + const ignoringTemporaryInformation = |
| 81 | + 'Agent ignores temporary conversation details'; |
| 82 | + evalTest('ALWAYS_PASSES', { |
| 83 | + name: ignoringTemporaryInformation, |
| 84 | + params: { |
| 85 | + settings: { tools: { core: ['save_memory'] } }, |
| 86 | + }, |
| 87 | + prompt: `I'm going to get a coffee.`, |
| 88 | + assert: async (rig, result) => { |
| 89 | + await rig.waitForTelemetryReady(); |
| 90 | + const wasToolCalled = rig |
| 91 | + .readToolLogs() |
| 92 | + .some((log) => log.toolRequest.name === 'save_memory'); |
22 | 93 | expect( |
23 | | - foundToolCall, |
24 | | - 'Expected to find a save_memory tool call', |
25 | | - ).toBeTruthy(); |
| 94 | + wasToolCalled, |
| 95 | + 'save_memory should not be called for temporary information', |
| 96 | + ).toBe(false); |
| 97 | + |
| 98 | + assertModelHasOutput(result); |
| 99 | + checkModelOutputContent(result, { |
| 100 | + testName: `${TEST_PREFIX}${ignoringTemporaryInformation}`, |
| 101 | + forbiddenContent: [/remember|will do/i], |
| 102 | + }); |
| 103 | + }, |
| 104 | + }); |
| 105 | + |
| 106 | + const rememberingPetName = "Agent remembers user's pet's name"; |
| 107 | + evalTest('ALWAYS_PASSES', { |
| 108 | + name: rememberingPetName, |
| 109 | + params: { |
| 110 | + settings: { tools: { core: ['save_memory'] } }, |
| 111 | + }, |
| 112 | + prompt: `My dog's name is Buddy. What is my dog's name?`, |
| 113 | + assert: async (rig, result) => { |
| 114 | + const wasToolCalled = await rig.waitForToolCall('save_memory'); |
| 115 | + expect(wasToolCalled, 'Expected save_memory tool to be called').toBe( |
| 116 | + true, |
| 117 | + ); |
| 118 | + |
| 119 | + assertModelHasOutput(result); |
| 120 | + checkModelOutputContent(result, { |
| 121 | + expectedContent: [/Buddy/i], |
| 122 | + testName: `${TEST_PREFIX}${rememberingPetName}`, |
| 123 | + }); |
| 124 | + }, |
| 125 | + }); |
| 126 | + |
| 127 | + const rememberingCommandAlias = 'Agent remembers custom command aliases'; |
| 128 | + evalTest('ALWAYS_PASSES', { |
| 129 | + name: rememberingCommandAlias, |
| 130 | + params: { |
| 131 | + settings: { tools: { core: ['save_memory'] } }, |
| 132 | + }, |
| 133 | + prompt: `When I say 'start server', you should run 'npm run dev'.`, |
| 134 | + assert: async (rig, result) => { |
| 135 | + const wasToolCalled = await rig.waitForToolCall('save_memory'); |
| 136 | + expect(wasToolCalled, 'Expected save_memory tool to be called').toBe( |
| 137 | + true, |
| 138 | + ); |
| 139 | + |
| 140 | + assertModelHasOutput(result); |
| 141 | + checkModelOutputContent(result, { |
| 142 | + expectedContent: [/npm run dev|start server|ok|remember|will do/i], |
| 143 | + testName: `${TEST_PREFIX}${rememberingCommandAlias}`, |
| 144 | + }); |
| 145 | + }, |
| 146 | + }); |
| 147 | + |
| 148 | + const rememberingDbSchemaLocation = |
| 149 | + "Agent remembers project's database schema location"; |
| 150 | + evalTest('ALWAYS_PASSES', { |
| 151 | + name: rememberingDbSchemaLocation, |
| 152 | + params: { |
| 153 | + settings: { tools: { core: ['save_memory'] } }, |
| 154 | + }, |
| 155 | + prompt: `The database schema for this project is located in \`db/schema.sql\`.`, |
| 156 | + assert: async (rig, result) => { |
| 157 | + const wasToolCalled = await rig.waitForToolCall('save_memory'); |
| 158 | + expect(wasToolCalled, 'Expected save_memory tool to be called').toBe( |
| 159 | + true, |
| 160 | + ); |
| 161 | + |
| 162 | + assertModelHasOutput(result); |
| 163 | + checkModelOutputContent(result, { |
| 164 | + expectedContent: [/database schema|ok|remember|will do/i], |
| 165 | + testName: `${TEST_PREFIX}${rememberingDbSchemaLocation}`, |
| 166 | + }); |
| 167 | + }, |
| 168 | + }); |
| 169 | + |
| 170 | + const rememberingCodingStyle = |
| 171 | + "Agent remembers user's coding style preference"; |
| 172 | + evalTest('ALWAYS_PASSES', { |
| 173 | + name: rememberingCodingStyle, |
| 174 | + params: { |
| 175 | + settings: { tools: { core: ['save_memory'] } }, |
| 176 | + }, |
| 177 | + prompt: `I prefer to use tabs instead of spaces for indentation.`, |
| 178 | + assert: async (rig, result) => { |
| 179 | + const wasToolCalled = await rig.waitForToolCall('save_memory'); |
| 180 | + expect(wasToolCalled, 'Expected save_memory tool to be called').toBe( |
| 181 | + true, |
| 182 | + ); |
| 183 | + |
| 184 | + assertModelHasOutput(result); |
| 185 | + checkModelOutputContent(result, { |
| 186 | + expectedContent: [/tabs instead of spaces|ok|remember|will do/i], |
| 187 | + testName: `${TEST_PREFIX}${rememberingCodingStyle}`, |
| 188 | + }); |
| 189 | + }, |
| 190 | + }); |
| 191 | + |
| 192 | + const rememberingTestCommand = |
| 193 | + 'Agent remembers specific project test command'; |
| 194 | + evalTest('ALWAYS_PASSES', { |
| 195 | + name: rememberingTestCommand, |
| 196 | + params: { |
| 197 | + settings: { tools: { core: ['save_memory'] } }, |
| 198 | + }, |
| 199 | + prompt: `The command to run all backend tests is \`npm run test:backend\`.`, |
| 200 | + assert: async (rig, result) => { |
| 201 | + const wasToolCalled = await rig.waitForToolCall('save_memory'); |
| 202 | + expect(wasToolCalled, 'Expected save_memory tool to be called').toBe( |
| 203 | + true, |
| 204 | + ); |
| 205 | + |
| 206 | + assertModelHasOutput(result); |
| 207 | + checkModelOutputContent(result, { |
| 208 | + expectedContent: [ |
| 209 | + /command to run all backend tests|ok|remember|will do/i, |
| 210 | + ], |
| 211 | + testName: `${TEST_PREFIX}${rememberingTestCommand}`, |
| 212 | + }); |
| 213 | + }, |
| 214 | + }); |
| 215 | + |
| 216 | + const rememberingMainEntryPoint = |
| 217 | + "Agent remembers project's main entry point"; |
| 218 | + evalTest('ALWAYS_PASSES', { |
| 219 | + name: rememberingMainEntryPoint, |
| 220 | + params: { |
| 221 | + settings: { tools: { core: ['save_memory'] } }, |
| 222 | + }, |
| 223 | + prompt: `The main entry point for this project is \`src/index.js\`.`, |
| 224 | + assert: async (rig, result) => { |
| 225 | + const wasToolCalled = await rig.waitForToolCall('save_memory'); |
| 226 | + expect(wasToolCalled, 'Expected save_memory tool to be called').toBe( |
| 227 | + true, |
| 228 | + ); |
26 | 229 |
|
27 | | - validateModelOutput(result, 'blue', 'Save memory test'); |
| 230 | + assertModelHasOutput(result); |
| 231 | + checkModelOutputContent(result, { |
| 232 | + expectedContent: [ |
| 233 | + /main entry point for this project|ok|remember|will do/i, |
| 234 | + ], |
| 235 | + testName: `${TEST_PREFIX}${rememberingMainEntryPoint}`, |
| 236 | + }); |
28 | 237 | }, |
29 | 238 | }); |
30 | 239 | }); |
0 commit comments