Skip to content

Commit 99c9753

Browse files
authored
Merge pull request #122 from PatrickSys/fix/contextbench-baseline-reservations
fix(eval): deduplicate blocked ContextBench rows
2 parents 61fbedc + c41e844 commit 99c9753

3 files changed

Lines changed: 112 additions & 4 deletions

File tree

scripts/contextbench-runner.mjs

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,13 +1191,20 @@ function writeBlockedRunRows(sessionRoot, fixtures, reservations) {
11911191
fixtures.laneSetupEvidence.records.map((record) => [record.laneId, record])
11921192
);
11931193
const tasksById = new Map(fixtures.manifest.tasks.map((task) => [task.instance_id, task]));
1194+
const existingPrimaryKeys = new Set(
1195+
readManifestRowsIfPresent(sessionRoot)
1196+
.filter((row) => !row.scoring?.baselineArmId)
1197+
.map((row) => primaryReservationKey(row.lane_id, row.task_id, row.repeat_index))
1198+
);
11941199
for (const reservation of reservations.filter(
11951200
(slot) => slot.status === 'terminal_missing_evidence'
11961201
)) {
11971202
const laneCard = cardsByLane.get(reservation.laneId);
11981203
const task = tasksById.get(reservation.taskId);
11991204
const evidence = evidenceByLane.get(reservation.laneId);
12001205
if (!laneCard || !task || !evidence) continue;
1206+
const key = primaryReservationKey(laneCard.laneId, task.instance_id, reservation.repeatIndex);
1207+
if (existingPrimaryKeys.has(key)) continue;
12011208
const runId = sanitize(
12021209
`${laneCard.laneId}-${task.instance_id}-${reservation.repeatIndex}-missing-evidence`
12031210
);
@@ -1260,6 +1267,7 @@ function writeBlockedRunRows(sessionRoot, fixtures, reservations) {
12601267
)
12611268
})
12621269
);
1270+
existingPrimaryKeys.add(key);
12631271
}
12641272
}
12651273

@@ -1887,6 +1895,10 @@ function runKey(laneId, taskId, repeatIndex, prefix = '') {
18871895
return `${prefix}${laneId}:${taskId}:${repeatIndex}`;
18881896
}
18891897

1898+
/**
 * Builds the string key `laneId::taskId::repeatIndex` for one reservation slot.
 *
 * Note the double-colon separator: runKey formats the same three values with a
 * single colon (plus an optional prefix), so the two key formats are not
 * interchangeable.
 */
function primaryReservationKey(laneId, taskId, repeatIndex) {
1899+
return `${laneId}::${taskId}::${repeatIndex}`;
1900+
}
1901+
18901902
function existingRunKeys(sessionRoot) {
18911903
return new Set(
18921904
readManifestRowsIfPresent(sessionRoot).map((row) =>
@@ -3313,15 +3325,38 @@ function validateBaselineSession(args) {
33133325
errors.push(`expected ${expectedSlots} reserved slots, found ${reservations.length}`);
33143326
const rows = readManifestRowsIfPresent(sessionRoot);
33153327
validateSessionPaths(sessionRoot, rows, errors);
3328+
const primaryRowCounts = new Map();
3329+
for (const row of rows.filter((entry) => !entry.scoring?.baselineArmId)) {
3330+
const key = primaryReservationKey(row.lane_id, row.task_id, row.repeat_index);
3331+
primaryRowCounts.set(key, (primaryRowCounts.get(key) ?? 0) + 1);
3332+
}
3333+
for (const [key, count] of primaryRowCounts) {
3334+
if (count > 1) errors.push(`duplicate primary baseline row for reservation ${key}`);
3335+
}
33163336
const blockedReservations = reservations.filter(
33173337
(slot) => slot.status === 'terminal_missing_evidence'
33183338
);
3339+
const blockedReservationKeys = new Set(
3340+
blockedReservations.map((slot) => primaryReservationKey(slot.laneId, slot.taskId, slot.repeatIndex))
3341+
);
3342+
const blockedRowKeys = new Set();
3343+
const extraBlockedRowKeys = [];
33193344
const blockedRows = rows.filter(
33203345
(row) =>
3321-
row.status === 'setup_failed' && ['grepai', 'codebase-memory-mcp'].includes(row.lane_id)
3346+
!row.scoring?.baselineArmId &&
3347+
row.status === 'setup_failed' &&
3348+
String(row.scoring?.fallbackReason ?? '').startsWith('terminal_missing_evidence:')
33223349
);
3323-
if (blockedRows.length !== blockedReservations.length) {
3324-
errors.push('terminal missing-evidence rows must be present for every blocked reservation');
3350+
for (const row of blockedRows) {
3351+
const key = primaryReservationKey(row.lane_id, row.task_id, row.repeat_index);
3352+
blockedRowKeys.add(key);
3353+
if (!blockedReservationKeys.has(key)) extraBlockedRowKeys.push(key);
3354+
}
3355+
const missingBlockedRowKeys = [...blockedReservationKeys].filter((key) => !blockedRowKeys.has(key));
3356+
if (missingBlockedRowKeys.length > 0 || extraBlockedRowKeys.length > 0) {
3357+
errors.push(
3358+
`terminal missing-evidence rows must match blocked reservations exactly; missing=${missingBlockedRowKeys.length}, extra=${extraBlockedRowKeys.length}`
3359+
);
33253360
}
33263361
if (errors.length > 0)
33273362
throw new Error(`baseline session validation failed:\n- ${errors.join('\n- ')}`);

tests/contextbench-baseline-runner.test.ts

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { execFileSync, spawnSync } from 'node:child_process';
2-
import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
2+
import { appendFileSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
33
import { tmpdir } from 'node:os';
44
import path from 'node:path';
55
import { describe, expect, it, vi } from 'vitest';
@@ -173,6 +173,31 @@ describe('ContextBench Phase 40 baseline runner', () => {
173173
}
174174
});
175175

176+
// Creates a baseline snapshot, appends a copy of the manifest's first row, and
// expects `--baseline-validate` to exit non-zero with the duplicate-row error.
it('rejects duplicate primary baseline rows during validation', () => {
177+
const sessionRoot = tempSessionRoot('phase41');
178+
try {
179+
execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
180+
encoding: 'utf8'
181+
});
182+
// NOTE(review): assumes the first manifest row is a primary row (no
// scoring.baselineArmId); the validator only counts those — confirm.
const firstRow = readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8').trim().split('\n')[0];
183+
appendFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), `${firstRow}\n`, 'utf8');
184+
185+
// spawnSync (not execFileSync) so a non-zero exit is returned as a result
// object instead of being thrown.
const result = spawnSync(
186+
'node',
187+
['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
188+
{ encoding: 'utf8' }
189+
);
190+
191+
expect(result.status).not.toBe(0);
192+
expect(result.stderr).toContain('duplicate primary baseline row for reservation');
193+
} finally {
194+
// Removes the directory four levels above sessionRoot — presumably the temp
// root created by tempSessionRoot; verify against that helper.
rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
195+
recursive: true,
196+
force: true
197+
});
198+
}
199+
});
200+
176201
it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => {
177202
const sessionRoot = tempSessionRoot();
178203
const taskId = manifest.tasks[0].instance_id;

tests/contextbench-baseline-snapshot.test.ts

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,13 @@ function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
4343
);
4444
}
4545

46+
/**
 * Reads `run-manifest.jsonl` under `sessionRoot` and parses each line as JSON.
 *
 * The return type comes from an unchecked `as` assertion; rows are not
 * validated. An empty manifest yields a single empty line after `trim()` and
 * `split('\n')`, so `JSON.parse` throws in that case.
 */
function readRows(sessionRoot: string): Array<{ status: string; scoring?: { fallbackReason?: string } }> {
47+
return readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8')
48+
.trim()
49+
.split('\n')
50+
.map((line) => JSON.parse(line) as { status: string; scoring?: { fallbackReason?: string } });
51+
}
52+
4653
describe('ContextBench Phase 40 dirty-worktree snapshot', () => {
4754
it('captures the current checkout before baseline runs with hashes and validation metadata', () => {
4855
const sessionRoot = tempSessionRoot();
@@ -121,6 +128,47 @@ describe('ContextBench Phase 40 dirty-worktree snapshot', () => {
121128
}
122129
});
123130

131+
// Runs `--baseline-snapshot` twice against the same session root and checks
// that the count of blocked missing-evidence rows is unchanged by the second
// run, then that `--baseline-validate` reports success.
it('does not duplicate blocked missing-evidence rows when snapshot is rerun', () => {
132+
const sessionRoot = tempSessionRoot('phase41');
133+
try {
134+
execFileSync(
135+
'node',
136+
['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
137+
{ encoding: 'utf8' }
138+
);
139+
// Blocked rows: status 'setup_failed' with a fallbackReason that starts with
// 'terminal_missing_evidence:'.
const firstBlockedRows = readRows(sessionRoot).filter(
140+
(row) =>
141+
row.status === 'setup_failed' &&
142+
row.scoring?.fallbackReason?.startsWith('terminal_missing_evidence:')
143+
);
144+
145+
// Second snapshot run against the same session root.
execFileSync(
146+
'node',
147+
['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
148+
{ encoding: 'utf8' }
149+
);
150+
const secondBlockedRows = readRows(sessionRoot).filter(
151+
(row) =>
152+
row.status === 'setup_failed' &&
153+
row.scoring?.fallbackReason?.startsWith('terminal_missing_evidence:')
154+
);
155+
// execFileSync throws on a non-zero exit, so validation must succeed for the
// test to reach the assertions below.
const validateOutput = execFileSync(
156+
'node',
157+
['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
158+
{ encoding: 'utf8' }
159+
);
160+
161+
// NOTE(review): 20 * 2 * 3 is presumably tasks x blocked lanes x repeats; the
// factors are not named in this view — confirm against the fixtures.
expect(firstBlockedRows).toHaveLength(20 * 2 * 3);
162+
expect(secondBlockedRows).toHaveLength(firstBlockedRows.length);
163+
expect(validateOutput).toContain('baseline session validation passed');
164+
} finally {
165+
// Removes the directory four levels above sessionRoot — presumably the temp
// root created by tempSessionRoot; verify against that helper.
rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
166+
recursive: true,
167+
force: true
168+
});
169+
}
170+
});
171+
124172
it('refuses raw baseline artifacts outside the ignored benchmark-runs root', () => {
125173
const outDir = mkdtempSync(path.join(tmpdir(), 'contextbench-invalid-out-'));
126174
try {

0 commit comments

Comments
 (0)