|
1 | 1 | { |
2 | 2 | "mode": "pursue", |
3 | 3 | "goal": "#1 on WebVoyager", |
4 | | - "status": "ready-for-full-run", |
| 4 | + "status": "gen21-shipped", |
5 | 5 | "generation": 27, |
| 6 | + "subGeneration": "21-parallel-tabs", |
6 | 7 | "branch": "main", |
7 | | - "changes": [ |
8 | | - "Cost cap 200k→300k (40 tasks were at old turn/cost limits)", |
9 | | - "DuckDuckGo search fallback (Google/Bing block headless browsers)", |
10 | | - "CAPTCHA checkbox solver (reCAPTCHA bypass, Cambridge Dict flipped)", |
11 | | - "Form reset detection + keyboard auto-retry (Google Flights fix)", |
12 | | - "Block-level snapshot dedup (93% compression on card-heavy pages)", |
13 | | - "Progressive snapshot budget (4k→2.5k after 8+ same-page turns)", |
14 | | - "Vision model cascade to gpt-4.1-mini (cost reduction)", |
15 | | - "Form stall injection with origin+pathname matching", |
16 | | - "Supervisor suggests DDG fallback on form stalls", |
17 | | - "Batch fill 150ms settle delay between fields" |
| 8 | + "npmVersion": "0.23.0", |
| 9 | + "shipped": { |
| 10 | + "gen27": "stealth, anti-bot, form intelligence, CAPTCHA, card dedup", |
| 11 | + "gen21": "parallel tab execution (GoalDecomposer + ParallelRunner + EvidenceMerger)" |
| 12 | + }, |
| 13 | + "heldOutResults": { |
| 14 | + "competitive": "10/10 (100%)", |
| 15 | + "webbench50": "44/50 (88% raw), 95.7% excl DataDome", |
| 16 | + "systemChromeUnblocked": "9/13 previously-blocked sites" |
| 17 | + }, |
| 18 | + "nextActions": [ |
| 19 | + "Gen 29-30 audit (bad-app production readiness)", |
| 20 | + "Full WebVoyager 590 run with Gen 27", |
| 21 | + "Gen 28: multi-model orchestrator (half day)" |
18 | 22 | ], |
19 | | - "validatedFlips": [ |
20 | | - "booking-16: PASS 8t/$0.06 (was 19t/FAIL)", |
21 | | - "booking-20: PASS 25t/$0.76 (was cost_cap)", |
22 | | - "cambridge-dictionary-19: PASS 5t/$0.03 (was reCAPTCHA blocked)", |
23 | | - "3/8 Google Flights via batch fill variance + cost cap" |
24 | | - ], |
25 | | - "remainingBlockers": [ |
26 | | - "Google Flights form reset (keyboard retry shipped but untested at scale)", |
27 | | - "Anti-bot on DDG/Bing/Skyscanner (headless browser detection)", |
28 | | - "Google sorry page CAPTCHA (checkbox solver works but sorry page may not have reCAPTCHA)" |
29 | | - ], |
30 | | - "expectedRange": "93-96% (549-566/590)", |
31 | | - "updatedAt": "2026-04-11T09:25:00Z" |
| 23 | + "updatedAt": "2026-04-11T15:10:00Z" |
32 | 24 | } |
0 commit comments