Skip to content

Commit 72fb797

Browse files
committed
Add Browser Use run compare CLI
1 parent 5f2a2f6 commit 72fb797

8 files changed

Lines changed: 264 additions & 4 deletions

File tree

LAUNCH.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,7 @@ Current latest audit:
120120

121121
| Captured at | Stars | To 1001 | Forks | Watchers | Issues | PRs | Release downloads | Note |
122122
|---|---:|---:|---:|---:|---:|---:|---:|---|
123-
| 2026-05-12T22:41:46+00:00 | 3 | 998 | 21 | 0 | 10 | 0 | 16 | current monitor pass after #370 closed by eac3637 and Browser Use compare-run metadata checklist reached main; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |
123+
| 2026-05-12T22:48:24+00:00 | 3 | 998 | 21 | 0 | 10 | 0 | 16 | current monitor pass after Browser Use compare CLI prototype and docs landed locally; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |
124124

125125
The active objective is incomplete until `stargazerCount > 1000`.
126126

README.md

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,7 @@ Useful local checks:
113113
- `browsertrace list --json` prints recent runs as JSON with id, name, status, and created timestamp.
114114
- `browsertrace list --status failed` filters recent runs by status; combine with JSON, for example `browsertrace list --status completed --json`.
115115
- `browsertrace demo` prints a `Run ID:` line you can copy into `browsertrace show` or `browsertrace export`.
116+
- `browsertrace compare <failed_run_id> <success_run_id>` finds the first step divergence between a failed Browser Use run and a known-good run; add `--json` for automation with `browsertrace compare <failed_run_id> <success_run_id> --json`.
116117

117118
For scripts, CI, or AI/coding-agent troubleshooting, use the JSON CLI checks:
118119

@@ -122,6 +123,10 @@ browsertrace list --status failed --json
122123
browsertrace show <run_id> --json
123124
```
124125

126+
When you have a failed run and a known-good run for the same Browser Use task,
127+
use `browsertrace compare <failed_run_id> <success_run_id> --json` to locate
128+
the first divergent action, URL, status, or error field.
129+
125130
For automation-oriented usage notes, see
126131
[JSON CLI checks for automation](examples/#json-cli-checks-for-automation).
127132

browsertrace/cli.py

Lines changed: 127 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
browsertrace demo # create a deterministic failed demo run
66
browsertrace list # list runs in the terminal
77
browsertrace show <run_id> # print a run's timeline
8+
browsertrace compare <failed_id> <success_id> # find the first step divergence
89
browsertrace doctor # print local install and trace-store status
910
browsertrace export <id> # write a portable HTML bundle to ./<id>.html
1011
browsertrace export <id> --redact # omit model I/O from the HTML export
@@ -251,6 +252,126 @@ def cmd_show(args) -> int:
251252
return 0
252253

253254

255+
def _run_summary(run: sqlite3.Row) -> dict[str, str]:
    """Return the id, name, and status of *run* as a plain dict.

    A missing or empty run name is reported as an empty string. Keys are
    emitted in the order ``id``, ``name``, ``status``.
    """
    run_id = run["id"]
    run_name = run["name"]
    if not run_name:
        run_name = ""
    return dict(id=run_id, name=run_name, status=run["status"])
261+
262+
263+
def _step_for_compare(step: sqlite3.Row | None) -> dict[str, object] | None:
264+
if step is None:
265+
return None
266+
return {
267+
"step_index": step["step_index"],
268+
"action": step["action"] or "",
269+
"url": step["url"] or "",
270+
"status": step["status"] or "ok",
271+
"error": step["error"],
272+
}
273+
274+
275+
def _compare_runs(
    left_run: sqlite3.Row,
    left_steps: list[sqlite3.Row],
    right_run: sqlite3.Row,
    right_steps: list[sqlite3.Row],
) -> dict[str, object]:
    """Build a comparison payload describing where two runs first diverge.

    Steps are paired by their position in *left_steps* / *right_steps* and
    compared on ``action``, ``url``, ``status``, and ``error``.

    Returns a dict with:
      * ``left`` / ``right``: run summaries (id, name, status),
      * ``step_counts``: number of steps on each side,
      * ``first_divergence``: ``None`` when every paired step matches,
        otherwise a dict with ``step_index``, ``fields`` (the differing
        fields, or a ``presence`` entry when one run has no step at that
        position), ``left_step`` and ``right_step``.
    """
    from itertools import zip_longest

    payload: dict[str, object] = {
        "left": _run_summary(left_run),
        "right": _run_summary(right_run),
        "step_counts": {"left": len(left_steps), "right": len(right_steps)},
        "first_divergence": None,
    }

    fields = ("action", "url", "status", "error")
    # zip_longest pads the shorter run with None, which _step_for_compare
    # passes through, so a missing step shows up as a None side.
    for left_row, right_row in zip_longest(left_steps, right_steps):
        left_step = _step_for_compare(left_row)
        right_step = _step_for_compare(right_row)

        if left_step is None or right_step is None:
            # Report the stored step_index of the step that does exist, so
            # this branch agrees with the changed-field branch below.
            present = left_step if left_step is not None else right_step
            payload["first_divergence"] = {
                "step_index": present["step_index"],
                "fields": {
                    "presence": {
                        "left": left_step is not None,
                        "right": right_step is not None,
                    }
                },
                "left_step": left_step,
                "right_step": right_step,
            }
            break

        changed = {
            field: {"left": left_step[field], "right": right_step[field]}
            for field in fields
            if left_step[field] != right_step[field]
        }
        if changed:
            payload["first_divergence"] = {
                "step_index": left_step["step_index"],
                "fields": changed,
                "left_step": left_step,
                "right_step": right_step,
            }
            break

    return payload
322+
323+
324+
def _print_compare_human(payload: dict[str, object]) -> None:
    """Print a comparison payload as a short plain-text report.

    Writes one header line per run, the step counts, and then either
    "No step divergence found." or the first divergent step index followed
    by the left/right value of every differing field.
    """
    left, right = payload["left"], payload["right"]
    assert isinstance(left, dict)
    assert isinstance(right, dict)

    for label, side in (("Left:", left), ("Right:", right)):
        print(f"{label} {side['id']} {_fmt_status(str(side['status']))} {side['name'] or '(unnamed)'}")

    counts = payload["step_counts"]
    assert isinstance(counts, dict)
    print(f"Steps: left={counts['left']} right={counts['right']}")

    first = payload["first_divergence"]
    if first is not None:
        assert isinstance(first, dict)
        print(f"First divergence at step {first['step_index']}")
        differing = first["fields"]
        assert isinstance(differing, dict)
        for name, sides in differing.items():
            assert isinstance(sides, dict)
            print(f"{name}:")
            print(f" left: {sides['left']}")
            print(f" right: {sides['right']}")
    else:
        print("No step divergence found.")
350+
351+
352+
def cmd_compare(args) -> int:
    """Compare two runs and print where their step timelines first diverge.

    Reads ``args.left_run_id``, ``args.right_run_id``, and ``args.json``.
    If either run cannot be resolved by ``_find_run``, returns the code that
    ``_find_run`` supplied; otherwise prints the comparison (JSON when
    ``args.json`` is set, plain text otherwise) and returns 0.
    """
    steps_sql = "SELECT * FROM steps WHERE run_id=? ORDER BY step_index"

    with _open() as conn:
        left_run, rc = _find_run(conn, args.left_run_id)
        if left_run is None:
            return rc
        right_run, rc = _find_run(conn, args.right_run_id)
        if right_run is None:
            return rc
        # Left is queried before right, matching the lookup order above.
        left_steps, right_steps = [
            conn.execute(steps_sql, (found["id"],)).fetchall()
            for found in (left_run, right_run)
        ]

    payload = _compare_runs(left_run, left_steps, right_run, right_steps)
    if not args.json:
        _print_compare_human(payload)
    else:
        print(json.dumps(payload, indent=2))
    return 0
373+
374+
254375
def cmd_export(args) -> int:
255376
"""Write a self-contained HTML bundle for a run (screenshots inline as base64)."""
256377
import base64
@@ -396,6 +517,12 @@ def main(argv: Optional[list[str]] = None) -> int:
396517
p_show.add_argument("--json", action="store_true", help="Print the run timeline as JSON")
397518
p_show.set_defaults(func=cmd_show)
398519

520+
p_compare = sub.add_parser("compare", help="Compare two run timelines")
521+
p_compare.add_argument("left_run_id", help="Full id or unique prefix for the left run")
522+
p_compare.add_argument("right_run_id", help="Full id or unique prefix for the right run")
523+
p_compare.add_argument("--json", action="store_true", help="Print the comparison as JSON")
524+
p_compare.set_defaults(func=cmd_compare)
525+
399526
p_export = sub.add_parser("export", help="Write a self-contained HTML bundle for a run")
400527
p_export.add_argument("run_id")
401528
p_export.add_argument("-o", "--out", help="Output path (default: <run_id>.html)")

docs/browser-use-debugging.html

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -388,7 +388,12 @@ <h2 id="debug-empty-model-responses">Debug empty model responses</h2>
388388

389389
<section>
390390
<h2 id="compare-run-metadata-checklist">Compare failed and successful Browser Use runs</h2>
391-
<p>A future compare view is most useful when the failed run and a known successful run carry enough shared metadata to explain where they diverged. This checklist does not implement run comparison; it only describes what to preserve today so a later comparison can be meaningful.</p>
391+
<p>When the same Browser Use task has one failed run and one known-good run, compare them from the terminal to find the first divergent step before opening the local UI.</p>
392+
<pre><code>browsertrace list
393+
browsertrace compare &lt;failed_run_id&gt; &lt;success_run_id&gt;
394+
browsertrace compare &lt;failed_run_id&gt; &lt;success_run_id&gt; --json</code></pre>
395+
<p>The CLI compares action, URL, status, and error fields and reports the first divergent step. It does not replace the local UI: use the comparison to locate the boundary, then open the failed run timeline to inspect screenshots, model input/output, and surrounding context.</p>
396+
<p>Run comparison is most useful when the failed run and a successful run carry enough shared metadata to explain where they diverged. Preserve these fields when you can:</p>
392397
<ul>
393398
<li>Keep a stable task or run id when Browser Use or your app exposes one.</li>
394399
<li>Record the Browser Use version and BrowserTrace version for each run.</li>

docs/launch/metrics-log.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1102,3 +1102,4 @@ uv run --python 3.11 python scripts/launch_metrics.py --json
11021102
| 2026-05-12T22:21:19+00:00 | 3 | 998 | 21 | 0 | 11 | 0 | 16 | current monitor pass after v0.1.18 demo export assets reached 8 downloads each while package/media assets remain 0; owner-channel publishing remains the blocker; traffic views 354/132 unique, clones 16382/2819 unique |
11031103
| 2026-05-12T22:40:20+00:00 | 3 | 998 | 21 | 0 | 11 | 0 | 16 | current monitor pass after #370 Browser Use compare-run metadata checklist documented; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |
11041104
| 2026-05-12T22:41:46+00:00 | 3 | 998 | 21 | 0 | 10 | 0 | 16 | current monitor pass after #370 closed by eac3637 and Browser Use compare-run metadata checklist reached main; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |
1105+
| 2026-05-12T22:48:24+00:00 | 3 | 998 | 21 | 0 | 10 | 0 | 16 | current monitor pass after Browser Use compare CLI prototype and docs landed locally; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |

examples/README.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@ browsertrace export <run_id> --public -o public.html
7272
```
7373

7474
A healthy `browsertrace --help` output should list the main commands, including
75-
`serve`, `doctor`, `demo`, `list`, `show`, and `export`.
75+
`serve`, `doctor`, `demo`, `list`, `show`, `compare`, and `export`.
7676
`browsertrace export --help` should mention public-safe export options such as
7777
`--public` and an output path like `-o public.html` or `--out public.html`.
7878

@@ -84,6 +84,7 @@ A healthy `browsertrace --help` output should list the main commands, including
8484
| `browsertrace demo` | Create a deterministic failed run |
8585
| `browsertrace list` | Find recent run IDs |
8686
| `browsertrace show <run_id>` | Inspect a run timeline in the terminal |
87+
| `browsertrace compare <failed_run_id> <success_run_id>` | Find the first step divergence between a failed run and a known-good run |
8788
| `browsertrace export <run_id> --public -o public.html` | Create a public-safe HTML export |
8889

8990
## Example Matrix

tests/test_cli.py

Lines changed: 103 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,44 @@ def _seed(tmp_path, name, fail=False):
4040
return run.id
4141

4242

43+
def _seed_compare_runs(tmp_path):
    """Seed two two-step runs that share a first step and differ on the second.

    The first run records two plain steps. The second run records an
    ``error`` step and then raises ``RuntimeError`` inside the run context
    (presumably what marks the run as failed; the exception is swallowed
    here).

    Returns ``(failed_id, success_id)``.
    """
    tracer = Tracer(home=tmp_path)

    with tracer.run("browser-use success") as good_run:
        good_run.step(
            action="navigate",
            url="https://example.com/start",
            model_output={"thought": "open start"},
        )
        good_run.step(
            action="click(selector=#checkout)",
            url="https://example.com/done",
            model_output={"result": "checkout complete"},
        )
        success_id = good_run.id

    try:
        with tracer.run("browser-use failure") as bad_run:
            bad_run.step(
                action="navigate",
                url="https://example.com/start",
                model_output={"thought": "open start"},
            )
            bad_run.step(
                action="click(selector=#cancel)",
                url="https://example.com/cart",
                status="error",
                error="wrong target",
                model_output={"result": "cancelled"},
            )
            # Capture the id before raising; the context exits on the raise.
            failed_id = bad_run.id
            raise RuntimeError("wrong target")
    except RuntimeError:
        pass

    return failed_id, success_id
79+
80+
4381
def test_cli_module_compiles_on_python311():
4482
"""Guard against Python 3.11 f-string syntax regressions.
4583
@@ -149,6 +187,71 @@ def test_cli_show_json_prints_run_and_steps_as_json(cli):
149187
assert payload["steps"][0]["status"] == "ok"
150188

151189

190+
def test_cli_compare_json_reports_first_divergent_step(cli):
    """`compare --json` resolves id prefixes and reports step 1 as the divergence."""
    cli_mod, tmp_path = cli
    failed_id, success_id = _seed_compare_runs(tmp_path)

    captured = StringIO()
    with redirect_stdout(captured):
        exit_code = cli_mod.main(["compare", failed_id[:8], success_id[:8], "--json"])

    payload = json.loads(captured.getvalue())

    assert exit_code == 0

    left, right = payload["left"], payload["right"]
    assert left["id"] == failed_id
    assert left["status"] == "failed"
    assert right["id"] == success_id
    assert right["status"] == "completed"

    divergence = payload["first_divergence"]
    assert divergence["step_index"] == 1
    assert divergence["left_step"]["action"] == "click(selector=#cancel)"
    assert divergence["right_step"]["action"] == "click(selector=#checkout)"

    changed = divergence["fields"]
    assert changed["action"] == {
        "left": "click(selector=#cancel)",
        "right": "click(selector=#checkout)",
    }
    assert changed["url"] == {
        "left": "https://example.com/cart",
        "right": "https://example.com/done",
    }
    assert changed["error"] == {
        "left": "wrong target",
        "right": None,
    }
220+
221+
222+
def test_cli_compare_human_output_mentions_first_divergence(cli):
    """Without `--json`, `compare` prints a plain-text report, not JSON."""
    cli_mod, tmp_path = cli
    failed_id, success_id = _seed_compare_runs(tmp_path)

    captured = StringIO()
    with redirect_stdout(captured):
        exit_code = cli_mod.main(["compare", failed_id[:8], success_id[:8]])

    out = captured.getvalue()

    assert exit_code == 0
    for expected in (
        "First divergence at step 1",
        "action:",
        "click(selector=#cancel)",
        "click(selector=#checkout)",
    ):
        assert expected in out
    assert not out.lstrip().startswith("{")
238+
239+
240+
def test_cli_compare_json_reports_no_divergence_for_same_run(cli):
    """Comparing a run against itself yields no divergence and equal step counts."""
    cli_mod, tmp_path = cli
    run_id = _seed(tmp_path, "same-run")

    captured = StringIO()
    with redirect_stdout(captured):
        exit_code = cli_mod.main(["compare", run_id, run_id, "--json"])

    payload = json.loads(captured.getvalue())

    assert exit_code == 0
    assert payload["first_divergence"] is None
    assert payload["step_counts"] == {"left": 1, "right": 1}
253+
254+
152255
def test_cli_show_unknown_run_id_returns_2(cli):
153256
cli_mod, _ = cli
154257
# Need a real DB file or _open() will exit 1, so seed an empty one.

tests/test_metadata.py

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2033,6 +2033,20 @@ def test_readme_explains_list_status_filter_near_install_checks():
20332033
assert "hosted sharing" not in readme
20342034

20352035

2036+
def test_readme_explains_compare_near_install_checks():
    """The README install section documents `browsertrace compare` and its JSON form."""
    readme_path = Path(__file__).resolve().parents[1] / "README.md"
    readme = readme_path.read_text()

    # The section under test runs from the install heading up to the
    # walkthrough pointer.
    after_heading = readme.split("## Install From PyPI", 1)[1]
    install_section = after_heading.split("For a walkthrough", 1)[0]

    for expected in (
        "`browsertrace compare <failed_run_id> <success_run_id>`",
        "finds the first step divergence",
        "`browsertrace compare <failed_run_id> <success_run_id> --json`",
        "`browsertrace show <run_id> --json`",
    ):
        assert expected in install_section
    assert "hosted sharing" not in readme
2048+
2049+
20362050
def test_readme_includes_json_cli_automation_recipe_near_install_checks():
20372051
project_root = Path(__file__).resolve().parents[1]
20382052
readme = (project_root / "README.md").read_text()
@@ -2510,6 +2524,7 @@ def test_readme_groups_install_tips_as_compact_list():
25102524
"- `browsertrace list --json` prints recent runs as JSON",
25112525
"- `browsertrace list --status failed` filters recent runs by status",
25122526
"- `browsertrace demo` prints a `Run ID:` line",
2527+
"- `browsertrace compare <failed_run_id> <success_run_id>` finds the first step divergence between a failed Browser Use run and a known-good run",
25132528
"- The first-run troubleshooting checklist walks through `browsertrace doctor`, `browsertrace demo`, `browsertrace list`, `browsertrace show`, and public-safe export",
25142529
"- The live static demo and public-safe demo export let you inspect a trace before installing anything",
25152530
"- The command cheat sheet summarizes `browsertrace doctor`, `browsertrace demo`, `browsertrace list`, `browsertrace show`, and public-safe export commands",
@@ -3084,6 +3099,7 @@ def test_examples_readme_includes_command_cheat_sheet():
30843099
assert "`browsertrace demo`" in examples_readme
30853100
assert "`browsertrace list`" in examples_readme
30863101
assert "`browsertrace show <run_id>`" in examples_readme
3102+
assert "`browsertrace compare <failed_run_id> <success_run_id>`" in examples_readme
30873103
assert "`browsertrace export <run_id> --public -o public.html`" in examples_readme
30883104
assert 'pip install "browsertrace[ui]"' in examples_readme
30893105
assert "hosted sharing" not in examples_readme
@@ -3816,6 +3832,8 @@ def test_browser_use_guide_documents_compare_run_metadata_checklist():
38163832
"</section>",
38173833
1,
38183834
)[0]
3835+
assert "browsertrace compare &lt;failed_run_id&gt; &lt;success_run_id&gt;" in section
3836+
assert "first divergent step" in section
38193837

38203838
for expected in [
38213839
"stable task or run id",
@@ -3830,7 +3848,7 @@ def test_browser_use_guide_documents_compare_run_metadata_checklist():
38303848
]:
38313849
assert expected in section
38323850

3833-
assert "does not implement run comparison" in section
3851+
assert "does not replace the local UI" in section
38343852
assert "stars" not in section.lower()
38353853
assert "upvotes" not in section.lower()
38363854
assert "reposts" not in section.lower()

0 commit comments

Comments
 (0)