Add Browser Use run compare CLI

aaronlab · aaronlab · commit 72fb797120f2 · 2026-05-13T06:49:06.000+08:00
diff --git a/LAUNCH.md b/LAUNCH.md
@@ -120,7 +120,7 @@ Current latest audit:
 
 | Captured at | Stars | To 1001 | Forks | Watchers | Issues | PRs | Release downloads | Note |
 |---|---:|---:|---:|---:|---:|---:|---:|---|
-| 2026-05-12T22:41:46+00:00 | 3 | 998 | 21 | 0 | 10 | 0 | 16 | current monitor pass after #370 closed by eac3637 and Browser Use compare-run metadata checklist reached main; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |
+| 2026-05-12T22:48:24+00:00 | 3 | 998 | 21 | 0 | 10 | 0 | 16 | current monitor pass after Browser Use compare CLI prototype and docs landed locally; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |
 
 The active objective is incomplete until `stargazerCount > 1000`.
 
diff --git a/README.md b/README.md
@@ -113,6 +113,7 @@ Useful local checks:
 - `browsertrace list --json` prints recent runs as JSON with id, name, status, and created timestamp.
 - `browsertrace list --status failed` filters recent runs by status; combine with JSON, for example `browsertrace list --status completed --json`.
 - `browsertrace demo` prints a `Run ID:` line you can copy into `browsertrace show` or `browsertrace export`.
+- `browsertrace compare <failed_run_id> <success_run_id>` finds the first step divergence between a failed Browser Use run and a known-good run; add `--json` for automation with `browsertrace compare <failed_run_id> <success_run_id> --json`.
 
 For scripts, CI, or AI/coding-agent troubleshooting, use the JSON CLI checks:
 
@@ -122,6 +123,10 @@ browsertrace list --status failed --json
 browsertrace show <run_id> --json
 ```
 
+When you have a failed run and a known-good run for the same Browser Use task,
+use `browsertrace compare <failed_run_id> <success_run_id> --json` to locate
+the first divergent action, URL, status, or error field.
+
 For automation-oriented usage notes, see
 [JSON CLI checks for automation](examples/#json-cli-checks-for-automation).
 
diff --git a/browsertrace/cli.py b/browsertrace/cli.py
@@ -5,6 +5,7 @@
     browsertrace demo            # create a deterministic failed demo run
     browsertrace list            # list runs in the terminal
     browsertrace show <run_id>   # print a run's timeline
+    browsertrace compare <failed_id> <success_id>  # find the first step divergence
     browsertrace doctor          # print local install and trace-store status
     browsertrace export <id>     # write a portable HTML bundle to ./<id>.html
     browsertrace export <id> --redact  # omit model I/O from the HTML export
@@ -251,6 +252,126 @@ def cmd_show(args) -> int:
     return 0
 
 
+def _run_summary(run: sqlite3.Row) -> dict[str, str]:
+    """Return the ``id``, ``name``, and ``status`` columns of *run* as a dict.
+
+    A falsy ``name`` is replaced by the empty string.
+    """
+    return {"id": run["id"], "name": run["name"] or "", "status": run["status"]}
+
+
+def _step_for_compare(step: sqlite3.Row | None) -> dict[str, object] | None:
+    """Project *step* onto the compared fields, or pass ``None`` through.
+
+    Falsy action/url become ``""`` and a falsy status becomes ``"ok"``.
+    """
+    if step is None:
+        return None
+    view: dict[str, object] = {"step_index": step["step_index"]}
+    view.update(action=step["action"] or "", url=step["url"] or "", status=step["status"] or "ok")
+    return {**view, "error": step["error"]}
+
+
+def _compare_runs(
+    left_run: sqlite3.Row,
+    left_steps: list[sqlite3.Row],
+    right_run: sqlite3.Row,
+    right_steps: list[sqlite3.Row],
+) -> dict[str, object]:
+    """Build the compare payload for two runs and their ordered step rows.
+
+    Steps are paired by list position. ``first_divergence`` stays ``None``
+    unless a pair differs in action, url, status, or error, or one run has
+    no step at that position (reported under the ``presence`` key).
+    """
+    payload: dict[str, object] = {
+        "left": _run_summary(left_run),
+        "right": _run_summary(right_run),
+        "step_counts": {"left": len(left_steps), "right": len(right_steps)},
+        "first_divergence": None,
+    }
+
+    for pos in range(max(len(left_steps), len(right_steps))):
+        left_step = _step_for_compare(left_steps[pos] if pos < len(left_steps) else None)
+        right_step = _step_for_compare(right_steps[pos] if pos < len(right_steps) else None)
+
+        if left_step is None or right_step is None:
+            # Use the stored index of the step that exists so this branch
+            # reports the same kind of number as the field-difference branch.
+            present = left_step if left_step is not None else right_step
+            changed: dict[str, object] = {
+                "presence": {"left": left_step is not None, "right": right_step is not None}
+            }
+        else:
+            present = left_step
+            changed = {
+                name: {"left": left_step[name], "right": right_step[name]}
+                for name in ("action", "url", "status", "error")
+                if left_step[name] != right_step[name]
+            }
+        if changed:
+            payload["first_divergence"] = {
+                "step_index": present["step_index"],
+                "fields": changed,
+                "left_step": left_step,
+                "right_step": right_step,
+            }
+            break
+
+    return payload
+
+
+def _print_compare_human(payload: dict[str, object]) -> None:
+    """Print a compare payload in plain text.
+
+    One header line per run, the step counts, then the first divergence.
+    """
+    left, right = payload["left"], payload["right"]
+    assert isinstance(left, dict)
+    assert isinstance(right, dict)
+    for label, side in (("Left:   ", left), ("Right:  ", right)):
+        print(f"{label}{side['id']}  {_fmt_status(str(side['status']))}  {side['name'] or '(unnamed)'}")
+    step_counts = payload["step_counts"]
+    assert isinstance(step_counts, dict)
+    print(f"Steps:  left={step_counts['left']} right={step_counts['right']}")
+
+    divergence = payload["first_divergence"]
+    if divergence is None:
+        print("No step divergence found.")
+        return
+
+    assert isinstance(divergence, dict)
+    print(f"First divergence at step {divergence['step_index']}")
+    fields = divergence["fields"]
+    assert isinstance(fields, dict)
+    for field, values in fields.items():
+        assert isinstance(values, dict)
+        print(f"{field}:\n  left:  {values['left']}\n  right: {values['right']}")
+
+
+def cmd_compare(args) -> int:
+    """Handle ``browsertrace compare`` and return its integer result code."""
+    steps_sql = "SELECT * FROM steps WHERE run_id=? ORDER BY step_index"
+    with _open() as c:
+        # An id that does not resolve ends the command with the code that
+        # `_find_run` returned alongside the missing row.
+        left_run, rc = _find_run(c, args.left_run_id)
+        if left_run is None:
+            return rc
+        right_run, rc = _find_run(c, args.right_run_id)
+        if right_run is None:
+            return rc
+        left_steps = c.execute(steps_sql, (left_run["id"],)).fetchall()
+        right_steps = c.execute(steps_sql, (right_run["id"],)).fetchall()
+
+    payload = _compare_runs(left_run, left_steps, right_run, right_steps)
+    if not args.json:
+        _print_compare_human(payload)
+        return 0
+    print(json.dumps(payload, indent=2))
+    return 0
+
+
 def cmd_export(args) -> int:
     """Write a self-contained HTML bundle for a run (screenshots inline as base64)."""
     import base64
@@ -396,6 +517,12 @@ def main(argv: Optional[list[str]] = None) -> int:
     p_show.add_argument("--json", action="store_true", help="Print the run timeline as JSON")
     p_show.set_defaults(func=cmd_show)
 
+    p_compare = sub.add_parser("compare", help="Compare two run timelines")
+    p_compare.add_argument("left_run_id", help="Full id or unique prefix for the left run")
+    p_compare.add_argument("right_run_id", help="Full id or unique prefix for the right run")
+    p_compare.add_argument("--json", action="store_true", help="Print the comparison as JSON")
+    p_compare.set_defaults(func=cmd_compare)
+
     p_export = sub.add_parser("export", help="Write a self-contained HTML bundle for a run")
     p_export.add_argument("run_id")
     p_export.add_argument("-o", "--out", help="Output path (default: <run_id>.html)")
diff --git a/docs/browser-use-debugging.html b/docs/browser-use-debugging.html
@@ -388,7 +388,12 @@ <h2 id="debug-empty-model-responses">Debug empty model responses</h2>
 
     <section>
       <h2 id="compare-run-metadata-checklist">Compare failed and successful Browser Use runs</h2>
-      <p>A future compare view is most useful when the failed run and a known successful run carry enough shared metadata to explain where they diverged. This checklist does not implement run comparison; it only describes what to preserve today so a later comparison can be meaningful.</p>
+      <p>When the same Browser Use task has one failed run and one known-good run, compare them from the terminal to find the first divergent step before opening the local UI.</p>
+      <pre><code>browsertrace list
+browsertrace compare &lt;failed_run_id&gt; &lt;success_run_id&gt;
+browsertrace compare &lt;failed_run_id&gt; &lt;success_run_id&gt; --json</code></pre>
+      <p>The CLI compares action, URL, status, and error fields and reports the first divergent step. It does not replace the local UI: use the comparison to locate the boundary, then open the failed run timeline to inspect screenshots, model input/output, and surrounding context.</p>
+      <p>Run comparison is most useful when the failed run and a successful run carry enough shared metadata to explain where they diverged. Preserve these fields when you can:</p>
       <ul>
         <li>Keep a stable task or run id when Browser Use or your app exposes one.</li>
         <li>Record the Browser Use version and BrowserTrace version for each run.</li>
diff --git a/docs/launch/metrics-log.md b/docs/launch/metrics-log.md
@@ -1102,3 +1102,4 @@ uv run --python 3.11 python scripts/launch_metrics.py --json
 | 2026-05-12T22:21:19+00:00 | 3 | 998 | 21 | 0 | 11 | 0 | 16 | current monitor pass after v0.1.18 demo export assets reached 8 downloads each while package/media assets remain 0; owner-channel publishing remains the blocker; traffic views 354/132 unique, clones 16382/2819 unique |
 | 2026-05-12T22:40:20+00:00 | 3 | 998 | 21 | 0 | 11 | 0 | 16 | current monitor pass after #370 Browser Use compare-run metadata checklist documented; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |
 | 2026-05-12T22:41:46+00:00 | 3 | 998 | 21 | 0 | 10 | 0 | 16 | current monitor pass after #370 closed by eac3637 and Browser Use compare-run metadata checklist reached main; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |
+| 2026-05-12T22:48:24+00:00 | 3 | 998 | 21 | 0 | 10 | 0 | 16 | current monitor pass after Browser Use compare CLI prototype and docs landed locally; owner-channel publishing remains the blocker; star goal remains incomplete; traffic views 354/132 unique, clones 16382/2819 unique |
diff --git a/examples/README.md b/examples/README.md
@@ -72,7 +72,7 @@ browsertrace export <run_id> --public -o public.html
 ```
 
 A healthy `browsertrace --help` output should list the main commands, including
-`serve`, `doctor`, `demo`, `list`, `show`, and `export`.
+`serve`, `doctor`, `demo`, `list`, `show`, `compare`, and `export`.
 `browsertrace export --help` should mention public-safe export options such as
 `--public` and an output path like `-o public.html` or `--out public.html`.
 
@@ -84,6 +84,7 @@ A healthy `browsertrace --help` output should list the main commands, including
 | `browsertrace demo` | Create a deterministic failed run |
 | `browsertrace list` | Find recent run IDs |
 | `browsertrace show <run_id>` | Inspect a run timeline in the terminal |
+| `browsertrace compare <failed_run_id> <success_run_id>` | Find the first step divergence between a failed run and a known-good run |
 | `browsertrace export <run_id> --public -o public.html` | Create a public-safe HTML export |
 
 ## Example Matrix
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -40,6 +40,44 @@ def _seed(tmp_path, name, fail=False):
     return run.id
 
 
+def _seed_compare_runs(tmp_path):
+    """Seed a two-step run and a two-step raising run; return ``(failed_id, success_id)``."""
+    tracer = Tracer(home=tmp_path)
+    def opening_step() -> dict:
+        # Fresh kwargs per call so the two runs never share a dict object.
+        return dict(
+            action="navigate",
+            url="https://example.com/start",
+            model_output={"thought": "open start"},
+        )
+
+    with tracer.run("browser-use success") as ok_run:
+        ok_run.step(**opening_step())
+        ok_run.step(
+            action="click(selector=#checkout)",
+            url="https://example.com/done",
+            model_output={"result": "checkout complete"},
+        )
+        success_id = ok_run.id
+
+    try:
+        with tracer.run("browser-use failure") as bad_run:
+            bad_run.step(**opening_step())
+            bad_run.step(
+                action="click(selector=#cancel)",
+                url="https://example.com/cart",
+                status="error",
+                error="wrong target",
+                model_output={"result": "cancelled"},
+            )
+            failed_id = bad_run.id
+            raise RuntimeError("wrong target")
+    except RuntimeError:
+        pass
+
+    return failed_id, success_id
+
+
 def test_cli_module_compiles_on_python311():
     """Guard against Python 3.11 f-string syntax regressions.
 
@@ -149,6 +187,71 @@ def test_cli_show_json_prints_run_and_steps_as_json(cli):
     assert payload["steps"][0]["status"] == "ok"
 
 
+def test_cli_compare_json_reports_first_divergent_step(cli):
+    """``compare --json`` reports step index 1 as the first divergence.
+
+    Also checks run ids/statuses and the per-field left/right values.
+    """
+    cli_mod, tmp_path = cli
+    failed_id, success_id = _seed_compare_runs(tmp_path)
+
+    buf = StringIO()
+    with redirect_stdout(buf):
+        rc = cli_mod.main(["compare", failed_id[:8], success_id[:8], "--json"])
+    payload = json.loads(buf.getvalue())
+
+    assert rc == 0
+    assert payload["left"]["id"] == failed_id
+    assert payload["left"]["status"] == "failed"
+    assert payload["right"]["id"] == success_id
+    assert payload["right"]["status"] == "completed"
+
+    divergence = payload["first_divergence"]
+    assert divergence["step_index"] == 1
+    assert divergence["left_step"]["action"] == "click(selector=#cancel)"
+    assert divergence["right_step"]["action"] == "click(selector=#checkout)"
+    expected_fields = {
+        "action": {"left": "click(selector=#cancel)", "right": "click(selector=#checkout)"},
+        "url": {"left": "https://example.com/cart", "right": "https://example.com/done"},
+        "error": {"left": "wrong target", "right": None},
+    }
+    for name, expected in expected_fields.items():
+        assert divergence["fields"][name] == expected
+
+
+def test_cli_compare_human_output_mentions_first_divergence(cli):
+    """Plain ``compare`` output names the divergent step and both actions, not JSON."""
+    cli_mod, tmp_path = cli
+    failed_id, success_id = _seed_compare_runs(tmp_path)
+
+    buf = StringIO()
+    with redirect_stdout(buf):
+        rc = cli_mod.main(["compare", failed_id[:8], success_id[:8]])
+
+    out = buf.getvalue()
+
+    assert rc == 0
+    actions = ("click(selector=#cancel)", "click(selector=#checkout)")
+    for fragment in ("First divergence at step 1", "action:", *actions):
+        assert fragment in out
+    assert not out.lstrip().startswith("{")
+
+
+def test_cli_compare_json_reports_no_divergence_for_same_run(cli):
+    """Comparing a seeded run with itself reports no divergence and 1/1 steps."""
+    cli_mod, tmp_path = cli
+    same_id = _seed(tmp_path, "same-run")
+    argv = ["compare", same_id, same_id, "--json"]
+    captured = StringIO()
+    with redirect_stdout(captured):
+        rc = cli_mod.main(argv)
+    payload = json.loads(captured.getvalue())
+
+    assert rc == 0
+    assert payload["first_divergence"] is None
+    assert payload["step_counts"] == {"left": 1, "right": 1}
+
+
 def test_cli_show_unknown_run_id_returns_2(cli):
     cli_mod, _ = cli
     # Need a real DB file or _open() will exit 1, so seed an empty one.
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
@@ -2033,6 +2033,20 @@ def test_readme_explains_list_status_filter_near_install_checks():
     assert "hosted sharing" not in readme
 
 
+def test_readme_explains_compare_near_install_checks():
+    """README's install section documents ``compare`` in plain and ``--json`` form."""
+    readme = (Path(__file__).resolve().parents[1] / "README.md").read_text()
+    install_section = readme.split("## Install From PyPI", 1)[1].split("For a walkthrough", 1)[0]
+    for snippet in (
+        "`browsertrace compare <failed_run_id> <success_run_id>`",
+        "finds the first step divergence",
+        "`browsertrace compare <failed_run_id> <success_run_id> --json`",
+        "`browsertrace show <run_id> --json`",
+    ):
+        assert snippet in install_section
+    assert "hosted sharing" not in readme
+
+
 def test_readme_includes_json_cli_automation_recipe_near_install_checks():
     project_root = Path(__file__).resolve().parents[1]
     readme = (project_root / "README.md").read_text()
@@ -2510,6 +2524,7 @@ def test_readme_groups_install_tips_as_compact_list():
         "- `browsertrace list --json` prints recent runs as JSON",
         "- `browsertrace list --status failed` filters recent runs by status",
         "- `browsertrace demo` prints a `Run ID:` line",
+        "- `browsertrace compare <failed_run_id> <success_run_id>` finds the first step divergence between a failed Browser Use run and a known-good run",
         "- The first-run troubleshooting checklist walks through `browsertrace doctor`, `browsertrace demo`, `browsertrace list`, `browsertrace show`, and public-safe export",
         "- The live static demo and public-safe demo export let you inspect a trace before installing anything",
         "- The command cheat sheet summarizes `browsertrace doctor`, `browsertrace demo`, `browsertrace list`, `browsertrace show`, and public-safe export commands",
@@ -3084,6 +3099,7 @@ def test_examples_readme_includes_command_cheat_sheet():
     assert "`browsertrace demo`" in examples_readme
     assert "`browsertrace list`" in examples_readme
     assert "`browsertrace show <run_id>`" in examples_readme
+    assert "`browsertrace compare <failed_run_id> <success_run_id>`" in examples_readme
     assert "`browsertrace export <run_id> --public -o public.html`" in examples_readme
     assert 'pip install "browsertrace[ui]"' in examples_readme
     assert "hosted sharing" not in examples_readme
@@ -3816,6 +3832,8 @@ def test_browser_use_guide_documents_compare_run_metadata_checklist():
         "</section>",
         1,
     )[0]
+    assert "browsertrace compare &lt;failed_run_id&gt; &lt;success_run_id&gt;" in section
+    assert "first divergent step" in section
 
     for expected in [
         "stable task or run id",
@@ -3830,7 +3848,7 @@ def test_browser_use_guide_documents_compare_run_metadata_checklist():
     ]:
         assert expected in section
 
-    assert "does not implement run comparison" in section
+    assert "does not replace the local UI" in section
     assert "stars" not in section.lower()
     assert "upvotes" not in section.lower()
     assert "reposts" not in section.lower()