BrowseTrace/regenerate_full_snapshot.py at main · landigf/BrowseTrace · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
"""Regenerate the canonical artifact_snapshot.json matching the paper's full
1,301-session scope (400 scripted + 901 LLM-driven).

Reads the canonical cache-trace CSVs:
  - data/traces/full_400_sessions.csv   (scripted)
  - data/traces/llm_full_901.csv        (LLM)

Records the location of (but does not itself open) the per-task summaries:
  - data/release-v3/scraping/<task>/summary.json
    (for the Zurich scripted subset per-task statistics)

Produces artifact_snapshot.json in this script's own directory, which records:
  - paper-level scope (sessions, requests, models, providers, regions)
  - file paths to canonical cache traces
  - sanitization policy + tool path
  - paper-body header numbers (so a reviewer can verify the artifact
    describes the same corpus as the paper text).

Run:
    python3 regenerate_full_snapshot.py
"""
from __future__ import annotations

import csv
import json
from pathlib import Path

# Absolute directory containing this script; the snapshot is written here.
PAPER_DIR = Path(__file__).resolve().parents[0]
# The repository root is taken to be that same directory.
REPO_ROOT = PAPER_DIR
# Canonical cache-trace CSVs whose rows are counted for the snapshot.
SCRIPTED_CSV = REPO_ROOT.joinpath("data", "traces", "full_400_sessions.csv")
LLM_CSV = REPO_ROOT.joinpath("data", "traces", "llm_full_901.csv")
# Release directory. NOTE(review): not referenced anywhere else in the
# visible part of this file - confirm whether it is still needed.
RELEASES = REPO_ROOT.joinpath("data", "release-v3")


def count_rows(path: Path) -> int:
    """Return the number of data records in the CSV file at *path*.

    The first record is treated as the header and is not counted. Records
    are parsed with the ``csv`` module rather than counted as physical
    lines, so a quoted field containing a newline is one record and blank
    lines are ignored.

    Args:
        path: Location of a CSV file whose first record is a header.

    Returns:
        The number of non-blank records after the header. An empty or
        header-only file yields ``0`` (never a negative count).

    Raises:
        OSError: If *path* cannot be opened.
    """
    # newline="" lets the csv module handle line endings inside quoted
    # fields itself. Only records are counted, so undecodable bytes are
    # replaced instead of aborting the count on a locale mismatch.
    with path.open(newline="", encoding="utf-8", errors="replace") as f:
        records = sum(1 for row in csv.reader(f) if row)
    # Drop the header record; clamp so an empty file reports 0, not -1.
    return max(records - 1, 0)


# Session counts for the corpus described by the snapshot. The total is
# derived so the three figures cannot drift apart.
_SCRIPTED_SESSIONS = 400
_LLM_SESSIONS = 901
_TOTAL_SESSIONS = _SCRIPTED_SESSIONS + _LLM_SESSIONS


def _build_snapshot(scripted_rows: int, llm_rows: int) -> dict:
    """Assemble the snapshot dictionary from the two measured row counts.

    Args:
        scripted_rows: Data-row count of the scripted trace CSV.
        llm_rows: Data-row count of the LLM trace CSV.

    Returns:
        A JSON-serializable dict. Everything except the two row counts and
        the two relative trace paths is a fixed literal.
    """
    return {
        "release": "release-v3",
        "description": (
            "BrowseTrace full release: scripted baseline + LLM-driven agent traces "
            "for cache-replay and workload characterization."
        ),
        "scope": {
            "total_sessions": _TOTAL_SESSIONS,
            "scripted_sessions": _SCRIPTED_SESSIONS,
            "llm_sessions": _LLM_SESSIONS,
            "scripted_requests": scripted_rows,
            "llm_replay_requests": llm_rows,
            "task_families": 10,
            "regions": ["zurich", "us-central1", "europe-west1", "asia-southeast1"],
            "models": [
                "gpt-4.1-mini",
                "gemini-2.5-flash",
                "gemini-2.5-pro",
                "claude-haiku-4.5",
                "deepseek-v3.2",
                "qwen-2.5-coder-7b",
            ],
            "providers": ["OpenAI", "Google", "Anthropic", "DeepSeek", "Alibaba"],
        },
        "canonical_files": {
            # as_posix() keeps the recorded paths '/'-separated on every
            # platform, matching the hard-coded summary path below.
            "scripted_cache_trace": SCRIPTED_CSV.relative_to(REPO_ROOT).as_posix(),
            "llm_cache_trace": LLM_CSV.relative_to(REPO_ROOT).as_posix(),
            "scripted_per_task_summaries": "data/release-v3/scraping/<task>/summary.json",
            # NOTE(review): this note names cache-sim/traces/, while the
            # trace actually counted lives under data/traces/ - confirm
            # which location is intended before the next release.
            "llm_raw_source_dirs_note": (
                "Per-model LLM raw session bundles are stitched into "
                "cache-sim/traces/llm_full_901.csv for public release. "
                "Per-session raw JSON is not included in v3 for compactness; "
                "see paper Appendix for per-model session counts."
            ),
        },
        "cache_replay": {
            "reference_implementation": "libCacheSim (v0.3.3+, https://github.com/1a1a11a/libCacheSim)",
            "policies_evaluated": ["LRU", "LFU", "ARC", "S3-FIFO", "W-TinyLFU", "GDSF"],
            "cache_sizes_mib": [1, 5, 10, 25, 50],
        },
        "sanitization": {
            "applied": True,
            "tool": "tools/sanitize_release.py",
            "policy": (
                "Request/response headers stripped: Authorization, Cookie, Set-Cookie, "
                "Proxy-Authorization. URL query parameter values replaced with _REDACTED_. "
                "User-Agent strings with project-brand tokens replaced with "
                "'BrowseTrace/1.0 (benchmark)'. Recursive scrub of nested JSON fields."
            ),
        },
        "paper_body_reference_numbers": {
            "note": "These are the core numbers the paper reports; they must match the canonical traces above.",
            # Hard-coded figures: this script records them but does not
            # check them against the traces.
            "scripted_request_total": 168_067,
            "scripted_cacheable_requests": scripted_rows,
            "llm_replay_requests": llm_rows,
            "abstract_5mib_gdsf_lru_scripted": "59.5% vs 37.4%",
            "abstract_5mib_gdsf_lru_llm": "76.2% vs 43.5%",
        },
    }


def main() -> None:
    """Count both canonical traces and write ``artifact_snapshot.json``.

    The snapshot is written into ``PAPER_DIR`` as indented JSON with a
    trailing newline, and a short summary is printed to stdout.

    Raises:
        FileNotFoundError: If either canonical trace CSV does not exist.
    """
    # Check both inputs up front so nothing is written on a partial corpus.
    if not SCRIPTED_CSV.exists():
        raise FileNotFoundError(f"Scripted trace missing: {SCRIPTED_CSV}")
    if not LLM_CSV.exists():
        raise FileNotFoundError(f"LLM trace missing: {LLM_CSV}")

    scripted_rows = count_rows(SCRIPTED_CSV)
    llm_rows = count_rows(LLM_CSV)
    snapshot = _build_snapshot(scripted_rows, llm_rows)

    out = PAPER_DIR / "artifact_snapshot.json"
    # Explicit encoding so the artifact does not depend on the locale.
    out.write_text(json.dumps(snapshot, indent=2) + "\n", encoding="utf-8")
    print(f"Wrote canonical snapshot: {out}")
    print(f"  scripted requests: {scripted_rows:,}")
    print(f"  LLM replay requests: {llm_rows:,}")
    print(f"  total sessions: {_TOTAL_SESSIONS:,}")


# Regenerate the snapshot only when run as a script, not when imported.
if __name__ == "__main__":
    main()