karpathy · AntDX316 · Mar 28, 2026
diff --git a/README.md b/README.md
@@ -19,7 +19,7 @@ The repo includes scrapers, parsers, and a pipeline for writing custom LLM promp
 
 ## Data pipeline
 
-1. **Scrape** (`scrape.py`) — Playwright (non-headless, BLS blocks bots) downloads raw HTML for all 342 occupation pages into `html/`.
+1. **Scrape** (`scrape.py`) — Playwright downloads raw HTML for all 342 occupation pages into `html/`. By default it runs non-headless (BLS blocks bots more aggressively in headless mode), with an optional `--headless` flag for testing/experimentation.
 2. **Parse** (`parse_detail.py`, `process.py`) — BeautifulSoup converts raw HTML into clean Markdown files in `pages/`.
 3. **Tabulate** (`make_csv.py`) — Extracts structured fields (pay, education, job count, growth outlook, SOC code) into `occupations.csv`.
 4. **Score** (`score.py`) — Sends each occupation's Markdown description to an LLM with a scoring rubric. Each occupation gets an AI Exposure score from 0-10 with a rationale. Results saved to `scores.json`. Fork this to write your own prompts.
@@ -60,6 +60,9 @@ OPENROUTER_API_KEY=your_key_here
 # Scrape BLS pages (only needed once, results are cached in html/)
 uv run python scrape.py
 
+# Optional: run Chromium headlessly for testing/experimentation
+uv run python scrape.py --headless
+
 # Generate Markdown from HTML
 uv run python process.py
 

diff --git a/scrape.py b/scrape.py
@@ -5,10 +5,11 @@
 Run process.py afterwards to derive data/<slug>.json and pages/<slug>.md.
 
 Usage:
     uv run python scrape.py                      # scrape all (0 to 342)
     uv run python scrape.py --start 0 --end 5    # scrape first 5
     uv run python scrape.py --start 10 --end 20  # scrape indices 10-19
-    uv run python scrape.py --force               # re-scrape ignoring cache
+    uv run python scrape.py --force              # re-scrape ignoring cache
+    uv run python scrape.py --headless           # opt into headless Chromium
 
 Caching: skips any occupation where html/<slug>.html already exists.
 """
@@ -28,6 +29,7 @@ def main():
     parser.add_argument("--end", type=int, default=None, help="End index (exclusive)")
     parser.add_argument("--force", action="store_true", help="Re-scrape even if cached")
     parser.add_argument("--delay", type=float, default=1.0, help="Seconds between requests")
+    parser.add_argument("--headless", action="store_true", help="Launch Chromium in headless mode (default: non-headless)")
     args = parser.parse_args()
 
     # Load master list
@@ -55,10 +57,11 @@ def main():
         print("Nothing to scrape — all cached.")
         return
 
-    print(f"\nScraping {len(to_scrape)} occupations (non-headless Chromium)...\n")
+    mode = "headless" if args.headless else "non-headless"
+    print(f"\nScraping {len(to_scrape)} occupations ({mode} Chromium)...\n")
 
     with sync_playwright() as p:
-        browser = p.chromium.launch(headless=False)
+        browser = p.chromium.launch(headless=args.headless)
         page = browser.new_page()
 
         for idx, (i, occ) in enumerate(to_scrape):