|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Scrape wiki.csswg.org and produce a static site. |
| 4 | +
|
| 5 | +This script crawls the CSS Working Group Wiki (DokuWiki-based) and generates |
| 6 | +a static HTML archive suitable for hosting on GitHub Pages or any static host. |
| 7 | +
|
| 8 | +Usage: |
| 9 | + python3 bin/scrape.py [output_dir] |
| 10 | +
|
| 11 | +If output_dir is not specified, outputs to the current directory. |
| 12 | +""" |
| 13 | + |
| 14 | +import os |
| 15 | +import re |
| 16 | +import sys |
| 17 | +import time |
| 18 | +import urllib.request |
| 19 | +from pathlib import Path |
| 20 | + |
# Root URL of the wiki being archived; every request URL is built from it.
BASE_URL = "https://wiki.csswg.org"
# Browser-like User-Agent header value sent with every request made by fetch().
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
DELAY = 0.3  # Seconds between requests (be nice to the server)
| 24 | + |
| 25 | + |
def fetch(url):
    """Fetch a URL and return its body as text, or None on failure.

    The request carries the module-level ``USER_AGENT`` header and times out
    after 30 seconds.  The body is decoded as UTF-8; undecodable bytes are
    replaced instead of raising.

    Args:
        url: Absolute URL to download.

    Returns:
        The decoded response body, or ``None`` if the request failed.  The
        failure reason is printed so the caller can carry on with the crawl.
    """
    # Local import: http.client is needed only to name its exception type.
    import http.client

    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = resp.read()
    # Only request-related failures are treated as "page unavailable":
    #   * OSError covers URLError/HTTPError, socket timeouts and resets;
    #   * HTTPException covers protocol-level failures (e.g. truncated reads);
    #   * ValueError (incl. UnicodeEncodeError) covers URLs that cannot be sent.
    # Anything else is a programming error and is allowed to propagate.
    except (OSError, http.client.HTTPException, ValueError) as e:
        print(f" Error: {e}")
        return None
    return body.decode("utf-8", errors="replace")
| 35 | + |
| 36 | + |
def get_all_pages():
    """Discover all wiki pages by crawling the DokuWiki index.

    The main index (``?do=index``) is fetched first to collect top-level
    pages and namespaces.  Every namespace is then expanded through
    ``?do=index&idx=<namespace>``; sub-namespaces found on the way are queued
    and expanded too, to any depth, and each namespace is requested once.
    The function sleeps ``DELAY`` seconds before every namespace request.

    Returns:
        A sorted list of page paths (without the leading slash), or an empty
        list when the main index could not be fetched.
    """
    # Local import keeps this block self-contained.
    from collections import deque

    def _page_links(index_html):
        """Yield page paths linked from a namespace index page."""
        for link in re.finditer(r'href="/([^"?#]+)"', index_html):
            path = link.group(1)
            # Skip DokuWiki internals (lib/, _media, _export, ...), feeds, and
            # protocol-relative links ("//host/..."), which show up here with
            # a leading slash and are not wiki pages.
            if not path.startswith(('lib/', '_', 'feed', '/')):
                yield path

    print("Fetching main index...")
    html = fetch(f"{BASE_URL}/?do=index")
    if not html:
        return []

    pages = set()

    # Find top-level page links
    for m in re.finditer(r'href="/([a-z0-9_-]+)"', html):
        page = m.group(1)
        if page not in ('lib', '_export', '_detail', '_media') and not page.startswith('feed'):
            pages.add(page)

    # Find namespace links like ?idx=ideas
    seen = {m.group(1) for m in re.finditer(r'\?idx=([a-z0-9_-]+)', html)}
    pending = deque(sorted(seen))

    # Expand each namespace; newly discovered sub-namespaces join the queue.
    while pending:
        ns = pending.popleft()
        print(f"Expanding: {ns}")
        time.sleep(DELAY)
        ns_html = fetch(f"{BASE_URL}/?do=index&idx={ns}")
        if not ns_html:
            continue
        pages.update(_page_links(ns_html))

        # Only descendants of the namespace just expanded are followed.
        child_pattern = rf'\?idx=({re.escape(ns)}:[a-z0-9_:-]+)'
        children = {m.group(1) for m in re.finditer(child_pattern, ns_html)} - seen
        seen |= children
        pending.extend(sorted(children))

    return sorted(pages)
| 79 | + |
| 80 | + |
# HTML skeleton for every archived page, filled in with str.format().
# Placeholders: {title}, {home_path}, {breadcrumb} and {content}.
# Literal CSS braces are doubled ({{ and }}) so str.format() emits them as-is.
PAGE_TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title} - CSS Working Group Wiki (Archive)</title>
<style>
*, *::before, *::after {{ box-sizing: border-box; }}
body {{
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
  max-width: 900px; margin: 0 auto; padding: 1.5em 1em; line-height: 1.6;
  color: #1f2328; background: #fff;
}}
.archive-banner {{
  background: #fff8c5; border: 1px solid #d4a72c; border-radius: 6px;
  padding: 0.75em 1em; margin-bottom: 1.5em; font-size: 0.9em;
}}
.archive-banner strong {{ color: #6e5600; }}
header {{ border-bottom: 1px solid #d1d5db; padding-bottom: 1em; margin-bottom: 1.5em; }}
header h1 {{ margin: 0; font-size: 1.25em; }}
header h1 a {{ color: #0366d6; text-decoration: none; }}
header h1 a:hover {{ text-decoration: underline; }}
nav {{ margin-top: 0.5em; font-size: 0.9em; }}
nav a {{ color: #656d76; text-decoration: none; margin-right: 1em; }}
nav a:hover {{ color: #0366d6; }}
h1, h2, h3, h4 {{ color: #1f2328; margin-top: 1.5em; }}
h1:first-child {{ margin-top: 0; }}
a {{ color: #0366d6; }}
code {{ background: #f6f8fa; padding: 0.15em 0.3em; border-radius: 3px; font-size: 0.9em; }}
pre {{ background: #f6f8fa; padding: 1em; overflow: auto; border-radius: 6px; }}
pre code {{ background: none; padding: 0; }}
table {{ border-collapse: collapse; margin: 1em 0; }}
th, td {{ border: 1px solid #d1d5db; padding: 0.4em 0.8em; }}
th {{ background: #f6f8fa; }}
img {{ max-width: 100%; }}
.breadcrumb {{ font-size: 0.85em; color: #656d76; margin-bottom: 1em; }}
.breadcrumb a {{ color: #656d76; }}
ul, ol {{ padding-left: 1.5em; }}
li {{ margin: 0.25em 0; }}
.plugin_note {{ background: #f0f4f8; border-left: 4px solid #0366d6; padding: 0.75em 1em; margin: 1em 0; border-radius: 3px; }}
abbr {{ text-decoration: underline dotted; cursor: help; }}
@media (prefers-color-scheme: dark) {{
  body {{ background: #0d1117; color: #e6edf3; }}
  .archive-banner {{ background: #3d2e00; border-color: #6e5600; }}
  .archive-banner strong {{ color: #f0c000; }}
  header {{ border-bottom-color: #30363d; }}
  header h1 a {{ color: #58a6ff; }}
  nav a {{ color: #8b949e; }}
  nav a:hover {{ color: #58a6ff; }}
  h1, h2, h3, h4 {{ color: #e6edf3; }}
  a {{ color: #58a6ff; }}
  code, pre {{ background: #161b22; }}
  th, td {{ border-color: #30363d; }}
  th {{ background: #161b22; }}
  .breadcrumb, .breadcrumb a {{ color: #8b949e; }}
  .plugin_note {{ background: #161b22; border-color: #58a6ff; }}
}}
</style>
</head>
<body>
<div class="archive-banner">
<strong>Archive Notice:</strong> This is a read-only archive of the CSS Working Group Wiki.
The original wiki was hosted at wiki.csswg.org.
</div>
<header>
<h1><a href="{home_path}">CSS Working Group Wiki</a></h1>
<nav>
<a href="{home_path}">Home</a>
<a href="{home_path}spec/">Specs</a>
<a href="{home_path}ideas/">Ideas</a>
<a href="{home_path}test/">Testing</a>
<a href="{home_path}wiki/">About</a>
</nav>
</header>
{breadcrumb}
<main>
{content}
</main>
</body>
</html>
'''
| 162 | + |
| 163 | + |
def extract_content(html, page_path):
    """Pull the article body and its title out of a rendered DokuWiki page.

    Args:
        html: Full HTML source of the wiki page.
        page_path: Wiki path of the page; used as the title of last resort.

    Returns:
        A ``(title, content)`` tuple.  ``content`` is the article markup with
        the wiki's editing and layout scaffolding stripped, or a placeholder
        paragraph when no article body could be located.
    """
    # Preferred source: the region between the wikipage start/stop comments;
    # otherwise fall back to the page div.
    body_match = re.search(
        r'<!-- wikipage start -->\s*(.*?)\s*<!-- wikipage stop -->', html, re.DOTALL
    ) or re.search(
        r'<div class="page"[^>]*>(.*?)</div>\s*(?:<div class="docInfo"|</div>\s*</div>\s*<div class="clearer")',
        html,
        re.DOTALL,
    )
    if body_match:
        content = body_match.group(1).strip()
    else:
        content = "<p>Content could not be extracted.</p>"

    # Title: first plain-text <h1> of the body, else the document <title>
    # (cut at the first "[" or tag), else the page path itself.
    title_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content) or re.search(
        r'<title>\s*([^<\[]+)', html
    )
    title = title_match.group(1).strip() if title_match else page_path

    # Ordered clean-up passes: (pattern, replacement, flags).  Order matters,
    # because later passes run on the output of earlier ones.
    cleanup_passes = (
        # Edit-section buttons and TOC toggle headers
        (r'<div class="secedit[^>]*>.*?</div>', '', re.DOTALL),
        (r'<div class="tocheader[^>]*>.*?</div>', '', re.DOTALL),
        # Section-edit ids and classes
        (r' id="[^"]*sectionedit[^"]*"', '', 0),
        (r' class="sectionedit\d+"', '', 0),
        # Wiki-specific link attributes
        (r' class="wikilink\d?"', '', 0),
        (r' data-wiki-id="[^"]*"', '', 0),
        # External-link attributes
        (r' class="urlextern"', '', 0),
        (r' rel="ugc nofollow"', ' rel="noopener"', 0),
        # Layout-only div wrappers
        (r'<div class="level\d+">\s*', '', 0),
        (r'\s*</div>\s*(?=<h[1-6]|<ul|<ol|<p|$)', '', 0),
        (r'<div class="li">\s*', '', 0),
        (r'\s*</div>\s*</li>', '</li>', 0),
    )
    for pattern, replacement, flags in cleanup_passes:
        content = re.sub(pattern, replacement, content, flags=flags)

    return title, content
| 203 | + |
| 204 | + |
def make_breadcrumb(page_path, home_path):
    """Generate breadcrumb navigation for a page.

    Args:
        page_path: Slash-separated wiki path of the page, e.g. ``spec/foo``.
        home_path: Relative URL from the page to the archive root.

    Returns:
        A ``<div class="breadcrumb">`` fragment with a "Home" link, one link
        per ancestor segment and the page's own segment as plain text; an
        empty string for the home page (``'main'``) or an empty path.
    """
    # Local import: segments come from scraped hrefs, so they are escaped
    # before being placed into the markup.
    from html import escape

    if page_path in ('main', ''):
        return ''

    *ancestors, leaf = page_path.split('/')
    crumbs = [f'<a href="{home_path}">Home</a>']
    # The first ancestor is len(ancestors) directories above the page, the
    # next one is one level closer, and so on down to "../".
    for levels_up, name in zip(range(len(ancestors), 0, -1), ancestors):
        up = "../" * levels_up
        crumbs.append(f'<a href="{up}">{escape(name)}</a>')
    crumbs.append(escape(leaf))

    return f'<div class="breadcrumb">{" / ".join(crumbs)}</div>'
| 218 | + |
| 219 | + |
def fix_internal_links(content, home_path):
    """Convert root-absolute wiki links to paths relative to the archive root.

    Rewrites ``href="/page..."`` to ``href="{home_path}page..."`` for targets
    whose first path character is a lowercase letter or a digit, and rewrites
    the bare root link ``href="/"`` to ``href="{home_path}"``.  Links whose
    path starts with any other character (``/_media/...``, protocol-relative
    ``//host/...``) and links that are not root-absolute are left untouched.

    Args:
        content: HTML fragment to process.
        home_path: Relative URL from the current page to the archive root.

    Returns:
        The fragment with matching ``href`` values rewritten.
    """
    # The lookahead keeps the first path character (or the closing quote of a
    # bare "/") in place.  A callable replacement is used so that home_path
    # is inserted literally, never interpreted as a regex template.
    return re.sub(
        r'href="/(?=[a-z0-9"])',
        lambda _match: f'href="{home_path}',
        content,
    )
| 228 | + |
| 229 | + |
def save_page(output_dir, page_path, html):
    """Process a downloaded wiki page and write it into the static site.

    The home page (``'main'``) is written to ``<output_dir>/index.html``;
    every other page goes to ``<output_dir>/<page_path>/index.html``, with
    parent directories created as needed.

    Args:
        output_dir: Root directory of the generated site (``Path`` or str).
        page_path: Slash-separated wiki path of the page.
        html: Full HTML source of the page as downloaded.

    Returns:
        None.  Pages whose path is absolute or contains a ``..`` segment are
        reported and skipped, because writing them would leave ``output_dir``.
    """
    output_dir = Path(output_dir)

    if page_path == 'main':
        home_path = "./"
        out_path = output_dir / 'index.html'
    else:
        segments = page_path.split('/')
        # page_path comes from scraped hrefs: never let it escape output_dir.
        if page_path.startswith('/') or '..' in segments:
            print(f" Skipping unsafe page path: {page_path}")
            return
        # One "../" per path segment leads from the page back to the root.
        home_path = "../" * len(segments)
        out_dir = output_dir / page_path
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / 'index.html'

    title, content = extract_content(html, page_path)
    breadcrumb = make_breadcrumb(page_path, home_path)
    content = fix_internal_links(content, home_path)

    output = PAGE_TEMPLATE.format(
        title=title,
        content=content,
        breadcrumb=breadcrumb,
        home_path=home_path,
    )

    out_path.write_text(output, encoding='utf-8')
| 257 | + |
| 258 | + |
def main():
    """Crawl the wiki and write the static archive.

    The output directory is taken from the first command-line argument and
    defaults to the current directory.  Every discovered page is downloaded
    (sleeping ``DELAY`` seconds before each request) and saved; pages that
    could not be downloaded are listed at the end.
    """
    output_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    output_dir.mkdir(parents=True, exist_ok=True)

    pages = get_all_pages()

    # Always include 'main' as the homepage.  This happens before the count
    # is reported so that it matches the progress counter below.
    if 'main' not in pages:
        pages = ['main'] + list(pages)

    print(f"\nFound {len(pages)} pages. Starting download...\n")

    failed = []
    for number, page in enumerate(pages, start=1):
        print(f"[{number}/{len(pages)}] {page}")
        time.sleep(DELAY)

        html = fetch(f"{BASE_URL}/{page}")
        if html:
            save_page(output_dir, page, html)
        else:
            failed.append(page)

    if failed:
        # Make an incomplete archive visible instead of reporting plain success.
        print(f"\nWarning: {len(failed)} page(s) were not saved (download failed or empty):")
        for page in failed:
            print(f" {page}")

    print(f"\nDone! Static site written to {output_dir}")
    print(f"Preview with: python3 -m http.server -d {output_dir}")


if __name__ == "__main__":
    main()
0 commit comments