Skip to content

Commit 89be674

Browse files
Add scraping script and README documentation
- bin/scrape.py: Python script that crawls wiki.csswg.org and generates a static HTML archive
- README.md: Documentation on contents, regeneration, and deployment
1 parent 649200c commit 89be674

2 files changed

Lines changed: 347 additions & 0 deletions

File tree

README.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# CSS Working Group Wiki Archive
2+
3+
This is a read-only static archive of the [CSS Working Group Wiki](https://wiki.csswg.org/),
4+
originally hosted on DokuWiki at wiki.csswg.org.
5+
6+
**Live site:** https://w3c.github.io/csswg-wiki-archive/
7+
8+
## Contents
9+
10+
The archive contains 294 pages covering:
11+
12+
- **Specification Issues and Planning** (`/spec/`) — Wiki pages for tracking spec-related thoughts
13+
- **Ideas and Resolutions** (`/ideas/`) — Ideas not yet in a spec
14+
- **Testing** (`/test/`) — CSS testing documentation
15+
- **Meeting Planning** (`/planning/`) — Face-to-face meeting plans and schedules
16+
- **CSSWG Tools** (`/tools/`) — Documentation about working group tools
17+
18+
## Regenerating the Archive
19+
20+
The archive was generated using a Python scraper that crawls the DokuWiki site
21+
and produces static HTML files.
22+
23+
### Requirements
24+
25+
- Python 3.8+
26+
- Network access to wiki.csswg.org
27+
28+
### Usage
29+
30+
```bash
31+
# Scrape the wiki and output to a directory
32+
python3 bin/scrape.py /path/to/output
33+
34+
# Or output to current directory
35+
python3 bin/scrape.py .
36+
37+
# Preview locally
38+
python3 -m http.server -d /path/to/output
39+
```
40+
41+
The scraper:
42+
1. Discovers all pages by crawling the DokuWiki index
43+
2. Fetches each page and extracts the main content
44+
3. Generates clean HTML with modern styling and dark mode support
45+
4. Creates proper relative links for hosting at any URL path
46+
47+
### Rate Limiting
48+
49+
The scraper includes a 0.3-second delay between requests to avoid overwhelming
50+
the server. A full scrape of ~294 pages takes approximately 2 minutes.
51+
52+
## Deployment
53+
54+
The site is automatically deployed to GitHub Pages via the workflow in
55+
`.github/workflows/deploy.yml` whenever changes are pushed to the `main` branch.
56+
57+
## License
58+
59+
The wiki content is governed by the
60+
[W3C Document License](https://www.w3.org/Consortium/Legal/2015/doc-license).

bin/scrape.py

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Scrape wiki.csswg.org and produce a static site.
4+
5+
This script crawls the CSS Working Group Wiki (DokuWiki-based) and generates
6+
a static HTML archive suitable for hosting on GitHub Pages or any static host.
7+
8+
Usage:
9+
python3 bin/scrape.py [output_dir]
10+
11+
If output_dir is not specified, outputs to the current directory.
12+
"""
13+
14+
import os
15+
import re
16+
import sys
17+
import time
18+
import urllib.request
19+
from pathlib import Path
20+
21+
# Root of the live wiki; index and page URLs are built by appending to this.
BASE_URL = "https://wiki.csswg.org"
# Value of the User-Agent header sent with every request made by fetch().
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
DELAY = 0.3  # Seconds between requests (be nice to the server)
24+
25+
26+
def fetch(url):
    """Fetch *url* and return the response body as text.

    The request carries the module-level ``USER_AGENT`` header and times out
    after 30 seconds. The body is decoded as UTF-8; undecodable bytes are
    replaced rather than raising.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded response body, or ``None`` if the request failed. The
        error is printed so a crawl can continue with the next URL.
    """
    # Function-scope import: the file's import block does not bring in
    # http.client by name.
    import http.client

    try:
        # Request() itself raises ValueError for a malformed URL, so it is
        # built inside the try to honour the "None on failure" contract.
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except (OSError, http.client.HTTPException, ValueError) as e:
        # OSError covers URLError/HTTPError, socket timeouts and TLS errors;
        # HTTPException covers truncated or malformed responses. Anything
        # else is a programming error and is allowed to propagate.
        print(f" Error: {e}")
        return None
35+
36+
37+
def get_all_pages():
    """Discover all wiki pages by crawling the DokuWiki index.

    The main index (``?do=index``) is fetched first to collect top-level
    pages and namespaces. Every namespace is then expanded with
    ``?do=index&idx=<namespace>``; sub-namespaces found in an expanded
    listing are queued and expanded in turn, at any depth, and each
    namespace is requested only once. ``DELAY`` seconds elapse before every
    namespace request.

    Returns:
        A sorted list of page paths (no leading slash), or an empty list if
        the main index could not be fetched.
    """
    pages = set()
    page_link_re = re.compile(r'href="/([^"?#]+)"')

    def collect_pages(index_html):
        """Add every wiki page linked from one index listing to ``pages``."""
        for m in page_link_re.finditer(index_html):
            p = m.group(1)
            if p.startswith(('lib/', '_', 'feed')):
                continue
            # A protocol-relative href ("//host/...") is captured with a
            # leading slash, and a ".." segment would point outside the
            # wiki; neither is a wiki page.
            if p.startswith('/') or '..' in p.split('/'):
                continue
            pages.add(p)

    print("Fetching main index...")
    html = fetch(f"{BASE_URL}/?do=index")
    if not html:
        return []

    # Find namespace links like ?idx=ideas
    namespaces = set(re.findall(r'\?idx=([a-z0-9_-]+)', html))

    # Find top-level page links
    for m in re.finditer(r'href="/([a-z0-9_-]+)"', html):
        page = m.group(1)
        if page not in ('lib', '_export', '_detail', '_media') and not page.startswith('feed'):
            pages.add(page)

    # Expand each namespace to find all pages within it. ``to_visit`` is a
    # stack kept in reverse order so that pop() yields namespaces
    # alphabetically, with each namespace's children visited right after it.
    to_visit = sorted(namespaces, reverse=True)
    seen = set(to_visit)
    while to_visit:
        ns = to_visit.pop()
        print(f"Expanding: {ns}")
        time.sleep(DELAY)
        html = fetch(f"{BASE_URL}/?do=index&idx={ns}")
        if not html:
            continue
        collect_pages(html)
        # Queue sub-namespaces of this namespace that are not yet known.
        sub_re = rf'\?idx=({re.escape(ns)}:[a-z0-9_:-]+)'
        new_namespaces = set(re.findall(sub_re, html)) - seen
        seen.update(new_namespaces)
        to_visit.extend(sorted(new_namespaces, reverse=True))

    return sorted(pages)
79+
80+
81+
# HTML skeleton for every generated page, filled in with str.format().
# Placeholders: {title}, {home_path} (relative prefix to the archive root),
# {breadcrumb} and {content}. Literal CSS braces are doubled ({{ }}) so that
# str.format() leaves them alone.
PAGE_TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title} - CSS Working Group Wiki (Archive)</title>
<style>
*, *::before, *::after {{ box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
max-width: 900px; margin: 0 auto; padding: 1.5em 1em; line-height: 1.6;
color: #1f2328; background: #fff;
}}
.archive-banner {{
background: #fff8c5; border: 1px solid #d4a72c; border-radius: 6px;
padding: 0.75em 1em; margin-bottom: 1.5em; font-size: 0.9em;
}}
.archive-banner strong {{ color: #6e5600; }}
header {{ border-bottom: 1px solid #d1d5db; padding-bottom: 1em; margin-bottom: 1.5em; }}
header h1 {{ margin: 0; font-size: 1.25em; }}
header h1 a {{ color: #0366d6; text-decoration: none; }}
header h1 a:hover {{ text-decoration: underline; }}
nav {{ margin-top: 0.5em; font-size: 0.9em; }}
nav a {{ color: #656d76; text-decoration: none; margin-right: 1em; }}
nav a:hover {{ color: #0366d6; }}
h1, h2, h3, h4 {{ color: #1f2328; margin-top: 1.5em; }}
h1:first-child {{ margin-top: 0; }}
a {{ color: #0366d6; }}
code {{ background: #f6f8fa; padding: 0.15em 0.3em; border-radius: 3px; font-size: 0.9em; }}
pre {{ background: #f6f8fa; padding: 1em; overflow: auto; border-radius: 6px; }}
pre code {{ background: none; padding: 0; }}
table {{ border-collapse: collapse; margin: 1em 0; }}
th, td {{ border: 1px solid #d1d5db; padding: 0.4em 0.8em; }}
th {{ background: #f6f8fa; }}
img {{ max-width: 100%; }}
.breadcrumb {{ font-size: 0.85em; color: #656d76; margin-bottom: 1em; }}
.breadcrumb a {{ color: #656d76; }}
ul, ol {{ padding-left: 1.5em; }}
li {{ margin: 0.25em 0; }}
.plugin_note {{ background: #f0f4f8; border-left: 4px solid #0366d6; padding: 0.75em 1em; margin: 1em 0; border-radius: 3px; }}
abbr {{ text-decoration: underline dotted; cursor: help; }}
@media (prefers-color-scheme: dark) {{
body {{ background: #0d1117; color: #e6edf3; }}
.archive-banner {{ background: #3d2e00; border-color: #6e5600; }}
.archive-banner strong {{ color: #f0c000; }}
header {{ border-bottom-color: #30363d; }}
header h1 a {{ color: #58a6ff; }}
nav a {{ color: #8b949e; }}
nav a:hover {{ color: #58a6ff; }}
h1, h2, h3, h4 {{ color: #e6edf3; }}
a {{ color: #58a6ff; }}
code, pre {{ background: #161b22; }}
th, td {{ border-color: #30363d; }}
th {{ background: #161b22; }}
.breadcrumb, .breadcrumb a {{ color: #8b949e; }}
.plugin_note {{ background: #161b22; border-color: #58a6ff; }}
}}
</style>
</head>
<body>
<div class="archive-banner">
<strong>Archive Notice:</strong> This is a read-only archive of the CSS Working Group Wiki.
The original wiki was hosted at wiki.csswg.org.
</div>
<header>
<h1><a href="{home_path}">CSS Working Group Wiki</a></h1>
<nav>
<a href="{home_path}">Home</a>
<a href="{home_path}spec/">Specs</a>
<a href="{home_path}ideas/">Ideas</a>
<a href="{home_path}test/">Testing</a>
<a href="{home_path}wiki/">About</a>
</nav>
</header>
{breadcrumb}
<main>
{content}
</main>
</body>
</html>
'''
162+
163+
164+
def extract_content(html, page_path):
    """Extract the title and cleaned main content from a DokuWiki page.

    The content is the text between the ``<!-- wikipage start -->`` and
    ``<!-- wikipage stop -->`` comments, falling back to the ``page`` div;
    if neither is found a placeholder paragraph is used. The title is the
    text of the first ``<h1>`` in the content, else the leading part of the
    document ``<title>`` (up to the first ``[``), else ``page_path``.

    Args:
        html: Full HTML source of the wiki page.
        page_path: Wiki path of the page, used as the last-resort title.

    Returns:
        A ``(title, content)`` tuple of HTML strings.
    """
    # The ``html`` parameter shadows the stdlib module of the same name, so
    # only the needed function is imported, under a local name.
    from html import escape

    # Find content between wikipage start/stop comments
    m = re.search(r'<!-- wikipage start -->\s*(.*?)\s*<!-- wikipage stop -->', html, re.DOTALL)
    if not m:
        # Fallback: find the page div
        m = re.search(r'<div class="page"[^>]*>(.*?)</div>\s*(?:<div class="docInfo"|</div>\s*</div>\s*<div class="clearer")', html, re.DOTALL)

    content = m.group(1).strip() if m else "<p>Content could not be extracted.</p>"

    # Extract title from first h1 or page title
    title_m = re.search(r'<h1[^>]*>([^<]+)</h1>', content)
    if not title_m:
        title_m = re.search(r'<title>\s*([^<\[]+)', html)
    title = title_m.group(1).strip() if title_m else ''
    if not title:
        # Text taken from the page is already HTML source; the raw path is
        # not, so it is escaped before being used as markup.
        title = escape(page_path)

    # Clean-up passes, applied in this exact order (later patterns rely on
    # earlier ones having removed their wrappers).
    cleanup_rules = (
        # Remove edit section buttons
        (r'<div class="secedit[^>]*>.*?</div>', '', re.DOTALL),
        # Remove TOC toggle buttons
        (r'<div class="tocheader[^>]*>.*?</div>', '', re.DOTALL),
        # Remove section edit IDs
        (r' id="[^"]*sectionedit[^"]*"', '', 0),
        (r' class="sectionedit\d+"', '', 0),
        # Remove wiki-specific link classes
        (r' class="wikilink\d?"', '', 0),
        (r' data-wiki-id="[^"]*"', '', 0),
        # External link class cleanup
        (r' class="urlextern"', '', 0),
        (r' rel="ugc nofollow"', ' rel="noopener"', 0),
        # Clean up div wrappers that are just for layout.
        # NOTE(review): the closing-div pattern is a heuristic; it removes any
        # </div> that precedes a heading, list or paragraph, not only the
        # ones belonging to "levelN" wrappers.
        (r'<div class="level\d+">\s*', '', 0),
        (r'\s*</div>\s*(?=<h[1-6]|<ul|<ol|<p|$)', '', 0),
        (r'<div class="li">\s*', '', 0),
        (r'\s*</div>\s*</li>', '</li>', 0),
    )
    for pattern, replacement, flags in cleanup_rules:
        content = re.sub(pattern, replacement, content, flags=flags)

    return title, content
203+
204+
205+
def make_breadcrumb(page_path, home_path):
    """Generate breadcrumb navigation for a page.

    Args:
        page_path: Slash-separated wiki path of the page, e.g. ``spec/foo``.
        home_path: Relative URL prefix leading from the page to the archive
            root, used for the "Home" link.

    Returns:
        A ``<div class="breadcrumb">`` element, or an empty string for the
        home page (``'main'``) or an empty path.
    """
    from html import escape

    if page_path in ('main', ''):
        return ''

    parts = page_path.split('/')
    depth = len(parts)
    crumbs = [f'<a href="{home_path}">Home</a>']
    for i, part in enumerate(parts[:-1]):
        # Ancestor i is (depth - i - 1) directories above the page's own
        # directory.
        up = "../" * (depth - i - 1)
        crumbs.append(f'<a href="{up}">{escape(part)}</a>')
    # The current page is shown as plain text, not a link.
    crumbs.append(escape(parts[-1]))

    return f'<div class="breadcrumb">{" / ".join(crumbs)}</div>'
218+
219+
220+
def fix_internal_links(content, home_path):
    """Convert site-absolute wiki links to paths relative to the page.

    ``href="/page"`` becomes ``href="{home_path}page"``. The bare root link
    ``href="/"`` (optionally followed by a fragment or query) and paths that
    start with a digit are rewritten too. Protocol-relative links
    (``//host/...``) and paths starting with any other character, such as
    ``/_media/...``, are left untouched.

    Args:
        content: HTML fragment to rewrite.
        home_path: Relative URL prefix leading to the archive root.

    Returns:
        The rewritten HTML fragment.
    """
    # The lookahead decides whether to rewrite without consuming the next
    # character. A callable replacement keeps home_path literal (no
    # backslash or group-reference processing).
    return re.sub(
        r'href="/(?=[a-z0-9"#?])',
        lambda _m: f'href="{home_path}',
        content,
    )
228+
229+
230+
def save_page(output_dir, page_path, html):
    """Process one fetched wiki page and write it as static HTML.

    The page ``'main'`` is written to ``output_dir/index.html``; every other
    page is written to ``output_dir/<page_path>/index.html``, creating
    directories as needed.

    Args:
        output_dir: ``pathlib.Path`` of the archive root.
        page_path: Slash-separated wiki path of the page.
        html: Full HTML source of the page as fetched from the wiki.

    Returns:
        None. A page whose path is empty or contains a ``..`` segment is
        skipped with a printed message instead of being written.
    """
    if page_path == 'main':
        home_path = "./"
        out_path = output_dir / 'index.html'
    else:
        # Page paths come from scraped links. Empty and "." segments are
        # dropped so the depth matches the directory actually written, and a
        # leading "/" can no longer make the join below discard output_dir.
        segments = [s for s in page_path.split('/') if s not in ('', '.')]
        if not segments or '..' in segments:
            print(f" Skipping unsafe page path: {page_path!r}")
            return
        page_path = '/'.join(segments)
        # One "../" per directory level between the page and the root.
        home_path = "../" * len(segments)
        out_dir = output_dir.joinpath(*segments)
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / 'index.html'

    title, content = extract_content(html, page_path)
    breadcrumb = make_breadcrumb(page_path, home_path)
    content = fix_internal_links(content, home_path)

    output = PAGE_TEMPLATE.format(
        title=title,
        content=content,
        breadcrumb=breadcrumb,
        home_path=home_path,
    )

    out_path.write_text(output, encoding='utf-8')
257+
258+
259+
def main():
    """Crawl the wiki and write the static archive.

    The output directory is taken from the first command-line argument and
    defaults to the current directory; it is created if missing. Every
    discovered page is fetched with a ``DELAY``-second pause before each
    request and saved. Pages that could not be fetched are listed at the
    end.
    """
    output_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    output_dir.mkdir(parents=True, exist_ok=True)

    pages = list(get_all_pages())

    # Always include 'main' as the homepage. This happens before the count is
    # printed so the total matches the [i/N] progress lines below.
    if 'main' not in pages:
        pages.insert(0, 'main')

    total = len(pages)
    print(f"\nFound {total} pages. Starting download...\n")

    failed = []
    for i, page in enumerate(pages, start=1):
        print(f"[{i}/{total}] {page}")
        time.sleep(DELAY)

        html = fetch(f"{BASE_URL}/{page}")
        if html:
            save_page(output_dir, page, html)
        else:
            failed.append(page)

    if failed:
        print(f"\n{len(failed)} page(s) could not be fetched: {', '.join(failed)}")

    print(f"\nDone! Static site written to {output_dir}")
    print(f"Preview with: python3 -m http.server -d {output_dir}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)