# dump_wiki_pages.py
# Downloads every page of the MediaWiki site at API_ENDPOINT into text files.
import requests
import os
# URL of the MediaWiki Action API endpoint queried by the request helpers below.
API_ENDPOINT = "https://docs.alliancecan.ca/mediawiki/api.php"
# Directory (relative to the current working directory) that receives one
# .txt file per downloaded page.
OUTPUT_DIR = "wiki_pages"
# Runs at import time: create the output directory up front; exist_ok=True
# makes this a no-op when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)
def get_all_pages(timeout=30):
    """Return the listing of every page reported by the wiki.

    Issues ``action=query&list=allpages`` requests against ``API_ENDPOINT``
    and keeps following the ``continue`` tokens the API hands back until no
    further batch is announced.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely on a
            stalled connection.

    Returns:
        A list of the ``allpages`` entries (dicts) exactly as returned by
        the API, accumulated across all batches.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        RuntimeError: If the API responds with an ``error`` payload instead
            of a query result.
    """
    pages = []
    params = {
        "action": "query",
        "list": "allpages",
        "format": "json",
        "aplimit": "max",
    }
    while True:
        response = requests.get(API_ENDPOINT, params=params, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        res = response.json()
        if "error" in res:
            raise RuntimeError(f"MediaWiki API error: {res['error']}")
        pages.extend(res.get("query", {}).get("allpages", []))
        if "continue" not in res:
            break
        # Merge the continuation tokens into the next request's parameters.
        params.update(res["continue"])
    return pages
def get_page_content(title, timeout=30):
    """Fetch the content of the revision the API returns for one page.

    Args:
        title: Page title, passed as the ``titles`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely on a
            stalled connection.

    Returns:
        The revision text as a string, or ``""`` when the response carries
        no revision content for the page.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "format": "json",
        "titles": title,
    }
    response = requests.get(API_ENDPOINT, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    res = response.json()
    pages = res.get("query", {}).get("pages", {})
    for data in pages.values():
        revisions = data.get("revisions")
        if not revisions:
            # Key missing or list empty: nothing to read from this entry.
            continue
        revision = revisions[0]
        # The text sits either directly under "*" or nested under
        # slots -> main -> "*"; accept both layouts.
        if "*" in revision:
            return revision["*"]
        return revision.get("slots", {}).get("main", {}).get("*", "")
    return ""
def save_page(title, content):
    """Write one page's text to ``<OUTPUT_DIR>/<title>.txt``.

    Args:
        title: Page title; every ``/`` is replaced by ``_`` so the title
            maps to a single file name rather than a nested path.
        content: Text to store in the file.

    Raises:
        OSError: If the file cannot be created or written.
    """
    safe_title = title.replace("/", "_")
    path = os.path.join(OUTPUT_DIR, f"{safe_title}.txt")
    # Pin the encoding: the platform default is not UTF-8 everywhere, and
    # wiki text routinely contains non-ASCII characters.
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
def main():
    """Download every listed page and store each one as a text file.

    Retrieves the page listing, then for each entry prints its title,
    fetches its content, and saves it, reporting progress on stdout.
    """
    print("🔍 Fetching page list...")
    page_entries = get_all_pages()
    total = len(page_entries)
    print(f"📄 Found {total} pages. Downloading...")
    for entry in page_entries:
        page_title = entry["title"]
        print(f"→ {page_title}")
        save_page(page_title, get_page_content(page_title))
    print(f"✅ Done. Saved to ./{OUTPUT_DIR}/")


# Run the dump only when executed as a script, not when imported.
if __name__ == "__main__":
    main()