-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_csm.py
More file actions
38 lines (29 loc) · 1.11 KB
/
scrape_csm.py
File metadata and controls
38 lines (29 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
import requests
import time
def scrape_csm(
    base_url="https://csm.mml.ox.ac.uk/index.php?p=poemdata_view&rec=",
    output_dir="DATABASE CSM",
    first=1,
    last=427,
    delay=0.5,
    timeout=30,
):
    """Download CSM record pages and save each one as an HTML file.

    For every record number from ``first`` to ``last`` (inclusive) the page at
    ``f"{base_url}{i}"`` is fetched and written to
    ``<output_dir>/rec_<i>.html``. Records whose file already exists are
    skipped, so an interrupted run can simply be restarted.

    Args:
        base_url: URL prefix to which the record number is appended.
        output_dir: Directory that receives the HTML files; created if missing.
        first: First record number to fetch.
        last: Last record number to fetch (inclusive). The defaults cover the
            range 1 to 427, which the original script marks as validated.
        delay: Seconds to sleep after every request attempt.
        timeout: Seconds to wait for the server before giving up on a request.

    Raises:
        OSError: If the output directory or a record file cannot be written.

    Request failures (HTTP errors, connection errors, timeouts) are reported
    on stdout and do not stop the run; the affected record is retried on the
    next invocation because no file is left behind for it.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for i in range(first, last + 1):
        filename = os.path.join(output_dir, f"rec_{i}.html")
        if os.path.exists(filename):
            print(f"Record {i} already exists. Skipping.")
            continue

        url = f"{base_url}{i}"
        print(f"Scraping record {i}...", end="", flush=True)
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"\nError scraping {url}: {e}")
        else:
            # Write to a side file and rename it into place, so an interrupted
            # write never leaves a truncated rec_<i>.html that later runs
            # would mistake for a finished download and skip.
            partial = f"{filename}.part"
            with open(partial, "w", encoding="utf-8") as f:
                f.write(response.text)
            os.replace(partial, filename)
            print(f" Done. Saved to {filename}")
        finally:
            # Be polite to the server, after failed attempts as well.
            time.sleep(delay)
# Script entry point: start the scrape only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    scrape_csm()