# scrape_csm.py -- from the Gabrieljoseg/Cantigas-de-Santa-Maria repository (master branch) on GitHub.

import os
import requests
import time

def scrape_csm(
    base_url="https://csm.mml.ox.ac.uk/index.php?p=poemdata_view&rec=",
    output_dir="DATABASE CSM",
    first_record=1,
    last_record=427,
    delay=0.5,
    timeout=30,
):
    """Download poem-data record pages and save each one as an HTML file.

    Each record ``i`` in ``first_record..last_record`` (inclusive) is fetched
    from ``f"{base_url}{i}"`` and stored as ``rec_{i}.html`` inside
    ``output_dir``. Records whose file already exists are skipped, so an
    interrupted run can simply be restarted.

    Args:
        base_url: URL prefix to which the record number is appended.
        output_dir: Directory that receives the HTML files; created if missing.
        first_record: First record number to fetch.
        last_record: Last record number to fetch (inclusive). The default
            range 1..427 is the range validated by the original script.
        delay: Seconds to pause after every request attempt. Values <= 0
            disable the pause.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot hang the run indefinitely.

    Returns:
        None. Progress and request errors are reported on stdout; a failed
        request does not abort the remaining records.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(output_dir, exist_ok=True)

    for i in range(first_record, last_record + 1):
        filename = os.path.join(output_dir, f"rec_{i}.html")
        if os.path.exists(filename):
            print(f"Record {i} already exists. Skipping.")
            continue

        url = f"{base_url}{i}"
        print(f"Scraping record {i}...", end="", flush=True)
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"\nError scraping {url}: {e}")
        else:
            # Write to a side file and rename it into place: an interrupted
            # write must not leave a truncated rec_{i}.html behind, because
            # the existence check above would then skip it on every rerun.
            partial = f"{filename}.part"
            with open(partial, "w", encoding="utf-8") as f:
                f.write(response.text)
            os.replace(partial, filename)
            print(f" Done. Saved to {filename}")

        # Be polite to the server after every attempt, failed ones included.
        if delay > 0:
            time.sleep(delay)

# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    scrape_csm()