# Source: onkelos_booklet/scraper.py at main · minchaminder/onkelos_booklet · GitHub
# -*- coding: utf-8 -*-
"""
Fetch Targum Onkelos (Taj edition) from Hebrew Wikisource and split by parsha and perek.
"""
import re
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup

from parsha_list import BOOKS, PARSHIYOT, perek_he

WIKISOURCE_API = "https://he.wikisource.org/w/api.php"
EDITION_PREFIX = "תרגום_אונקלוס_(תאג')"


def _page_title(book_slug: str, parsha_slug: str) -> str:
    """Build the page title for one parsha: edition prefix / book / parsha."""
    segments = (EDITION_PREFIX, f"ספר_{book_slug}", f"פרשת_{parsha_slug}")
    return "/".join(segments)


def fetch_parsha_html(book_slug: str, parsha_slug: str) -> str | None:
    """Fetch the parsed HTML of one parsha page through the Wikisource API.

    Args:
        book_slug: Book part of the page title, as used by ``_page_title``.
        parsha_slug: Parsha part of the page title, as used by ``_page_title``.

    Returns:
        The HTML string found at ``parse.text.*`` in the JSON reply, or
        ``None`` when the request fails or the reply carries no parsed text.
        Every failure is reported on stdout; nothing is raised to the caller.
    """
    title = _page_title(book_slug, parsha_slug)
    params = {
        "action": "parse",
        "page": title,
        "prop": "text",
        "format": "json",
        "origin": "*",
    }
    headers = {
        "User-Agent": "OnkelosBooklet/1.0 (https://github.com/kashrus-library; educational PDF booklet)",
    }
    try:
        response = requests.get(WIKISOURCE_API, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        # Deliberately broad: fetching is best-effort, so a single bad page
        # (network error, HTTP error, invalid JSON) must not abort a full run.
        print(f"Error fetching {title}: {e}")
        return None

    try:
        return data["parse"]["text"]["*"]
    except (KeyError, TypeError):
        # No parsed text in the reply. The API is expected to describe the
        # failure (e.g. a missing page) in an "error" object; surface it so
        # the skipped parsha is visible instead of vanishing silently.
        error = data.get("error") if isinstance(data, dict) else None
        detail = (error.get("info") or error.get("code")) if isinstance(error, dict) else None
        print(f"Error fetching {title}: {detail or 'response contains no parsed text'}")
        return None


def _split_by_perakim(html: str) -> list[tuple[str, str]]:
    """
    Cut the page HTML into perakim at every "פרק X" marker.

    Returns a list of (perek_he, content_html) pairs, where perek_he is the
    Hebrew label captured after the marker (e.g. 'א', 'ב') and content_html is
    the stripped markup up to the next marker. Anything before the first
    marker is discarded, and perakim whose content is empty are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Drop anchors whose href looks like an edit link before serializing back.
    edit_href = re.compile(r"edit|עריכה")
    for anchor in soup.find_all("a", href=edit_href):
        anchor.decompose()

    # Splitting on a pattern with one capture group interleaves the result:
    # [intro, label_1, content_1, label_2, content_2, ...].
    pieces = re.split(r"פרק\s+([א-ת׳״]+)", str(soup), flags=re.UNICODE)
    labels = pieces[1::2]
    bodies = pieces[2::2]
    return [
        (label.strip(), trimmed)
        for label, body in zip(labels, bodies)
        if (trimmed := body.strip())
    ]


def _clean_perek_content(html_fragment: str) -> str:
    """Remove leading junk from previous heading (e.g. </a></h3><span...)."""
    # Strip leading closing tags and edit-section spans until we hit a <p> or text
    text = html_fragment.strip()
    while text and (text.startswith("</") or text.startswith("<span") or text.startswith("<div")):
        if text.startswith("</"):
            end = text.find(">", 2) + 1
            if end > 0:
                text = text[end:].strip()
            else:
                break
        elif text.startswith("<span"):
            end = text.find("</span>") + 7
            if end > 6:
                text = text[end:].strip()
            else:
                break
        else:
            end = text.find(">", 4) + 1
            if end > 0:
                text = text[end:].strip()
            else:
                break
    return text


def scrape_parsha(book_he: str, book_slug: str, parsha_he: str, parsha_slug: str):
    """
    Scrape one parsha page and yield one dict per perek found on it.

    Each dict has the keys: book_he, book_slug, parsha_he, parsha_slug,
    perek_num, perek_he, content, next_perek_he.
    perek_num is the 1-based position of the perek on the page;
    next_perek_he is "פרק <label>" of the following perek, or None for the
    last one. Yields nothing when the page could not be fetched.
    """
    page_html = fetch_parsha_html(book_slug, parsha_slug)
    if not page_html:
        return

    sections = _split_by_perakim(page_html)
    last_position = len(sections) - 1
    for position, (label, raw_content) in enumerate(sections):
        if position < last_position:
            upcoming = f"פרק {sections[position + 1][0]}"
        else:
            upcoming = None
        yield {
            "book_he": book_he,
            "book_slug": book_slug,
            "parsha_he": parsha_he,
            "parsha_slug": parsha_slug,
            "perek_num": position + 1,
            "perek_he": label,
            "content": _clean_perek_content(raw_content),
            "next_perek_he": upcoming,
        }


def scrape_all(limit_parshiyot: int | None = None):
    """
    Walk every book in BOOKS and every parsha listed for it in PARSHIYOT,
    yielding the same dicts as scrape_parsha, in order.

    limit_parshiyot: when not None, stop after this many parshiyot have been
    fetched (useful for testing).
    """
    fetched = 0
    for book in BOOKS:
        slug = book["slug"]
        name_he = book["he"]
        for parsha_he, parsha_slug in PARSHIYOT.get(slug, []):
            limit_reached = limit_parshiyot is not None and fetched >= limit_parshiyot
            if limit_reached:
                return
            fetched += 1
            print(f"Fetching {name_he} / {parsha_he} ...")
            yield from scrape_parsha(name_he, slug, parsha_he, parsha_slug)
            # Pause between pages to be nice to the server.
            time.sleep(0.5)


def fetch_data(limit_parshiyot: int | None = None):
    """
    Run the full scrape eagerly and return its results as a list of blocks
    for the template.

    Each block is a dict as produced by scrape_all (book_he, parsha_he,
    perek_num, perek_he, content, next_perek_he, plus the slugs).
    limit_parshiyot is passed straight through to scrape_all.
    """
    blocks = scrape_all(limit_parshiyot=limit_parshiyot)
    return [*blocks]


if __name__ == "__main__":
    # Smoke test: fetch only the first parsha and summarize up to three perakim.
    blocks = fetch_data(limit_parshiyot=1)
    print(f"Got {len(blocks)} perakim")
    for block in blocks[:3]:
        print(
            block["parsha_he"],
            block["perek_he"],
            "next:",
            block.get("next_perek_he"),
            "content len:",
            len(block["content"]),
        )