-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
160 lines (141 loc) · 5.48 KB
/
Copy pathscraper.py
File metadata and controls
160 lines (141 loc) · 5.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# -*- coding: utf-8 -*-
"""
Fetch Targum Onkelos (Taj edition) from Hebrew Wikisource and split by parsha and perek.
"""
import re
import time
import urllib.parse
import requests
from bs4 import BeautifulSoup
from parsha_list import BOOKS, PARSHIYOT, perek_he
# MediaWiki API endpoint of Hebrew Wikisource; every fetch in this module goes here.
WIKISOURCE_API = "https://he.wikisource.org/w/api.php"
# First path component of every page title built by _page_title(); it names
# the Targum Onkelos (Taj) edition mentioned in the module docstring.
EDITION_PREFIX = "תרגום_אונקלוס_(תאג')"
def _page_title(book_slug: str, parsha_slug: str) -> str:
    """Build the Wikisource page title for one parsha.

    The title has three slash-separated components: the edition prefix,
    the book component and the parsha component.
    """
    components = (
        EDITION_PREFIX,
        f"ספר_{book_slug}",
        f"פרשת_{parsha_slug}",
    )
    return "/".join(components)
def fetch_parsha_html(book_slug: str, parsha_slug: str) -> str | None:
    """Fetch the rendered HTML of one parsha page from the Wikisource API.

    Sends an ``action=parse`` request for the title built by ``_page_title``
    and returns the HTML string stored at ``parse -> text -> *`` in the JSON
    response.

    Args:
        book_slug: Book name as used in the Wikisource page title.
        parsha_slug: Parsha name as used in the Wikisource page title.

    Returns:
        The page HTML, or ``None`` when the request fails, the body is not
        valid JSON, or the response carries no parsed text (for example an
        API error payload). Every failure is reported on stdout so a skipped
        parsha never goes unnoticed.
    """
    title = _page_title(book_slug, parsha_slug)
    params = {
        "action": "parse",
        "page": title,
        "prop": "text",
        "format": "json",
        "origin": "*",
    }
    headers = {
        "User-Agent": "OnkelosBooklet/1.0 (https://github.com/kashrus-library; educational PDF booklet)",
    }

    # Transport / HTTP-status / JSON-decoding failures.
    try:
        response = requests.get(WIKISOURCE_API, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching {title}: {e}")
        return None

    # Payload-shape failures: the API replies with HTTP 200 even for errors,
    # in which case there is no "parse" entry to read.
    try:
        return data["parse"]["text"]["*"]
    except (KeyError, TypeError):
        reason = data.get("error") if isinstance(data, dict) else None
        print(f"Error fetching {title}: {reason or 'response contains no parsed text'}")
        return None
def _split_by_perakim(html: str) -> list[tuple[str, str]]:
    """
    Cut the page HTML into perakim at every "פרק <Hebrew letters>" marker.

    Returns a list of (perek_he, content_html) pairs, where perek_he is the
    Hebrew numeral captured from the marker (e.g. 'א', 'ב') and content_html
    is the markup that follows it up to the next marker. Pairs whose content
    is empty after stripping whitespace are left out; a page without any
    marker yields an empty list.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Drop section-edit links before serialising the tree back to text.
    edit_href = re.compile(r"edit|עריכה")
    for link in soup.find_all("a", href=edit_href):
        link.decompose()
    markup = str(soup)

    heading = re.compile(
        r"פרק\s+([א-ת׳״]+)", re.UNICODE
    )
    # With one capture group, split() returns
    # [intro, letter, content, letter, content, ...]; the intro is discarded.
    pieces = heading.split(markup)
    letters = (piece.strip() for piece in pieces[1::2])
    bodies = (piece.strip() for piece in pieces[2::2])
    return [(letter, body) for letter, body in zip(letters, bodies) if body]
def _clean_perek_content(html_fragment: str) -> str:
"""Remove leading junk from previous heading (e.g. </a></h3><span...)."""
# Strip leading closing tags and edit-section spans until we hit a <p> or text
text = html_fragment.strip()
while text and (text.startswith("</") or text.startswith("<span") or text.startswith("<div")):
if text.startswith("</"):
end = text.find(">", 2) + 1
if end > 0:
text = text[end:].strip()
else:
break
elif text.startswith("<span"):
end = text.find("</span>") + 7
if end > 6:
text = text[end:].strip()
else:
break
else:
end = text.find(">", 4) + 1
if end > 0:
text = text[end:].strip()
else:
break
return text
def scrape_parsha(book_he: str, book_slug: str, parsha_he: str, parsha_slug: str):
    """
    Fetch one parsha and generate one dict per perek found on its page.

    Each dict has the keys: book_he, book_slug, parsha_he, parsha_slug,
    perek_num (1-based position on the page), perek_he, content (cleaned
    HTML) and next_perek_he ("פרק X" for the following perek, or None for
    the last one). Nothing is yielded when the page could not be fetched.
    """
    page_html = fetch_parsha_html(book_slug, parsha_slug)
    if not page_html:
        return

    perakim = _split_by_perakim(page_html)
    count = len(perakim)
    for position, (letter, raw_content) in enumerate(perakim, start=1):
        # `position` is 1-based, so it is also the 0-based index of the
        # perek that follows the current one.
        if position < count:
            following = f"פרק {perakim[position][0]}"
        else:
            following = None
        yield {
            "book_he": book_he,
            "book_slug": book_slug,
            "parsha_he": parsha_he,
            "parsha_slug": parsha_slug,
            "perek_num": position,
            "perek_he": letter,
            "content": _clean_perek_content(raw_content),
            "next_perek_he": following,
        }
def scrape_all(limit_parshiyot: int | None = None):
    """
    Walk every book in BOOKS and every parsha listed for it in PARSHIYOT,
    yielding the dicts produced by scrape_parsha in that order.

    limit_parshiyot: when not None, stop once this many parshiyot have been
    fetched (handy for testing).
    """
    fetched = 0
    for book in BOOKS:
        slug = book["slug"]
        name_he = book["he"]
        for parsha_he, parsha_slug in PARSHIYOT.get(slug, []):
            if limit_parshiyot is not None and fetched >= limit_parshiyot:
                return
            fetched += 1
            print(f"Fetching {name_he} / {parsha_he} ...")
            yield from scrape_parsha(name_he, slug, parsha_he, parsha_slug)
            # Pause between pages to be nice to the server.
            time.sleep(0.5)
def fetch_data(limit_parshiyot: int | None = None):
    """
    Run the full scrape eagerly and return its results as a list.

    The list holds the dicts generated by scrape_all, in order;
    limit_parshiyot is passed through to it unchanged.
    """
    blocks = scrape_all(limit_parshiyot=limit_parshiyot)
    return list(blocks)
if __name__ == "__main__":
    # Smoke test: fetch only the first parsha and summarise up to three perakim.
    data = fetch_data(limit_parshiyot=1)
    print(f"Got {len(data)} perakim")
    for block in data[:3]:
        summary = (
            block["parsha_he"],
            block["perek_he"],
            "next:",
            block.get("next_perek_he"),
            "content len:",
            len(block["content"]),
        )
        print(*summary)