Skip to content

Commit 7fd7197

Browse files
committed
-moved fetch to be per-item, rather than fetching all articles up front
1 parent 93561b1 commit 7fd7197

File tree

3 files changed

+128
-49
lines changed

3 files changed

+128
-49
lines changed

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
install:
22
pip install -r requirements.txt
33

4+
install-scispacy:
5+
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
6+
47
start:
58
python ./web/manage.py runserver
69

web/slurper/keyword_util.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,22 @@
11
import spacy
22

33
# TODO SST: Move to README.md
4-
# TODO SST: Also it should be lazy-loaded
54
# Load the scientific English model from scispacy
65
# Note: You need to download this model first with:
7-
# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
8-
nlp = spacy.load("en_core_sci_lg")
6+
# make install-scispacy
7+
# Or directly:
8+
# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
9+
10+
# Lazy-loaded spaCy model
11+
_nlp = None
12+
13+
14+
def _get_nlp():
15+
"""Lazy-load the spaCy model only when needed."""
16+
global _nlp
17+
if _nlp is None:
18+
_nlp = spacy.load("en_core_sci_lg")
19+
return _nlp
920

1021

1122
def extract_keywords(text):
@@ -21,5 +32,6 @@ def extract_keywords(text):
2132
if not text:
2233
return []
2334

35+
nlp = _get_nlp()
2436
doc = nlp(text)
2537
return doc.ents

web/slurper/source_wikidata.py

Lines changed: 110 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,23 @@
11
import logging
2+
import time
3+
import urllib.parse
24

35
import requests
46
from concepts.models import Item
57
from django.db.utils import IntegrityError
68
from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem
79

810

11+
# Wikipedia API contact email (required by Wikipedia API guidelines)
12+
# Set to None to disable Wikipedia article fetching
13+
WIKIPEDIA_CONTACT_EMAIL = None
14+
915
# Wikidata entities to exclude from queries (natural numbers and positive integers)
10-
# TODO SST: Ask Katja: whether to add all found
11-
# 1. Should I put all found? Most likely yes
12-
# 2. Use categorization results to exclude them in further uses
1316
KNOWN_EXCLUDED_CATEGORIES = ["wd:Q21199", "wd:Q28920044"]
1417

18+
# Flag to track if we've logged the missing email warning
19+
_missing_email_logged = False
20+
1521

1622
# These are added to every query:
1723
# - Optional image: Fetches image if available
@@ -73,7 +79,6 @@ def __init__(self, source, query, limit=None):
7379
+ (f"LIMIT {limit}" if limit is not None else "")
7480
)
7581
self.raw_data = self.fetch_json()
76-
self.article_text = self.fetch_articles()
7782

7883

7984
def _sparql_source_vars_select(self):
@@ -99,75 +104,134 @@ def fetch_json(self):
99104
)
100105
return response.json()["results"]["bindings"]
101106

102-
def fetch_articles(self):
103-
"""Fetch Wikipedia article text for items with wp_en links."""
104-
article_texts = {}
107+
def fetch_article(self, json_item, index=None, total=None):
108+
global _missing_email_logged
105109

106-
for json_item in self.raw_data:
107-
# Only fetch if Wikipedia link exists
108-
if "wp_en" not in json_item:
109-
continue
110-
111-
wp_url = json_item["wp_en"]["value"]
112-
article_title = wp_url.split("/wiki/")[-1]
113-
114-
api_url = "https://en.wikipedia.org/w/api.php"
115-
params = {
116-
"action": "query",
117-
"format": "json",
118-
"titles": article_title,
119-
"prop": "extracts",
120-
"explaintext": True,
121-
"exsectionformat": "plain",
122-
}
123-
headers = {
124-
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
125-
"Accept": "application/json",
126-
"Accept-Language": "en-US,en;q=0.9",
127-
}
110+
# Check if contact email is configured
111+
if WIKIPEDIA_CONTACT_EMAIL is None:
112+
if not _missing_email_logged:
113+
logging.log(
114+
logging.WARNING,
115+
"WIKIPEDIA_CONTACT_EMAIL is not set. Wikipedia article fetching is disabled. "
116+
"Please set WIKIPEDIA_CONTACT_EMAIL at the top of source_wikidata.py to enable article fetching.",
117+
)
118+
_missing_email_logged = True
119+
return None
120+
121+
wp_url = json_item["wp_en"]["value"]
122+
# Decode URL-encoded characters (e.g., %E2%80%93 becomes –)
123+
article_title = urllib.parse.unquote(wp_url.split("/wiki/")[-1])
128124

125+
if index is not None and total is not None:
126+
logging.log(
127+
logging.INFO,
128+
f"Fetching Wikipedia article [{index}/{total}]: {article_title}",
129+
)
130+
else:
131+
logging.log(
132+
logging.INFO,
133+
f"Fetching Wikipedia article: {article_title}",
134+
)
135+
api_url = "https://en.wikipedia.org/w/api.php"
136+
params = {
137+
"action": "query",
138+
"format": "json",
139+
"titles": article_title,
140+
"prop": "extracts",
141+
"explaintext": True,
142+
"exsectionformat": "plain",
143+
}
144+
headers = {
145+
"User-Agent": f"MathSwitch/1.0 ({WIKIPEDIA_CONTACT_EMAIL})",
146+
"Accept": "application/json",
147+
"Accept-Language": "en-US,en;q=0.9",
148+
}
149+
# Retry logic with exponential backoff
150+
max_retries = 3
151+
retry_delay = 1 # Start with 1 second
152+
success = False
153+
for attempt in range(max_retries):
129154
try:
130-
response = requests.get(api_url, params=params, headers=headers)
155+
# Rate limiting: delay between requests (100 req/s max)
156+
time.sleep(0.01)
157+
158+
# Timeout: (connect_timeout, read_timeout) in seconds
159+
response = requests.get(api_url, params=params, headers=headers, timeout=(5, 30))
160+
161+
# Handle rate limiting
162+
if response.status_code in (429, 403):
163+
if attempt < max_retries - 1:
164+
logging.log(
165+
logging.WARNING,
166+
f"Rate limited for {article_title}, retrying in {retry_delay}s (attempt {attempt + 1}/{max_retries})",
167+
)
168+
time.sleep(retry_delay)
169+
retry_delay *= 2 # Exponential backoff
170+
continue
171+
else:
172+
logging.log(
173+
logging.ERROR,
174+
f"Failed to fetch {article_title} after {max_retries} attempts (rate limited). Skipping article.",
175+
)
176+
break
177+
131178
response.raise_for_status()
132179

133180
if not response.text:
134181
logging.log(
135182
logging.WARNING,
136-
f"Empty response for Wikipedia article: {article_title}",
183+
f"Empty response for Wikipedia article: {article_title}. Skipping article.",
137184
)
138-
continue
185+
break
139186

140187
data = response.json()
141188
pages = data.get("query", {}).get("pages", {})
142189

143190
# Get the first (and only) page
144191
for page_id, page_data in pages.items():
145192
if "extract" in page_data:
146-
# Use Wikidata ID as key
147-
wd_id = json_item["item"]["value"]
148-
article_texts[wd_id] = page_data["extract"]
149-
break
150-
except Exception as e:
151-
logging.log(
152-
logging.WARNING,
153-
f"Failed to fetch Wikipedia article for {article_title}: {e}",
154-
)
193+
success = True
194+
return page_data["extract"]
155195

156-
return article_texts
196+
# Success, break retry loop
197+
break
198+
199+
except requests.exceptions.RequestException as e:
200+
if attempt < max_retries - 1:
201+
logging.log(
202+
logging.WARNING,
203+
f"Request failed for {article_title}: {e}, retrying in {retry_delay}s",
204+
)
205+
time.sleep(retry_delay)
206+
retry_delay *= 2
207+
else:
208+
logging.log(
209+
logging.ERROR,
210+
f"Failed to fetch {article_title} after {max_retries} attempts: {e}. Skipping article.",
211+
)
212+
if not success and "wp_en" in json_item:
213+
logging.log(
214+
logging.INFO,
215+
f"Article {article_title} will have null value (fetch failed or empty)",
216+
)
217+
218+
return None
157219

158220
def get_items(self):
159221
for json_item in self.raw_data:
160-
wd_id = json_item["item"]["value"]
161-
if wd_id in self.article_text:
162-
json_item["article_text"] = {"value": self.article_text[wd_id]}
163-
164222
raw_item = BaseWdRawItem.raw_item(self.source, json_item)
165223
yield raw_item.to_item()
166224
if self.source != Item.Source.WIKIDATA:
167225
raw_item_wd = raw_item.switch_source_to(Item.Source.WIKIDATA)
168226
if not raw_item_wd.item_exists():
169227
yield raw_item_wd.to_item()
170228
if raw_item.has_source(Item.Source.WIKIPEDIA_EN):
229+
# Fetch Wikipedia article if available
230+
if "wp_en" in json_item and "article_text" not in json_item:
231+
article_text = self.fetch_article(json_item)
232+
if article_text is not None:
233+
json_item["article_text"] = {"value": article_text}
234+
171235
raw_item_wp_en = raw_item.switch_source_to(Item.Source.WIKIPEDIA_EN)
172236
if not raw_item_wp_en.item_exists():
173237
yield raw_item_wp_en.to_item()

0 commit comments

Comments
 (0)