11import logging
2+ import time
3+ import urllib .parse
24
35import requests
46from concepts .models import Item
57from django .db .utils import IntegrityError
68from slurper .wd_raw_item import WD_OTHER_SOURCES , BaseWdRawItem
79
810
11+ # Wikipedia API contact email (required by Wikipedia API guidelines)
12+ # Set to None to disable Wikipedia article fetching
# fetch_article interpolates this value into the User-Agent header of every
# request it sends, and returns None without any network call while it is None.
13+ WIKIPEDIA_CONTACT_EMAIL = None
14+
915# Wikidata entities to exclude from queries (natural numbers and positive integers)
10- # TODO SST: Ask Katja: whether to add all found
11- # 1. Should I put all found? Most likely yes
12- # 2. Use categorization results to exclude them in further uses
1316KNOWN_EXCLUDED_CATEGORIES = ["wd:Q21199" , "wd:Q28920044" ]
1417
18+ # Flag to track if we've logged the missing email warning
# fetch_article flips this module-level flag through a `global` statement so
# the "email not set" warning is emitted once instead of once per article.
19+ _missing_email_logged = False
20+
1521
1622# These are added to every query:
1723# - Optional image: Fetches image if available
@@ -73,7 +79,6 @@ def __init__(self, source, query, limit=None):
7379 + (f"LIMIT { limit } " if limit is not None else "" )
7480 )
7581 self .raw_data = self .fetch_json ()
76- self .article_text = self .fetch_articles ()
7782
7883
7984 def _sparql_source_vars_select (self ):
@@ -99,75 +104,134 @@ def fetch_json(self):
99104 )
100105 return response .json ()["results" ]["bindings" ]
101106
102- def fetch_articles (self ):
103- """Fetch Wikipedia article text for items with wp_en links."""
104- article_texts = {}
def fetch_article(self, json_item, index=None, total=None):
    """Fetch the plain-text extract of an item's English Wikipedia article.

    Args:
        json_item: One SPARQL result binding (a dict). When it carries a
            ``wp_en`` entry, that entry's ``"value"`` is the article URL.
        index: Optional position of this article in a batch; only used for
            the progress log line, and only together with ``total``.
        total: Optional batch size; only used for the progress log line.

    Returns:
        The article extract as a string, or ``None`` when fetching is
        disabled (``WIKIPEDIA_CONTACT_EMAIL`` is ``None``), the item has no
        ``wp_en`` link, or no extract could be obtained after all retries.
    """
    global _missing_email_logged

    # Without a contact email no request is sent; warn once, then stay quiet.
    if WIKIPEDIA_CONTACT_EMAIL is None:
        if not _missing_email_logged:
            logging.warning(
                "WIKIPEDIA_CONTACT_EMAIL is not set. Wikipedia article fetching is disabled. "
                "Please set WIKIPEDIA_CONTACT_EMAIL at the top of source_wikidata.py to enable article fetching."
            )
            _missing_email_logged = True
        return None

    # Items without a Wikipedia link have nothing to fetch; return None
    # instead of raising KeyError so direct callers need no pre-check.
    wp_url = (json_item.get("wp_en") or {}).get("value")
    if not wp_url:
        return None

    # Decode URL-encoded characters (e.g., %E2%80%93 becomes –)
    article_title = urllib.parse.unquote(wp_url.split("/wiki/")[-1])

    if index is not None and total is not None:
        logging.info(
            "Fetching Wikipedia article [%s/%s]: %s", index, total, article_title
        )
    else:
        logging.info("Fetching Wikipedia article: %s", article_title)

    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": article_title,
        "prop": "extracts",
        "explaintext": True,
        "exsectionformat": "plain",
    }
    headers = {
        "User-Agent": f"MathSwitch/1.0 ({WIKIPEDIA_CONTACT_EMAIL})",
        "Accept": "application/json",
        "Accept-Language": "en-US,en;q=0.9",
    }

    # Retry logic with exponential backoff
    max_retries = 3
    retry_delay = 1  # Start with 1 second
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            # Rate limiting: delay between requests (100 req/s max)
            time.sleep(0.01)

            # Timeout: (connect_timeout, read_timeout) in seconds
            response = requests.get(
                api_url, params=params, headers=headers, timeout=(5, 30)
            )

            # Both status codes are treated as "slow down" signals.
            if response.status_code in (429, 403):
                if is_last_attempt:
                    logging.error(
                        "Failed to fetch %s after %s attempts (rate limited). Skipping article.",
                        article_title,
                        max_retries,
                    )
                    break
                logging.warning(
                    "Rate limited for %s, retrying in %ss (attempt %s/%s)",
                    article_title,
                    retry_delay,
                    attempt + 1,
                    max_retries,
                )
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff
                continue

            response.raise_for_status()

            if not response.text:
                logging.warning(
                    "Empty response for Wikipedia article: %s. Skipping article.",
                    article_title,
                )
                break

            pages = response.json().get("query", {}).get("pages", {})
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a malformed JSON body on requests versions
            # whose JSON decode error is not a RequestException subclass.
            if is_last_attempt:
                logging.error(
                    "Failed to fetch %s after %s attempts: %s. Skipping article.",
                    article_title,
                    max_retries,
                    e,
                )
            else:
                logging.warning(
                    "Request failed for %s: %s, retrying in %ss",
                    article_title,
                    e,
                    retry_delay,
                )
                time.sleep(retry_delay)
                retry_delay *= 2
            continue

        # A single title was requested, so the first page with an extract wins.
        for page_data in pages.values():
            if "extract" in page_data:
                return page_data["extract"]

        # Valid response without an extract: retrying would not help.
        break

    logging.info(
        "Article %s will have null value (fetch failed or empty)", article_title
    )
    return None
157219
# Generator: walks self.raw_data and yields the result of to_item() for each
# raw item built from a binding, plus variants switched to other sources.
158 def get_items (self ):
159221 for json_item in self .raw_data :
160- wd_id = json_item ["item" ]["value" ]
161- if wd_id in self .article_text :
162- json_item ["article_text" ] = {"value" : self .article_text [wd_id ]}
163-
# Build the raw item for the configured source and always yield it first.
164222 raw_item = BaseWdRawItem .raw_item (self .source , json_item )
165223 yield raw_item .to_item ()
# For non-Wikidata sources, also yield a Wikidata-sourced variant unless
# item_exists() reports it is already present.
166224 if self .source != Item .Source .WIKIDATA :
167225 raw_item_wd = raw_item .switch_source_to (Item .Source .WIKIDATA )
168226 if not raw_item_wd .item_exists ():
169227 yield raw_item_wd .to_item ()
# NOTE(review): indentation is lost in this view, so it is unclear whether
# this branch is nested under the Wikidata check above — confirm in the file.
170228 if raw_item .has_source (Item .Source .WIKIPEDIA_EN ):
229+ # Fetch Wikipedia article if available
# The article is fetched on demand here and stored on json_item under
# "article_text"; a None result (disabled or failed fetch) leaves it unset.
# NOTE(review): the fetch happens before the item_exists() check below, so
# it also runs for entries that turn out to exist already — confirm intended.
230+ if "wp_en" in json_item and "article_text" not in json_item :
231+ article_text = self .fetch_article (json_item )
232+ if article_text is not None :
233+ json_item ["article_text" ] = {"value" : article_text }
234+
171235 raw_item_wp_en = raw_item .switch_source_to (Item .Source .WIKIPEDIA_EN )
172236 if not raw_item_wp_en .item_exists ():
173237 yield raw_item_wp_en .to_item ()
0 commit comments