77from django .db .utils import IntegrityError
88from slurper .wd_raw_item import WD_OTHER_SOURCES , BaseWdRawItem
99
10-
1110# Wikipedia API contact email (required by Wikipedia API guidelines)
1211# Set to None to disable Wikipedia article fetching
1312WIKIPEDIA_CONTACT_EMAIL = None
2625# - Excludes humans (FILTER NOT EXISTS)
2726# - Label service: Automatically fetches English labels and descriptions
2827#
29- # The class fetches mathematical concepts from Wikidata while filtering out unwanted items like people and natural numbers.
28+ # The class fetches mathematical concepts from Wikidata while
29+ # filtering out unwanted items like people and natural numbers.
30+
3031
3132class WikidataSlurper :
3233 SPARQL_URL = "https://query.wikidata.org/sparql"
3334
34- SPARQL_QUERY_OPTIONS = """
35+ SPARQL_QUERY_OPTIONS = (
36+ """
3537 OPTIONAL
3638 { ?item wdt:P18 ?image . }
3739 OPTIONAL
@@ -44,7 +46,9 @@ class WikidataSlurper:
4446 { ?item skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en") }
4547 # except for natural numbers and positive integers
4648 FILTER NOT EXISTS {
47- VALUES ?excludedType { """ + " " .join (KNOWN_EXCLUDED_CATEGORIES ) + """ }
49+ VALUES ?excludedType { """
50+ + " " .join (KNOWN_EXCLUDED_CATEGORIES )
51+ + """ }
4852 ?item wdt:P31 ?excludedType .
4953 }
5054 # except for humans
@@ -53,6 +57,7 @@ class WikidataSlurper:
5357 SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
5458}
5559"""
60+ )
5661
5762 def __init__ (self , source , query , limit = None ):
5863 self .source = source
@@ -71,16 +76,13 @@ def __init__(self, source, query, limit=None):
7176 + self .SPARQL_QUERY_OPTIONS
7277 + """
7378GROUP BY ?item ?itemLabel ?itemDescription ?image ?wp_en """
74- + " " .join (
75- [f"?{ src ['json_key' ]} " for src in WD_OTHER_SOURCES .values ()]
76- )
79+ + " " .join ([f"?{ src ['json_key' ]} " for src in WD_OTHER_SOURCES .values ()])
7780 + """
7881"""
7982 + (f"LIMIT { limit } " if limit is not None else "" )
8083 )
8184 self .raw_data = self .fetch_json ()
8285
83-
8486 def _sparql_source_vars_select (self ):
8587 def to_var (source_dict ):
8688 return " ?" + source_dict ["json_key" ]
@@ -112,8 +114,10 @@ def fetch_article(self, json_item, index=None, total=None):
112114 if not _missing_email_logged :
113115 logging .log (
114116 logging .WARNING ,
115- "WIKIPEDIA_CONTACT_EMAIL is not set. Wikipedia article fetching is disabled. "
116- "Please set WIKIPEDIA_CONTACT_EMAIL at the top of source_wikidata.py to enable article fetching." ,
117+ "WIKIPEDIA_CONTACT_EMAIL is not set. "
118+ "Wikipedia article fetching is disabled. "
119+ "Please set WIKIPEDIA_CONTACT_EMAIL at the top of "
120+ "source_wikidata.py to enable article fetching." ,
117121 )
118122 _missing_email_logged = True
119123 return None
@@ -156,22 +160,26 @@ def fetch_article(self, json_item, index=None, total=None):
156160 time .sleep (0.01 )
157161
158162 # Timeout: (connect_timeout, read_timeout) in seconds
159- response = requests .get (api_url , params = params , headers = headers , timeout = (5 , 30 ))
163+ response = requests .get (
164+ api_url , params = params , headers = headers , timeout = (5 , 30 )
165+ )
160166
161167 # Handle rate limiting
162168 if response .status_code in (429 , 403 ):
163169 if attempt < max_retries - 1 :
164170 logging .log (
165171 logging .WARNING ,
166- f"Rate limited for { article_title } , retrying in { retry_delay } s (attempt { attempt + 1 } /{ max_retries } )" ,
172+ f"Rate limited for { article_title } , retrying in "
173+ f"{ retry_delay } s (attempt { attempt + 1 } /{ max_retries } )" ,
167174 )
168175 time .sleep (retry_delay )
169176 retry_delay *= 2 # Exponential backoff
170177 continue
171178 else :
172179 logging .log (
173180 logging .ERROR ,
174- f"Failed to fetch { article_title } after { max_retries } attempts (rate limited). Skipping article." ,
181+ f"Failed to fetch { article_title } after "
182+ f"{ max_retries } attempts (rate limited). Skipping article." ,
175183 )
176184 break
177185
@@ -180,7 +188,8 @@ def fetch_article(self, json_item, index=None, total=None):
180188 if not response .text :
181189 logging .log (
182190 logging .WARNING ,
183- f"Empty response for Wikipedia article: { article_title } . Skipping article." ,
191+ f"Empty response for Wikipedia article: "
192+ f"{ article_title } . Skipping article." ,
184193 )
185194 break
186195
@@ -200,14 +209,16 @@ def fetch_article(self, json_item, index=None, total=None):
200209 if attempt < max_retries - 1 :
201210 logging .log (
202211 logging .WARNING ,
203- f"Request failed for { article_title } : { e } , retrying in { retry_delay } s" ,
212+ f"Request failed for { article_title } : "
213+ f"{ e } , retrying in { retry_delay } s" ,
204214 )
205215 time .sleep (retry_delay )
206216 retry_delay *= 2
207217 else :
208218 logging .log (
209219 logging .ERROR ,
210- f"Failed to fetch { article_title } after { max_retries } attempts: { e } . Skipping article." ,
220+ f"Failed to fetch { article_title } "
221+ f" after { max_retries } attempts: { e } . Skipping article." ,
211222 )
212223 if not success and "wp_en" in json_item :
213224 logging .log (
0 commit comments