Skip to content

Commit 4a1517e

Browse files
Lint with ruff
1 parent 2ed4a1a commit 4a1517e

13 files changed

+489
-242
lines changed

extract/classify_document_types.py

Lines changed: 98 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,59 @@
1919

2020
# Filter out common non-language words and short words
2121
NON_LANGUAGE_WORDS = {
22-
"pdf", "download", "file", "document", "report", "mb", "kb", "gb",
23-
"summary", "full", "main", "background", "note", "overview",
24-
"the", "and", "or", "for", "in", "with", "of", "a", "an", "is", "are",
25-
"executive", "technical", "appendix", "annex", "chapter", "section",
22+
"pdf",
23+
"download",
24+
"file",
25+
"document",
26+
"report",
27+
"mb",
28+
"kb",
29+
"gb",
30+
"summary",
31+
"full",
32+
"main",
33+
"background",
34+
"note",
35+
"overview",
36+
"the",
37+
"and",
38+
"or",
39+
"for",
40+
"in",
41+
"with",
42+
"of",
43+
"a",
44+
"an",
45+
"is",
46+
"are",
47+
"executive",
48+
"technical",
49+
"appendix",
50+
"annex",
51+
"chapter",
52+
"section",
2653
# Add common English words that were causing false positives
27-
"climate", "change", "economic", "damage", "environmental", "risks",
28-
"financial", "private", "sector", "forestry", "agroforestry",
29-
"assessment", "groundwater", "irrigation", "indicative", "total",
30-
"development", "financing", "needs", "estimating", "country"
54+
"climate",
55+
"change",
56+
"economic",
57+
"damage",
58+
"environmental",
59+
"risks",
60+
"financial",
61+
"private",
62+
"sector",
63+
"forestry",
64+
"agroforestry",
65+
"assessment",
66+
"groundwater",
67+
"irrigation",
68+
"indicative",
69+
"total",
70+
"development",
71+
"financing",
72+
"needs",
73+
"estimating",
74+
"country",
3175
}
3276

3377
NON_PDF_INDICATORS = [" text "]
@@ -45,19 +89,21 @@
4589
"brief",
4690
]
4791

92+
4893
class DownloadLinkWithClassification(DownloadLinkWithFileInfo):
4994
classification: DocumentType
5095
language_detected: Optional[str]
5196
reasoning: str
5297

98+
5399
class PublicationDetailsWithClassification(PublicationDetailsBase):
54100
download_links: List[DownloadLinkWithClassification]
55101

56102

57103
def detect_language_in_text(text: str) -> Optional[str]:
58104
"""
59105
Detect if text contains explicit non-English language names.
60-
106+
61107
This function looks for actual language names in English using langcodes.find(),
62108
which is much more precise than fuzzy matching language codes.
63109
@@ -66,19 +112,21 @@ def detect_language_in_text(text: str) -> Optional[str]:
66112
"""
67113
# Extract words, removing punctuation and size info
68114
words = re.findall(r"\b[a-zA-Z]+\b", text.lower())
69-
70-
filtered_words = [word for word in words if word not in NON_LANGUAGE_WORDS and len(word) > 2]
71-
115+
116+
filtered_words = [
117+
word for word in words if word not in NON_LANGUAGE_WORDS and len(word) > 2
118+
]
119+
72120
for word in filtered_words:
73121
try:
74122
# Try to find the word as a language name in English
75-
lang = langcodes.find(word, language='en')
76-
if lang and lang.language != 'en': # Not English
123+
lang = langcodes.find(word, language="en")
124+
if lang and lang.language != "en": # Not English
77125
return lang.language
78126
except LookupError:
79127
# Word is not a language name, continue checking other words
80128
continue
81-
129+
82130
return None
83131

84132

@@ -98,12 +146,14 @@ def classify_download_link(
98146
text_lower = input.text.lower()
99147

100148
# Detect language and set default to English
101-
detected_lang = detect_language_in_text(input.text) or 'en'
149+
detected_lang = detect_language_in_text(input.text) or "en"
102150
is_pdf = not any(indicator in text_lower for indicator in NON_PDF_INDICATORS)
103151

104-
if detected_lang != 'en' or not is_pdf:
152+
if detected_lang != "en" or not is_pdf:
105153
if verbose:
106-
print(f"Skipping link: {input.text} (language: {detected_lang}, PDF: {is_pdf})")
154+
print(
155+
f"Skipping link: {input.text} (language: {detected_lang}, PDF: {is_pdf})"
156+
)
107157
return None
108158

109159
# Check for main report indicators
@@ -134,29 +184,33 @@ def classify_download_link(
134184
url=input.url,
135185
text=input.text,
136186
file_info=input.file_info,
137-
classification=DocumentType.MAIN if position == 0 else DocumentType.SUPPLEMENTAL,
187+
classification=DocumentType.MAIN
188+
if position == 0
189+
else DocumentType.SUPPLEMENTAL,
138190
language_detected=detected_lang,
139191
reasoning=(
140192
"First English PDF (assumed main report)"
141-
if position == 0 else
142-
f"English PDF in position {position + 1} (assumed supplementary)"
143-
)
193+
if position == 0
194+
else f"English PDF in position {position + 1} (assumed supplementary)"
195+
),
144196
)
145197

146198
# Default case - no explicit language or PDF specified
147199
else:
148200
result = DownloadLinkWithClassification(
149-
url=input.url,
150-
text=input.text,
151-
file_info=input.file_info,
152-
classification=DocumentType.MAIN if position == 0 else DocumentType.SUPPLEMENTAL,
153-
language_detected=detected_lang,
154-
reasoning=(
155-
"First document with no language specified (assumed main English report)"
156-
if position == 0 else
157-
f"Document in position {position + 1} with no language specified (assumed supplementary)"
201+
url=input.url,
202+
text=input.text,
203+
file_info=input.file_info,
204+
classification=DocumentType.MAIN
205+
if position == 0
206+
else DocumentType.SUPPLEMENTAL,
207+
language_detected=detected_lang,
208+
reasoning=(
209+
"First document with no language specified (assumed main English report)"
210+
if position == 0
211+
else f"Document in position {position + 1} with no language specified (assumed supplementary)"
212+
),
158213
)
159-
)
160214

161215
if verbose:
162216
print(f"{input.text} -> {result.classification}")
@@ -199,7 +253,7 @@ def classify_download_links(
199253
if __name__ == "__main__":
200254
from extract.classify_mime_types import FileTypeInfo
201255
from pydantic import HttpUrl
202-
256+
203257
# Test the classification with some sample data including problematic cases
204258
test_links = [
205259
"English PDF (3.71 MB)",
@@ -217,17 +271,17 @@ def classify_download_links(
217271
"Spanish Document Overview (1.8 MB)",
218272
"Chinese Analysis Report (3.2 MB)",
219273
"Agriculture in Punjab (3.71 MB)",
220-
"Full Report (1.3 MB)"
274+
"Full Report (1.3 MB)",
221275
]
222276

223-
results = classify_download_links([
224-
DownloadLinkWithFileInfo(
225-
url=HttpUrl(f"https://localhost:8000/test{i+1}.pdf"),
226-
text=link,
227-
file_info=FileTypeInfo(
228-
mime_type="application/pdf",
229-
charset="utf-8"
277+
results = classify_download_links(
278+
[
279+
DownloadLinkWithFileInfo(
280+
url=HttpUrl(f"https://localhost:8000/test{i + 1}.pdf"),
281+
text=link,
282+
file_info=FileTypeInfo(mime_type="application/pdf", charset="utf-8"),
230283
)
231-
)
232-
for i, link in enumerate(test_links)
233-
], verbose=True)
284+
for i, link in enumerate(test_links)
285+
],
286+
verbose=True,
287+
)

extract/classify_mime_types.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,12 @@ def transform_worldbank_url(url: HttpUrl) -> HttpUrl:
115115
return url
116116

117117

118-
def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> DownloadLinkWithFileInfo:
118+
def get_file_type_from_url(
119+
download_link: DownloadLink, max_retries=3
120+
) -> DownloadLinkWithFileInfo:
119121
"""
120122
Get file type with retry logic for rate limiting.
121-
123+
122124
Raises:
123125
Exception: If unable to determine a valid MIME type after all retries.
124126
"""
@@ -139,12 +141,17 @@ def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> Downlo
139141

140142
# Make a GET request with stream=True to get headers and peek at content
141143
with requests.get(
142-
str(actual_url), stream=True, allow_redirects=True, headers=DEFAULT_HEADERS
144+
str(actual_url),
145+
stream=True,
146+
allow_redirects=True,
147+
headers=DEFAULT_HEADERS,
143148
) as response:
144149
# Check for rate limiting
145150
if response.status_code == 429:
146151
if attempt == max_retries:
147-
raise Exception(f"Rate limited (429) after {max_retries} attempts for {download_link.url}")
152+
raise Exception(
153+
f"Rate limited (429) after {max_retries} attempts for {download_link.url}"
154+
)
148155
wait_time = random.uniform(15, 30) # Longer wait for rate limiting
149156
print(
150157
f"Rate limited. Waiting {wait_time:.1f} seconds before retry..."
@@ -156,8 +163,6 @@ def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> Downlo
156163
content_type = response.headers.get("Content-Type", "unknown")
157164
parsed_header = parse_content_type(content_type)
158165

159-
160-
161166
# If we're still getting JSON content type or HTML, try to peek at actual content
162167
if (
163168
"json" in parsed_header["mime_type"]
@@ -186,11 +191,15 @@ def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> Downlo
186191
# Apply UTF-8 fallback if no charset was determined
187192
if not charset:
188193
charset = "utf-8"
189-
print(f"Warning: No charset detected for {download_link.url}, defaulting to UTF-8")
194+
print(
195+
f"Warning: No charset detected for {download_link.url}, defaulting to UTF-8"
196+
)
190197

191198
# Log warning if guessed type doesn't match actual MIME type
192199
if guessed_type and guessed_type != mime_type:
193-
print(f"Warning: Guessed type '{guessed_type}' doesn't match detected type '{mime_type}' for {download_link.url}")
200+
print(
201+
f"Warning: Guessed type '{guessed_type}' doesn't match detected type '{mime_type}' for {download_link.url}"
202+
)
194203

195204
result = FileTypeInfo(
196205
mime_type=mime_type,
@@ -200,30 +209,36 @@ def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> Downlo
200209
# If we got HTML when expecting PDF/text, consider it a failure
201210
if not is_valid_file_info(result.model_dump()):
202211
if attempt == max_retries:
203-
raise Exception(f"Failed to get valid file type for {download_link.url} after {max_retries} attempts - got {mime_type}")
212+
raise Exception(
213+
f"Failed to get valid file type for {download_link.url} after {max_retries} attempts - got {mime_type}"
214+
)
204215
print("Got unexpected file type, retrying...")
205216
continue
206217

207218
return DownloadLinkWithFileInfo(
208219
url=HttpUrl(actual_url) if STORE_FINAL_URL else download_link.url,
209220
text=download_link.text,
210-
file_info=result
221+
file_info=result,
211222
)
212223

213224
except Exception as e:
214225
print(f"Attempt {attempt}/{max_retries} failed: {str(e)}")
215226
if attempt == max_retries:
216-
raise Exception(f"Failed to determine MIME type for {download_link.url} after {max_retries} attempts: {str(e)}")
217-
227+
raise Exception(
228+
f"Failed to determine MIME type for {download_link.url} after {max_retries} attempts: {str(e)}"
229+
)
230+
218231
# This should never be reached, but satisfies the type checker
219232
raise Exception(f"Unexpected exit from retry loop for {download_link.url}")
220233

221234

222235
def main():
223236
# Create a sample DownloadLink
224237
download_link = DownloadLink(
225-
url=HttpUrl("https://openknowledge.worldbank.org/bitstreams/cf2a2b54-559b-5909-ada8-af36b21bd4da/download"),
226-
text="English PDF (18.05 MB)"
238+
url=HttpUrl(
239+
"https://openknowledge.worldbank.org/bitstreams/cf2a2b54-559b-5909-ada8-af36b21bd4da/download"
240+
),
241+
text="English PDF (18.05 MB)",
227242
)
228243

229244
dl_with_info = get_file_type_from_url(download_link)

extract/download_files.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,6 @@ def main():
209209

210210
# Download files marked for download
211211
for link in pub["downloadLinks"]:
212-
213212
if link.get("to_download", False):
214213
try:
215214
print(f"\nDownloading {link['text']} for publication {pub_id}")

0 commit comments

Comments
 (0)