1919
# Words that must never be looked up as language names. Membership in this
# set is checked by detect_language_in_text(), which additionally drops any
# word of two characters or fewer.
NON_LANGUAGE_WORDS = {
    # File-type and size vocabulary found in download link labels
    "pdf", "download", "file", "document", "report", "mb", "kb", "gb",
    # Generic document-part descriptors
    "summary", "full", "main", "background", "note", "overview",
    "executive", "technical", "appendix", "annex", "chapter", "section",
    # Common English function words
    "the", "and", "or", "for", "in", "with", "of", "a", "an", "is", "are",
    # Common English words that were causing false positives
    "climate", "change", "economic", "damage", "environmental", "risks",
    "financial", "private", "sector", "forestry", "agroforestry",
    "assessment", "groundwater", "irrigation", "indicative", "total",
    "development", "financing", "needs", "estimating", "country",
}
3276
3377NON_PDF_INDICATORS = [" text " ]
4589 "brief" ,
4690]
4791
92+
class DownloadLinkWithClassification(DownloadLinkWithFileInfo):
    """A download link extended with the outcome of classification.

    Inherits every field of ``DownloadLinkWithFileInfo`` and adds the
    three fields that ``classify_download_link`` fills in.
    """

    # Document role assigned to the link (e.g. DocumentType.MAIN or
    # DocumentType.SUPPLEMENTAL).
    classification: DocumentType
    # Language value recorded by the classifier; classify_download_link
    # stores "en" when no other language name was found in the link text.
    language_detected: Optional[str]
    # Human-readable explanation of why the classification was chosen.
    reasoning: str
5297
98+
class PublicationDetailsWithClassification(PublicationDetailsBase):
    """Publication details whose download links carry classification data.

    Inherits every field of ``PublicationDetailsBase`` and adds the list of
    classified links.
    """

    # One entry per download link, each including its classification,
    # language and reasoning fields.
    download_links: List[DownloadLinkWithClassification]
55101
56102
def detect_language_in_text(text: str) -> Optional[str]:
    """
    Report the first explicit non-English language name found in ``text``.

    The text is lower-cased and split into purely alphabetic words. Each
    word that is longer than two characters and not listed in
    NON_LANGUAGE_WORDS is looked up as an English-language name of a
    language via langcodes.find(), which is more precise than fuzzy
    matching on language codes.

    Args:
        text: Free text to scan, such as the label of a download link.

    Returns:
        The ``language`` attribute of the first lookup result that is not
        "en", or None when no word resolves to a non-English language.
    """
    # Only alphabetic runs are kept, so punctuation and numeric size info
    # never reach the language lookup.
    for candidate in re.findall(r"\b[a-zA-Z]+\b", text.lower()):
        # Skip very short words and known non-language vocabulary.
        if len(candidate) <= 2 or candidate in NON_LANGUAGE_WORDS:
            continue

        try:
            match = langcodes.find(candidate, language="en")
        except LookupError:
            # Not a language name; move on to the next word.
            continue

        # English itself does not count as a detected foreign language.
        if match and match.language != "en":
            return match.language

    return None
83131
84132
@@ -98,12 +146,14 @@ def classify_download_link(
98146 text_lower = input .text .lower ()
99147
100148 # Detect language and set default to English
101- detected_lang = detect_language_in_text (input .text ) or 'en'
149+ detected_lang = detect_language_in_text (input .text ) or "en"
102150 is_pdf = not any (indicator in text_lower for indicator in NON_PDF_INDICATORS )
103151
104- if detected_lang != 'en' or not is_pdf :
152+ if detected_lang != "en" or not is_pdf :
105153 if verbose :
106- print (f"Skipping link: { input .text } (language: { detected_lang } , PDF: { is_pdf } )" )
154+ print (
155+ f"Skipping link: { input .text } (language: { detected_lang } , PDF: { is_pdf } )"
156+ )
107157 return None
108158
109159 # Check for main report indicators
@@ -134,29 +184,33 @@ def classify_download_link(
134184 url = input .url ,
135185 text = input .text ,
136186 file_info = input .file_info ,
137- classification = DocumentType .MAIN if position == 0 else DocumentType .SUPPLEMENTAL ,
187+ classification = DocumentType .MAIN
188+ if position == 0
189+ else DocumentType .SUPPLEMENTAL ,
138190 language_detected = detected_lang ,
139191 reasoning = (
140192 "First English PDF (assumed main report)"
141- if position == 0 else
142- f"English PDF in position { position + 1 } (assumed supplementary)"
143- )
193+ if position == 0
194+ else f"English PDF in position { position + 1 } (assumed supplementary)"
195+ ),
144196 )
145197
146198 # Default case - no explicit language or PDF specified
147199 else :
148200 result = DownloadLinkWithClassification (
149- url = input .url ,
150- text = input .text ,
151- file_info = input .file_info ,
152- classification = DocumentType .MAIN if position == 0 else DocumentType .SUPPLEMENTAL ,
153- language_detected = detected_lang ,
154- reasoning = (
155- "First document with no language specified (assumed main English report)"
156- if position == 0 else
157- f"Document in position { position + 1 } with no language specified (assumed supplementary)"
201+ url = input .url ,
202+ text = input .text ,
203+ file_info = input .file_info ,
204+ classification = DocumentType .MAIN
205+ if position == 0
206+ else DocumentType .SUPPLEMENTAL ,
207+ language_detected = detected_lang ,
208+ reasoning = (
209+ "First document with no language specified (assumed main English report)"
210+ if position == 0
211+ else f"Document in position { position + 1 } with no language specified (assumed supplementary)"
212+ ),
158213 )
159- )
160214
161215 if verbose :
162216 print (f"{ input .text } -> { result .classification } " )
@@ -199,7 +253,7 @@ def classify_download_links(
199253if __name__ == "__main__" :
200254 from extract .classify_mime_types import FileTypeInfo
201255 from pydantic import HttpUrl
202-
256+
203257 # Test the classification with some sample data including problematic cases
204258 test_links = [
205259 "English PDF (3.71 MB)" ,
@@ -217,17 +271,17 @@ def classify_download_links(
217271 "Spanish Document Overview (1.8 MB)" ,
218272 "Chinese Analysis Report (3.2 MB)" ,
219273 "Agriculture in Punjab (3.71 MB)" ,
220- "Full Report (1.3 MB)"
274+ "Full Report (1.3 MB)" ,
221275 ]
222276
223- results = classify_download_links ([
224- DownloadLinkWithFileInfo (
225- url = HttpUrl (f"https://localhost:8000/test{ i + 1 } .pdf" ),
226- text = link ,
227- file_info = FileTypeInfo (
228- mime_type = "application/pdf" ,
229- charset = "utf-8"
277+ results = classify_download_links (
278+ [
279+ DownloadLinkWithFileInfo (
280+ url = HttpUrl (f"https://localhost:8000/test{ i + 1 } .pdf" ),
281+ text = link ,
282+ file_info = FileTypeInfo (mime_type = "application/pdf" , charset = "utf-8" ),
230283 )
231- )
232- for i , link in enumerate (test_links )
233- ], verbose = True )
284+ for i , link in enumerate (test_links )
285+ ],
286+ verbose = True ,
287+ )
0 commit comments