Teal-Insights
diff --git a/extract/classify_document_types.py b/extract/classify_document_types.py
Lines changed: 98 additions & 44 deletions
diff --git a/extract/classify_mime_types.py b/extract/classify_mime_types.py
Lines changed: 29 additions & 14 deletions
diff --git a/extract/download_files.py b/extract/download_files.py
Lines changed: 0 additions & 1 deletion
@@ -19,15 +19,59 @@
 
 # Filter out common non-language words and short words
 NON_LANGUAGE_WORDS = {
-    "pdf", "download", "file", "document", "report", "mb", "kb", "gb",
-    "summary", "full", "main", "background", "note", "overview",
-    "the", "and", "or", "for", "in", "with", "of", "a", "an", "is", "are",
-    "executive", "technical", "appendix", "annex", "chapter", "section",
+    "pdf",
+    "download",
+    "file",
+    "document",
+    "report",
+    "mb",
+    "kb",
+    "gb",
+    "summary",
+    "full",
+    "main",
+    "background",
+    "note",
+    "overview",
+    "the",
+    "and",
+    "or",
+    "for",
+    "in",
+    "with",
+    "of",
+    "a",
+    "an",
+    "is",
+    "are",
+    "executive",
+    "technical",
+    "appendix",
+    "annex",
+    "chapter",
+    "section",
     # Add common English words that were causing false positives
-    "climate", "change", "economic", "damage", "environmental", "risks",
-    "financial", "private", "sector", "forestry", "agroforestry", 
-    "assessment", "groundwater", "irrigation", "indicative", "total",
-    "development", "financing", "needs", "estimating", "country"
+    "climate",
+    "change",
+    "economic",
+    "damage",
+    "environmental",
+    "risks",
+    "financial",
+    "private",
+    "sector",
+    "forestry",
+    "agroforestry",
+    "assessment",
+    "groundwater",
+    "irrigation",
+    "indicative",
+    "total",
+    "development",
+    "financing",
+    "needs",
+    "estimating",
+    "country",
 }
 
 NON_PDF_INDICATORS = [" text "]
@@ -45,19 +89,21 @@
     "brief",
 ]
 
+
 class DownloadLinkWithClassification(DownloadLinkWithFileInfo):
     classification: DocumentType
     language_detected: Optional[str]
     reasoning: str
 
+
 class PublicationDetailsWithClassification(PublicationDetailsBase):
     download_links: List[DownloadLinkWithClassification]
 
 
 def detect_language_in_text(text: str) -> Optional[str]:
     """
     Detect if text contains explicit non-English language names.
-    
+
     This function looks for actual language names in English using langcodes.find(),
     which is much more precise than fuzzy matching language codes.
 
@@ -66,19 +112,21 @@ def detect_language_in_text(text: str) -> Optional[str]:
     """
     # Extract words, removing punctuation and size info
     words = re.findall(r"\b[a-zA-Z]+\b", text.lower())
-    
-    filtered_words = [word for word in words if word not in NON_LANGUAGE_WORDS and len(word) > 2]
-    
+
+    filtered_words = [
+        word for word in words if word not in NON_LANGUAGE_WORDS and len(word) > 2
+    ]
+
     for word in filtered_words:
         try:
             # Try to find the word as a language name in English
-            lang = langcodes.find(word, language='en')
-            if lang and lang.language != 'en':  # Not English
+            lang = langcodes.find(word, language="en")
+            if lang and lang.language != "en":  # Not English
                 return lang.language
         except LookupError:
             # Word is not a language name, continue checking other words
             continue
-    
+
     return None
 
 
@@ -98,12 +146,14 @@ def classify_download_link(
     text_lower = input.text.lower()
 
     # Detect language and set default to English
-    detected_lang = detect_language_in_text(input.text) or 'en'
+    detected_lang = detect_language_in_text(input.text) or "en"
     is_pdf = not any(indicator in text_lower for indicator in NON_PDF_INDICATORS)
 
-    if detected_lang != 'en' or not is_pdf:
+    if detected_lang != "en" or not is_pdf:
         if verbose:
-            print(f"Skipping link: {input.text} (language: {detected_lang}, PDF: {is_pdf})")
+            print(
+                f"Skipping link: {input.text} (language: {detected_lang}, PDF: {is_pdf})"
+            )
         return None
 
     # Check for main report indicators
@@ -134,29 +184,33 @@ def classify_download_link(
             url=input.url,
             text=input.text,
             file_info=input.file_info,
-            classification=DocumentType.MAIN if position == 0 else DocumentType.SUPPLEMENTAL,
+            classification=DocumentType.MAIN
+            if position == 0
+            else DocumentType.SUPPLEMENTAL,
             language_detected=detected_lang,
             reasoning=(
                 "First English PDF (assumed main report)"
-                if position == 0 else
-                f"English PDF in position {position + 1} (assumed supplementary)"
-            )
+                if position == 0
+                else f"English PDF in position {position + 1} (assumed supplementary)"
+            ),
         )
 
     # Default case - no explicit language or PDF specified
     else:
         result = DownloadLinkWithClassification(
-        url=input.url,
-        text=input.text,
-        file_info=input.file_info,
-        classification=DocumentType.MAIN if position == 0 else DocumentType.SUPPLEMENTAL,
-        language_detected=detected_lang,
-        reasoning=(
-            "First document with no language specified (assumed main English report)"
-            if position == 0 else
-            f"Document in position {position + 1} with no language specified (assumed supplementary)"
+            url=input.url,
+            text=input.text,
+            file_info=input.file_info,
+            classification=DocumentType.MAIN
+            if position == 0
+            else DocumentType.SUPPLEMENTAL,
+            language_detected=detected_lang,
+            reasoning=(
+                "First document with no language specified (assumed main English report)"
+                if position == 0
+                else f"Document in position {position + 1} with no language specified (assumed supplementary)"
+            ),
         )
-    )
 
     if verbose:
         print(f"{input.text} -> {result.classification}")
@@ -199,7 +253,7 @@ def classify_download_links(
 if __name__ == "__main__":
     from extract.classify_mime_types import FileTypeInfo
     from pydantic import HttpUrl
-    
+
     # Test the classification with some sample data including problematic cases
     test_links = [
         "English PDF (3.71 MB)",
@@ -217,17 +271,17 @@ def classify_download_links(
         "Spanish Document Overview (1.8 MB)",
         "Chinese Analysis Report (3.2 MB)",
         "Agriculture in Punjab (3.71 MB)",
-        "Full Report (1.3 MB)"
+        "Full Report (1.3 MB)",
     ]
 
-    results = classify_download_links([
-        DownloadLinkWithFileInfo(
-            url=HttpUrl(f"https://localhost:8000/test{i+1}.pdf"),
-            text=link,
-            file_info=FileTypeInfo(
-                mime_type="application/pdf",
-                charset="utf-8"
+    results = classify_download_links(
+        [
+            DownloadLinkWithFileInfo(
+                url=HttpUrl(f"https://localhost:8000/test{i + 1}.pdf"),
+                text=link,
+                file_info=FileTypeInfo(mime_type="application/pdf", charset="utf-8"),
             )
-        )
-        for i, link in enumerate(test_links)
-    ], verbose=True)
+            for i, link in enumerate(test_links)
+        ],
+        verbose=True,
+    )
@@ -115,10 +115,12 @@ def transform_worldbank_url(url: HttpUrl) -> HttpUrl:
     return url
 
 
-def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> DownloadLinkWithFileInfo:
+def get_file_type_from_url(
+    download_link: DownloadLink, max_retries=3
+) -> DownloadLinkWithFileInfo:
     """
     Get file type with retry logic for rate limiting.
-    
+
     Raises:
         Exception: If unable to determine a valid MIME type after all retries.
     """
@@ -139,12 +141,17 @@ def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> Downlo
 
             # Make a GET request with stream=True to get headers and peek at content
             with requests.get(
-                str(actual_url), stream=True, allow_redirects=True, headers=DEFAULT_HEADERS
+                str(actual_url),
+                stream=True,
+                allow_redirects=True,
+                headers=DEFAULT_HEADERS,
             ) as response:
                 # Check for rate limiting
                 if response.status_code == 429:
                     if attempt == max_retries:
-                        raise Exception(f"Rate limited (429) after {max_retries} attempts for {download_link.url}")
+                        raise Exception(
+                            f"Rate limited (429) after {max_retries} attempts for {download_link.url}"
+                        )
                     wait_time = random.uniform(15, 30)  # Longer wait for rate limiting
                     print(
                         f"Rate limited. Waiting {wait_time:.1f} seconds before retry..."
@@ -156,8 +163,6 @@ def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> Downlo
                 content_type = response.headers.get("Content-Type", "unknown")
                 parsed_header = parse_content_type(content_type)
 
-
-
                 # If we're still getting JSON content type or HTML, try to peek at actual content
                 if (
                     "json" in parsed_header["mime_type"]
@@ -186,11 +191,15 @@ def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> Downlo
                 # Apply UTF-8 fallback if no charset was determined
                 if not charset:
                     charset = "utf-8"
-                    print(f"Warning: No charset detected for {download_link.url}, defaulting to UTF-8")
+                    print(
+                        f"Warning: No charset detected for {download_link.url}, defaulting to UTF-8"
+                    )
 
                 # Log warning if guessed type doesn't match actual MIME type
                 if guessed_type and guessed_type != mime_type:
-                    print(f"Warning: Guessed type '{guessed_type}' doesn't match detected type '{mime_type}' for {download_link.url}")
+                    print(
+                        f"Warning: Guessed type '{guessed_type}' doesn't match detected type '{mime_type}' for {download_link.url}"
+                    )
 
                 result = FileTypeInfo(
                     mime_type=mime_type,
@@ -200,30 +209,36 @@ def get_file_type_from_url(download_link: DownloadLink, max_retries=3) -> Downlo
                 # If we got HTML when expecting PDF/text, consider it a failure
                 if not is_valid_file_info(result.model_dump()):
                     if attempt == max_retries:
-                        raise Exception(f"Failed to get valid file type for {download_link.url} after {max_retries} attempts - got {mime_type}")
+                        raise Exception(
+                            f"Failed to get valid file type for {download_link.url} after {max_retries} attempts - got {mime_type}"
+                        )
                     print("Got unexpected file type, retrying...")
                     continue
 
                 return DownloadLinkWithFileInfo(
                     url=HttpUrl(actual_url) if STORE_FINAL_URL else download_link.url,
                     text=download_link.text,
-                    file_info=result
+                    file_info=result,
                 )
 
         except Exception as e:
             print(f"Attempt {attempt}/{max_retries} failed: {str(e)}")
             if attempt == max_retries:
-                raise Exception(f"Failed to determine MIME type for {download_link.url} after {max_retries} attempts: {str(e)}")
-    
+                raise Exception(
+                    f"Failed to determine MIME type for {download_link.url} after {max_retries} attempts: {str(e)}"
+                )
+
     # This should never be reached, but satisfies the type checker
     raise Exception(f"Unexpected exit from retry loop for {download_link.url}")
 
 
 def main():
     # Create a sample DownloadLink
     download_link = DownloadLink(
-        url=HttpUrl("https://openknowledge.worldbank.org/bitstreams/cf2a2b54-559b-5909-ada8-af36b21bd4da/download"),
-        text="English PDF (18.05 MB)"
+        url=HttpUrl(
+            "https://openknowledge.worldbank.org/bitstreams/cf2a2b54-559b-5909-ada8-af36b21bd4da/download"
+        ),
+        text="English PDF (18.05 MB)",
     )
 
     dl_with_info = get_file_type_from_url(download_link)
 
@@ -209,7 +209,6 @@ def main():
 
         # Download files marked for download
         for link in pub["downloadLinks"]:
-
             if link.get("to_download", False):
                 try:
                     print(f"\nDownloading {link['text']} for publication {pub_id}")