Melhora tratamento de erros para tipos de arquivo não suportados (#109)

diraol · web-flow · commit c39f6bf4410a · 2025-11-28T18:10:23.000-03:00
## 🎯 Objetivo

Reduzir a verbosidade dos logs para arquivos com tipos não suportados
(especialmente ZIP) e melhorar a eficiência evitando tentativas de
processamento desnecessárias.

## 📋 Problema

Arquivos ZIP e outros tipos não suportados geravam logs extremamente
verbosos:
- WARNING com o caminho do arquivo
- ERROR com a mensagem
- Stack trace completo com ~15 linhas

Exemplo:
```
WARNING:root:Could not process gazette: 5107602/2009-02-27/e9642a24f98c20f0ce2ef5637d65c6abb0bdf854.zip. Cause: Unsupported file type: application/zip
ERROR:root:Unsupported file type: application/zip
Traceback (most recent call last):
  [Stack trace completo]
```

## ✅ Solução Implementada

### 1. Nova Exceção Customizada
- `UnsupportedFileTypeError` criada em
`data_extraction/text_extraction.py`
- Exportada no módulo para uso consistente no projeto
- Permite tratamento específico de erros de tipos não suportados

### 2. Detecção Antecipada de ZIP
- Novo método `is_zip()` na classe `ApacheTikaTextExtractor`
- Verifica o tipo do arquivo logo após o download, ANTES da checagem de tamanho e da extração de texto
- Log específico: "Skipping unsupported ZIP file: {file_path}"
- Evita processamento desnecessário

### 3. Tratamento de Erros Simplificado
- `UnsupportedFileTypeError` capturado separadamente
- Apenas WARNING logado (sem stack trace)
- Outros erros mantêm log completo para debugging

## 📊 Resultado

**Antes:**
```
WARNING:root:Could not process gazette: [...].zip. Cause: Unsupported file type: application/zip
ERROR:root:Unsupported file type: application/zip
Traceback (most recent call last):
  [~15 linhas de stack trace]
```

**Depois:**
```
WARNING:root:Skipping unsupported ZIP file: [...].zip
WARNING:root:Could not process gazette: [...].zip. Cause: application/zip
```

## 📝 Arquivos Modificados

- `data_extraction/text_extraction.py` - Nova exceção e método
`is_zip()`
- `data_extraction/__init__.py` - Export da exceção
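- `tasks/gazette_text_extraction.py` - Detecção antecipada e tratamento
de erros
- `tasks/list_gazettes_to_be_processed.py` - Apenas formatação (espaços
em branco e quebra de linhas longas de log)
- `tests/list_gazettes_pagination_tests.py` - Apenas formatação e remoção
do import `call`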

## 🧪 Testes

- Sintaxe Python validada com `py_compile`
- Imports verificados
- Lógica de fluxo mantida inalterada
diff --git a/data_extraction/__init__.py b/data_extraction/__init__.py
@@ -1,8 +1,13 @@
 from .interfaces import TextExtractorInterface
-from .text_extraction import ApacheTikaTextExtractor, create_apache_tika_text_extraction
+from .text_extraction import (
+    ApacheTikaTextExtractor,
+    UnsupportedFileTypeError,
+    create_apache_tika_text_extraction,
+)
 
 __all__ = [
     "ApacheTikaTextExtractor",
+    "UnsupportedFileTypeError",
     "create_apache_tika_text_extraction",
     "TextExtractorInterface",
 ]
diff --git a/data_extraction/text_extraction.py b/data_extraction/text_extraction.py
@@ -8,6 +8,12 @@
 from .interfaces import TextExtractorInterface
 
 
+class UnsupportedFileTypeError(Exception):
+    """Exception raised when a file type is not supported for text extraction."""
+
+    pass
+
+
 class ApacheTikaTextExtractor(TextExtractorInterface):
     def __init__(self, url: str):
         self._url = url
@@ -28,7 +34,7 @@ def _try_extract_text(self, filepath: str) -> str:
         """
         if self.is_txt(filepath):
             return self._return_file_content(filepath)
-        
+
         try:
             with open(filepath, "rb") as file:
                 headers = {
@@ -37,19 +43,19 @@ def _try_extract_text(self, filepath: str) -> str:
                 }
                 # Use streaming to prevent loading entire file in memory
                 response = requests.put(
-                    f"{self._url}/tika", 
-                    data=file, 
+                    f"{self._url}/tika",
+                    data=file,
                     headers=headers,
-                    stream=False  # Tika requires full upload, but we stream the read
+                    stream=False,  # Tika requires full upload, but we stream the read
                 )
                 response.encoding = "UTF-8"
                 text = response.text
-                
+
                 # Explicit cleanup to free memory immediately
                 response.close()
                 del response
                 gc.collect()
-                
+
                 return text
         except Exception as e:
             # Ensure cleanup even on error
@@ -70,12 +76,13 @@ def check_file_exists(self, filepath: str):
             raise Exception(f"File does not exists: {filepath}")
 
     def check_file_type_supported(self, filepath: str) -> None:
+        file_type = self.get_file_type(filepath)
         if (
             not self.is_doc(filepath)
             and not self.is_pdf(filepath)
             and not self.is_txt(filepath)
         ):
-            raise Exception("Unsupported file type: " + self.get_file_type(filepath))
+            raise UnsupportedFileTypeError(f"Unsupported file type: {file_type}")
 
     def is_pdf(self, filepath):
         """
@@ -115,6 +122,13 @@ def is_file_type(self, filepath, file_types):
         """
         return self.get_file_type(filepath) in file_types
 
+    def is_zip(self, filepath):
+        """
+        If the file type is zip returns True. Otherwise,
+        returns False
+        """
+        return self.is_file_type(filepath, file_types=["application/zip"])
+
 
 def get_apache_tika_server_url():
     return os.environ["APACHE_TIKA_SERVER"]
diff --git a/tasks/gazette_text_extraction.py b/tasks/gazette_text_extraction.py
@@ -11,7 +11,7 @@
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Union
 
-from data_extraction import TextExtractorInterface
+from data_extraction import TextExtractorInterface, UnsupportedFileTypeError
 from database import DatabaseInterface
 from index import IndexInterface
 from segmentation import get_segmenter
@@ -38,19 +38,23 @@ def extract_text_from_gazettes(
 
     ids = []
     processed_count = 0
-    
+
     for gazette in gazettes:
         try:
             document_ids = try_process_gazette_file(
                 gazette, territories, database, storage, index, text_extractor
             )
             ids.extend(document_ids)
             processed_count += 1
-            
+
             # Log progress periodically
             if processed_count % 10 == 0:
                 logging.info(f"Processed {processed_count} gazettes")
-                
+
+        except UnsupportedFileTypeError as e:
+            logging.warning(
+                f"Could not process gazette: {gazette['file_path']}. Cause: {e}"
+            )
         except Exception as e:
             logging.warning(
                 f"Could not process gazette: {gazette['file_path']}. Cause: {e}"
@@ -59,7 +63,7 @@ def extract_text_from_gazettes(
         finally:
             # Clear gazette data from memory after processing
             gazette.clear()
-            
+
             # Force GC every 10 documents to prevent memory accumulation
             if processed_count % 10 == 0:
                 gc.collect()
@@ -82,23 +86,28 @@ def try_process_gazette_file(
     """
     logging.debug(f"Processing gazette {gazette['file_path']}")
     gazette_file = None
-    
+
     try:
         gazette_file = download_gazette_file(gazette, storage)
-        
+
+        # Check if file is ZIP - not supported, skip processing
+        if text_extractor.is_zip(gazette_file):
+            logging.warning(f"Skipping unsupported ZIP file: {gazette['file_path']}")
+            raise UnsupportedFileTypeError("application/zip")
+
         # Check file size to prevent OOM on very large files
         file_size = os.path.getsize(gazette_file)
         if file_size > MAX_FILE_SIZE_BYTES:
             raise Exception(
                 f"File too large ({file_size / 1024 / 1024:.2f}MB > {MAX_FILE_SIZE_MB}MB): {gazette['file_path']}"
             )
-        
+
         gazette["source_text"] = try_to_extract_content(gazette_file, text_extractor)
         gazette["url"] = define_file_url(gazette["file_path"])
         gazette_txt_path = define_gazette_txt_path(gazette)
         gazette["file_raw_txt"] = define_file_url(gazette_txt_path)
         upload_raw_text(gazette_txt_path, gazette["source_text"], storage)
-        
+
         # Delete file ASAP to free disk space
         delete_gazette_files(gazette_file)
         gazette_file = None
@@ -114,18 +123,18 @@ def try_process_gazette_file(
                 upload_raw_text(segment_txt_path, segment["source_text"], storage)
                 index.index_document(segment, document_id=segment["file_checksum"])
                 document_ids.append(segment["file_checksum"])
-                
+
                 # Clear segment data from memory
                 segment.clear()
-                
+
             # Clear segments list
             del territory_segments
         else:
             index.index_document(gazette, document_id=gazette["file_checksum"])
             document_ids.append(gazette["file_checksum"])
 
         set_gazette_as_processed(gazette, database)
-        
+
         # Clear gazette source_text from memory (large string)
         if "source_text" in gazette:
             del gazette["source_text"]
@@ -138,7 +147,7 @@ def try_process_gazette_file(
                 os.remove(gazette_file)
             except Exception as e:
                 logging.warning(f"Failed to cleanup temp file {gazette_file}: {e}")
-        
+
         # Force garbage collection after each document
         gc.collect()
 
diff --git a/tasks/list_gazettes_to_be_processed.py b/tasks/list_gazettes_to_be_processed.py
@@ -32,7 +32,7 @@ def get_gazettes_extracted_since_yesterday(
     Uses pagination to prevent loading all data into memory at once (OOM prevention)
     """
     logging.info("Listing gazettes extracted since yesterday (paginated)")
-    
+
     offset = 0
     while True:
         command = f"""
@@ -61,19 +61,21 @@ def get_gazettes_extracted_since_yesterday(
         LIMIT {QUERY_PAGE_SIZE} OFFSET {offset}
         ;
         """
-        
+
         page_results = list(database.select(command))
-        
+
         if not page_results:
             break
-            
-        logging.debug(f"Processing page with {len(page_results)} gazettes (offset={offset})")
-        
+
+        logging.debug(
+            f"Processing page with {len(page_results)} gazettes (offset={offset})"
+        )
+
         for gazette in page_results:
             yield format_gazette_data(gazette)
-        
+
         offset += QUERY_PAGE_SIZE
-        
+
         # If we got fewer results than page size, we're done
         if len(page_results) < QUERY_PAGE_SIZE:
             break
@@ -87,7 +89,7 @@ def get_all_gazettes_extracted(
     Uses pagination to prevent loading all data into memory at once (OOM prevention)
     """
     logging.info("Listing all gazettes extracted (paginated)")
-    
+
     offset = 0
     while True:
         command = f"""
@@ -114,19 +116,21 @@ def get_all_gazettes_extracted(
         LIMIT {QUERY_PAGE_SIZE} OFFSET {offset}
         ;
         """
-        
+
         page_results = list(database.select(command))
-        
+
         if not page_results:
             break
-            
-        logging.debug(f"Processing page with {len(page_results)} gazettes (offset={offset})")
-        
+
+        logging.debug(
+            f"Processing page with {len(page_results)} gazettes (offset={offset})"
+        )
+
         for gazette in page_results:
             yield format_gazette_data(gazette)
-        
+
         offset += QUERY_PAGE_SIZE
-        
+
         # If we got fewer results than page size, we're done
         if len(page_results) < QUERY_PAGE_SIZE:
             break
@@ -140,7 +144,7 @@ def get_unprocessed_gazettes(
     Uses pagination to prevent loading all data into memory at once (OOM prevention)
     """
     logging.info("Listing unprocessed gazettes (paginated)")
-    
+
     offset = 0
     while True:
         command = f"""
@@ -169,19 +173,21 @@ def get_unprocessed_gazettes(
         LIMIT {QUERY_PAGE_SIZE} OFFSET {offset}
         ;
         """
-        
+
         page_results = list(database.select(command))
-        
+
         if not page_results:
             break
-            
-        logging.debug(f"Processing page with {len(page_results)} unprocessed gazettes (offset={offset})")
-        
+
+        logging.debug(
+            f"Processing page with {len(page_results)} unprocessed gazettes (offset={offset})"
+        )
+
         for gazette in page_results:
             yield format_gazette_data(gazette)
-        
+
         offset += QUERY_PAGE_SIZE
-        
+
         # If we got fewer results than page size, we're done
         if len(page_results) < QUERY_PAGE_SIZE:
             break
diff --git a/tests/list_gazettes_pagination_tests.py b/tests/list_gazettes_pagination_tests.py
@@ -11,7 +11,7 @@
 import os
 from datetime import date, datetime
 from unittest import TestCase
-from unittest.mock import MagicMock, call, patch
+from unittest.mock import MagicMock, patch
 
 from tasks.list_gazettes_to_be_processed import (
     get_all_gazettes_extracted,
@@ -30,7 +30,7 @@ class GazettesListingPaginationTests(TestCase):
     def setUp(self):
         """Setup comum para todos os testes"""
         self.database_mock = MagicMock()
-        
+
         # Mock data - simula resultados do banco
         self.sample_gazette_row = (
             1,  # id
@@ -110,8 +110,12 @@ def test_get_unprocessed_gazettes_queries_contain_limit_and_offset(self):
         self.assertIn("OFFSET 0", sql_command, "SQL deve conter 'OFFSET 0'")
 
         # CRÍTICO: Verifica que NÃO usa placeholders de parâmetros
-        self.assertNotIn("%(limit)s", sql_command, "SQL não deve usar placeholder %(limit)s")
-        self.assertNotIn("%(offset)s", sql_command, "SQL não deve usar placeholder %(offset)s")
+        self.assertNotIn(
+            "%(limit)s", sql_command, "SQL não deve usar placeholder %(limit)s"
+        )
+        self.assertNotIn(
+            "%(offset)s", sql_command, "SQL não deve usar placeholder %(offset)s"
+        )
 
     @patch.dict("os.environ", {"GAZETTE_QUERY_PAGE_SIZE": "3"})
     def test_get_unprocessed_gazettes_stops_when_no_more_results(self):
@@ -167,7 +171,7 @@ def test_get_unprocessed_gazettes_offset_increments_correctly(self):
 
         # Verifica os OFFSETs nas chamadas
         calls = self.database_mock.select.call_args_list
-        
+
         sql_1 = calls[0][0][0]
         sql_2 = calls[1][0][0]
         sql_3 = calls[2][0][0]
@@ -288,20 +292,20 @@ class GazettesListingRegressionTests(TestCase):
     def test_select_method_signature_compatibility(self):
         """
         REGRESSÃO: Garante que select() é sempre chamado com a assinatura correta
-        
+
         Este teste falha se tentarmos passar parâmetros extras para select(),
         prevenindo a regressão do bug original:
         TypeError: PostgreSQL.select() takes 2 positional arguments but 3 were given
         """
         database_mock = MagicMock()
-        
+
         # Configura o mock para aceitar APENAS 1 argumento
         def strict_select(command):
             """Mock que rejeita chamadas com mais de 1 argumento"""
             if not isinstance(command, str):
                 raise TypeError("select() expects a string command")
             return []
-        
+
         database_mock.select.side_effect = strict_select
 
         # Se o código tentar passar parâmetros extras, este teste falhará
@@ -325,6 +329,7 @@ def test_sql_injection_safety_numeric_values(self):
 
         # Verifica que LIMIT e OFFSET são números inteiros no SQL
         import re
+
         limit_match = re.search(r"LIMIT\s+(\d+)", sql_command)
         offset_match = re.search(r"OFFSET\s+(\d+)", sql_command)
 
diff --git a/tests/validate_pagination_tests.py b/tests/validate_pagination_tests.py