Commit 2b797c7

Merge pull request #249 from enoch3712/248-paginate-handler---bad-optional-list-aggregation
248 paginate handler bad optional list aggregation
2 parents c8b69eb + d8655e7 commit 2b797c7

File tree

7 files changed: +104 -61 lines changed

extract_thinker/global_models.py (new file)

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+def get_lite_model():
+    """Return the lite model for cost efficiency."""
+    return "gpt-4o-mini"
+
+
+def get_big_model():
+    """Return the big model for high performance."""
+    return "gpt-4o"

extract_thinker/pagination_handler.py

Lines changed: 30 additions & 23 deletions

@@ -17,6 +17,14 @@ class PaginationHandler(CompletionHandler):
     def __init__(self, llm):
         super().__init__(llm)
 
+    def _make_hashable(self, item: Any) -> Any:
+        """Recursively convert a value to something hashable."""
+        if isinstance(item, dict):
+            return tuple(sorted((k, self._make_hashable(v)) for k, v in item.items()))
+        elif isinstance(item, list):
+            return tuple(self._make_hashable(x) for x in item)
+        return item
+
     def handle(self,
                content: List[Dict[str, Any]],
                response_model: type[BaseModel],

@@ -81,57 +89,56 @@ def _merge_results(self, results: List[Any], response_model: type[BaseModel], pa
         for _, result in pages_data:
             result_dict = result.model_dump()
             for field_name, field_value in result_dict.items():
-                if field_name not in field_values:
-                    field_values[field_name] = []
-                field_values[field_name].append(field_value)
+                field_values.setdefault(field_name, []).append(field_value)
 
         # Merge fields
         merged = {}
         for field_name, values in field_values.items():
+            # Get the annotated type from the response model
             field_type = response_model.model_fields[field_name].annotation if field_name in response_model.model_fields else None
             non_null_values = [v for v in values if v is not None]
 
             if field_type and get_origin(field_type) is list:
-                # Merge lists using a more sophisticated approach
+                # Merge list fields using a more sophisticated approach
                 merged_list = self._merge_list_field(field_name, values, field_type)
                 merged[field_name] = merged_list
             else:
                 # Scalar field handling
                 if len(non_null_values) == 0:
-                    merged[field_name] = None
+                    # If the field is expected to be a string, default to an empty string.
+                    if field_type == str or (get_origin(field_type) is Union and str in get_args(field_type)):
+                        merged[field_name] = ""
+                    else:
+                        continue
                 else:
-                    # Convert unhashable types (e.g., lists) to hashable types
-                    hashable_values = [tuple(item) if isinstance(item, list) else item for item in non_null_values]
+                    # Build a mapping from the hashable version of each candidate to the original candidate.
+                    distinct_map = {}
+                    for candidate in non_null_values:
+                        key = self._make_hashable(candidate)
+                        if key not in distinct_map:
+                            distinct_map[key] = candidate
+                    distinct_values = list(distinct_map.values())
 
-                    try:
-                        distinct_values = list(set(hashable_values))
-                    except TypeError:
-                        # Fallback to order-preserving method if conversion fails
-                        seen = set()
-                        distinct_values = []
-                        for item in hashable_values:
-                            if item not in seen:
-                                seen.add(item)
-                                distinct_values.append(item)
-                    # **Modification Ends Here**
-
                     if len(distinct_values) == 1:
                         merged[field_name] = distinct_values[0]
                     else:
-                        # Store conflicts in special structure
+                        # Store conflicts in a special structure
                         merged[field_name] = {
                             "_conflict": True,
                             "candidates": distinct_values
                         }
 
         # Check for conflicts and resolve if necessary
         if self._has_conflicts(merged, response_model):
             merged = self._resolve_conflicts(merged, response_model, pages_data, field_values)
 
         # Clean merged dictionary to ensure it's compatible with the response model
         merged = self._clean_merged_dict(merged, response_model)
-
-        # Now that conflicts are resolved and cleaned, instantiate the response model
+
+        # Filter out any keys with a None value,
+        # now every required field (e.g., a string like "thinking") will be non-null.
+        merged = {k: v for k, v in merged.items() if v is not None}
+
         return response_model(**merged)
 
     def _merge_list_field(self, field_name: str, values: List[Any], field_type: Any) -> List[Any]:
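
The core of the fix is the _make_hashable helper: per-page candidates are deduplicated by their recursively tuple-ized form, while the original dict/list values are kept for the merged output. A standalone sketch of that dedup step (data values are made up for illustration):

    from typing import Any

    def make_hashable(item: Any) -> Any:
        # Recursively turn dicts and lists into nested tuples so they can be dict keys.
        if isinstance(item, dict):
            return tuple(sorted((k, make_hashable(v)) for k, v in item.items()))
        elif isinstance(item, list):
            return tuple(make_hashable(x) for x in item)
        return item

    values = [
        {"country": "Portugal", "gdp": 218},
        {"country": "Portugal", "gdp": 218},  # duplicate reported by a second page
        {"country": "Portugal", "gdp": 220},  # a genuine conflict
    ]

    distinct_map = {}
    for candidate in values:
        distinct_map.setdefault(make_hashable(candidate), candidate)

    print(list(distinct_map.values()))
    # [{'country': 'Portugal', 'gdp': 218}, {'country': 'Portugal', 'gdp': 220}]
    # More than one distinct value -> _merge_results stores
    # {"_conflict": True, "candidates": [...]} for conflict resolution.

The old code only converted top-level lists to tuples, so a tuple of dicts (the usual shape of page results) still raised TypeError inside set(), and the "fallback" raised the same TypeError from seen.add(item), since set membership also requires hashable items. Hashing recursively removes that failure mode and preserves insertion order.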

tests/models/gdp_contract.py

Lines changed: 8 additions & 2 deletions

@@ -35,7 +35,13 @@ class CountryData(Contract):
     )
 
 class EUData(Contract):
-    thinking: str = Field(None, description="Think step by step. You have 2 pages dont forget to add them.")
+    thinking: str = Field(None, description="Think step by step. You have 2 pages dont forget to add them. Cannot be NULL or empty.")
     eu_total_gdp_million_27: float = Field(None, description="EU27 Total GDP (€ million)")
     eu_total_gdp_million_28: float = Field(None, description="EU28 Total GDP (€ million)")
-    countries: List[CountryData] = Field(None, description="List of countries. Make sure you add all countries of every page, not just the first one.")
+    countries: List[CountryData] = Field(None, description="List of countries. Make sure you add all countries of every page, not just the first one.")
+
+class EUDataOptional(Contract):
+    #thinking: str = Field(None, description="Think step by step. You have 2 pages dont forget to add them.")
+    eu_total_gdp_million_27: float = Field(None, description="EU27 Total GDP (€ million)")
+    eu_total_gdp_million_28: float = Field(None, description="EU28 Total GDP (€ million)")
+    countries: Optional[List[CountryData]] = Field(None, description="List of countries. Make sure you add all countries of every page, not just the first one.")
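
EUDataOptional reproduces issue #248: the thinking field is dropped and countries is Optional, so a page may legitimately contribute nothing for a field. Under the old merge such fields came back as an explicit None, which Pydantic validates and rejects for non-Optional annotations; the new None-filter omits them so the model's defaults apply. A toy sketch of that interaction, with plain Pydantic v2 models standing in for Contract:

    from typing import List, Optional
    from pydantic import BaseModel, Field

    class Country(BaseModel):
        country: str

    class Report(BaseModel):
        total: float = Field(None)
        countries: Optional[List[Country]] = Field(None)

    merged = {"total": None, "countries": [{"country": "Portugal"}]}

    # Report(**merged) would fail: an explicit None is validated against float.
    merged = {k: v for k, v in merged.items() if v is not None}  # the new filter
    report = Report(**merged)  # missing keys fall back to their defaults
    print(report.total, len(report.countries))  # None 1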

tests/test_classify.py

Lines changed: 10 additions & 9 deletions

@@ -12,6 +12,7 @@
 from extract_thinker.models.classification_response import ClassificationResponse
 from tests.models.invoice import CreditNoteContract, FinancialContract, InvoiceContract
 from tests.models.driver_license import DriverLicense, IdentificationContract
+from extract_thinker.global_models import get_lite_model, get_big_model
 
 # Setup environment and common paths
 load_dotenv()

@@ -32,9 +33,9 @@ def setup_extractors():
     document_loader = DocumentLoaderTesseract(tesseract_path)
 
     extractors = [
-        ("gpt-3.5-turbo", "gpt-3.5-turbo"),
-        ("claude-3-haiku-20240307", "claude-3-haiku-20240307"),
-        ("gpt-4o", "gpt-4o")
+        (get_lite_model(), get_lite_model()),
+        (get_big_model(), get_big_model()),
+        (get_big_model(), get_big_model())
     ]
 
     configured_extractors = []

@@ -78,9 +79,9 @@ def setup_process_with_gpt4_extractor():
     print(f"Tesseract path: {tesseract_path}")
     document_loader = DocumentLoaderTesseract(tesseract_path)
 
-    # Initialize the GPT-4 extractor
+    # Initialize the GPT-4 extractor using the big model
     gpt_4_extractor = Extractor(document_loader)
-    gpt_4_extractor.load_llm("gpt-4o")
+    gpt_4_extractor.load_llm(get_big_model())
 
     # Create the process with only the GPT-4 extractor
     process = Process()

@@ -298,16 +299,16 @@ def test_mom_classification_layers():
     # Initialize extractors with different models
     # Layer 1: Small models that might disagree
     gpt35_extractor = Extractor(document_loader)
-    gpt35_extractor.load_llm("claude-3-5-haiku-20241022")
+    gpt35_extractor.load_llm(get_big_model())
 
     claude_haiku_extractor = Extractor(document_loader)
-    claude_haiku_extractor.load_llm("gpt-4o-mini")
+    claude_haiku_extractor.load_llm(get_lite_model())
 
     # Layer 2: More capable models for resolution
     gpt4_extractor = Extractor(document_loader)
-    gpt4_extractor.load_llm("gpt-4o")
+    gpt4_extractor.load_llm(get_big_model())
     sonnet_extractor = Extractor(document_loader)
-    sonnet_extractor.load_llm("claude-3-5-sonnet-20241022")
+    sonnet_extractor.load_llm(get_big_model())
 
     # Create process with multiple layers
     process = Process()

tests/test_document_loader_pypdf.py

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 import os
 import pytest
 from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf, PyPDFConfig
-from .test_document_loader_base import BaseDocumentLoaderTest
+from tests.test_document_loader_base import BaseDocumentLoaderTest
 
 class TestDocumentLoaderPyPdf(BaseDocumentLoaderTest):
     @pytest.fixture

tests/test_extractor.py

Lines changed: 39 additions & 19 deletions

@@ -11,13 +11,14 @@
 from tests.models.invoice import InvoiceContract
 from tests.models.ChartWithContent import ChartWithContent
 from tests.models.page_contract import ReportContract
-from tests.models.gdp_contract import EUData
+from tests.models.gdp_contract import EUData, EUDataOptional
 from extract_thinker.document_loader.document_loader_azure_document_intelligence import DocumentLoaderAzureForm
 import pytest
 import numpy as np
 from litellm import embedding
-from extract_thinker.document_loader.document_loader_docling import DocumentLoaderDocling
+from extract_thinker.document_loader.document_loader_docling import DoclingConfig, DocumentLoaderDocling
 from tests.models.handbook_contract import HandbookContract
+from extract_thinker.global_models import get_lite_model, get_big_model
 
 
 load_dotenv()

@@ -32,7 +33,7 @@ def test_extract_with_pypdf_and_gpt4o_mini():
     extractor.load_document_loader(
         DocumentLoaderPyPdf()
     )
-    extractor.load_llm("gpt-4o-mini")
+    extractor.load_llm(get_lite_model())
 
     # Act
     result = extractor.extract(test_file_path, InvoiceContract)

@@ -51,7 +52,7 @@ def test_extract_with_azure_di_and_gpt4o_mini():
     extractor.load_document_loader(
         DocumentLoaderAzureForm(subscription_key, endpoint)
     )
-    extractor.load_llm("gpt-4o-mini")
+    extractor.load_llm(get_lite_model())
     # Act
     result = extractor.extract(test_file_path, InvoiceContract)

@@ -71,7 +72,7 @@ def test_extract_with_pypdf_and_gpt4o_mini():
     extractor.load_llm("gpt-4o-mini")
 
     # Act
-    result = extractor.extract(test_file_path, InvoiceContract)
+    result = extractor.extract(test_file_path, InvoiceContract, vision=True)
 
     # Assert
     assert result is not None

@@ -83,7 +84,7 @@ def test_extract_with_pypdf_and_gpt4o_mini():
 def test_vision_content_pdf():
     # Arrange
     extractor = Extractor()
-    extractor.load_llm("gpt-4o-mini")
+    extractor.load_llm(get_lite_model())
     test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf")
 
     # Act

@@ -108,7 +109,7 @@ def test_vision_content_pdf():
 def test_chart_with_content():
     # Arrange
     extractor = Extractor()
-    extractor.load_llm("gpt-4o-mini")
+    extractor.load_llm(get_lite_model())
     test_file_path = os.path.join(cwd, "tests", "test_images", "eu_tax_chart.png")
 
     # Act

@@ -131,7 +132,7 @@ def test_extract_with_loader_and_vision():
     extractor = Extractor()
     loader = DocumentLoaderPyPdf()
     extractor.load_document_loader(loader)
-    extractor.load_llm("gpt-4o-mini")
+    extractor.load_llm(get_lite_model())
 
     # Act
     result = extractor.extract(test_file_path, InvoiceContract, vision=True)

@@ -152,7 +153,7 @@ def test_extract_with_loader_and_vision():
 def test_extract_with_invalid_file_path():
     # Arrange
     extractor = Extractor()
-    extractor.load_llm("gpt-4o-mini")
+    extractor.load_llm(get_lite_model())
     invalid_file_path = os.path.join(cwd, "tests", "nonexistent", "fake_file.png")
 
     # Act & Assert

@@ -165,7 +166,7 @@ def test_forbidden_strategy_with_token_limit():
     test_file_path = os.path.join(os.getcwd(), "tests", "test_images", "eu_tax_chart.png")
     tesseract_path = os.getenv("TESSERACT_PATH")
 
-    llm = LLM("gpt-4o-mini", token_limit=10)
+    llm = LLM(get_lite_model(), token_limit=10)
 
     extractor = Extractor()
     extractor.load_document_loader(DocumentLoaderTesseract(tesseract_path))

@@ -194,7 +195,7 @@ def test_pagination_handler():
 
     extractor = Extractor()
     extractor.load_document_loader(DocumentLoaderDocling())
-    extractor.load_llm("gpt-4o")
+    extractor.load_llm(get_big_model())
 
     # Create and run both extractions in parallel
     async def run_parallel_extractions():

@@ -204,8 +205,8 @@ async def run_parallel_extractions():
         )
         return result_1, result_2
 
-    # Run the async code
-    results: tuple[EUData, EUData] = asyncio.run(run_parallel_extractions())
+    # Run the async extraction and get the results as instances of OptionalEUData
+    results = asyncio.run(run_parallel_extractions())
     result_1, result_2 = results
 
     # Compare top-level EU data

@@ -252,6 +253,25 @@ async def run_parallel_extractions():
     # assert province1.share_in_eu27_gdp == matching_province.share_in_eu27_gdp
     # assert province1.gdp_per_capita == matching_province.gdp_per_capita
 
+def test_pagination_handler_optional():
+    test_file_path = os.path.join(os.getcwd(), "tests", "files", "Regional_GDP_per_capita_2018_2.pdf")
+
+    extractor = Extractor()
+    extractor.load_document_loader(DocumentLoaderDocling())
+    extractor.load_llm(get_big_model())
+
+    async def extract_async_optional(extractor, file_path, vision, completion_strategy):
+        return extractor.extract(
+            file_path,
+            EUDataOptional,
+            vision=vision,
+            completion_strategy=completion_strategy
+        )
+
+    result = asyncio.run(extract_async_optional(extractor, test_file_path, vision=True, completion_strategy=CompletionStrategy.PAGINATE))
+
+    assert len(result.countries) == 6
+
 def get_embedding(text, model="text-embedding-ada-002"):
     text = text.replace("\n", " ")
     response = embedding(

@@ -284,7 +304,7 @@ def test_concatenation_handler():
     tesseract_path = os.getenv("TESSERACT_PATH")
     extractor = Extractor()
     extractor.load_document_loader(DocumentLoaderTesseract(tesseract_path))
-    llm_first = LLM("gpt-4o", token_limit=500)
+    llm_first = LLM(get_big_model(), token_limit=500)
     extractor.load_llm(llm_first)
 
     result_1: ReportContract = extractor.extract(

@@ -296,7 +316,7 @@ def test_concatenation_handler():
 
     second_extractor = Extractor()
     second_extractor.load_document_loader(DocumentLoaderTesseract(tesseract_path))
-    second_extractor.load_llm("gpt-4o")
+    second_extractor.load_llm(get_big_model())
 
     result_2: ReportContract = second_extractor.extract(
         test_file_path,

@@ -324,7 +344,7 @@ def test_llm_timeout():
     extractor.load_document_loader(DocumentLoaderPyPdf())
 
     # Create LLM with very short timeout
-    llm = LLM("gpt-4o-mini")
+    llm = LLM(get_lite_model())
     llm.set_timeout(1)  # Set timeout to 1ms (extremely short to force timeout)
     extractor.load_llm(llm)

@@ -374,7 +394,7 @@ def test_extract_with_default_backend():
 
     extractor = Extractor()
     extractor.load_document_loader(DocumentLoaderPyPdf())
-    extractor.load_llm(LLM("gpt-4o-mini", backend=LLMEngine.DEFAULT))
+    extractor.load_llm(LLM(get_lite_model(), backend=LLMEngine.DEFAULT))
 
     # Act
     result = extractor.extract(test_file_path, InvoiceContract)

@@ -395,7 +415,7 @@ def test_extract_with_pydanticai_backend():
 
     extractor = Extractor()
     extractor.load_document_loader(DocumentLoaderPyPdf())
-    extractor.load_llm(LLM("openai:gpt-4o-mini", backend=LLMEngine.PYDANTIC_AI))
+    extractor.load_llm(LLM(get_lite_model(), backend=LLMEngine.PYDANTIC_AI))
 
     # Act
     result = extractor.extract(test_file_path, InvoiceContract)

@@ -419,7 +439,7 @@ def test_extract_from_url_docling_and_gpt4o_mini():
     # Initialize the extractor, load the Docling loader and the gpt-4o-mini LLM
     extractor = Extractor()
     extractor.load_document_loader(DocumentLoaderDocling())
-    extractor.load_llm("gpt-4o-mini")
+    extractor.load_llm(get_lite_model())
 
     # Act: Extract the document using the specified URL and the HandbookContract
     result = extractor.extract(url, HandbookContract)
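
test_pagination_handler_optional is the regression test for this PR: it paginates the two-page GDP report into EUDataOptional and asserts that all six countries survive the merge. The other half of the fix, the empty-string default for string-typed scalar fields, can be checked in isolation; a standalone sketch of that predicate (the function name is illustrative, not from the codebase):

    from typing import Optional, Union, get_args, get_origin

    def empty_scalar_default(field_type):
        # Mirror of the new rule in _merge_results: str-typed fields get "",
        # everything else is skipped so the contract's default applies.
        if field_type == str or (get_origin(field_type) is Union and str in get_args(field_type)):
            return ""
        return None  # caller drops the field from the merged dict

    print(repr(empty_scalar_default(str)))            # ''
    print(repr(empty_scalar_default(Optional[str])))  # '' (Optional[str] == Union[str, None])
    print(repr(empty_scalar_default(float)))          # None -> field omitted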
