Merge pull request #110 from enoch3712/107-mom-of-classification-group_classifications-value-it-is-replaced-every-time-the-layer-is-changed

enoch3712 · web-flow · commit 21e38713971c · 2024-12-10T15:58:56.000+01:00
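Fix MoM classification layers: evaluate the strategy on each extractor layer instead of only on the last layer's results, and validate that threshold is between 1 and 10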
diff --git a/extract_thinker/process.py b/extract_thinker/process.py
@@ -1,5 +1,6 @@
 import asyncio
 from typing import IO, Any, Dict, List, Optional, Union
+from extract_thinker.models.classification_response import ClassificationResponse
 from extract_thinker.models.classification_strategy import ClassificationStrategy
 from extract_thinker.models.doc_groups2 import DocGroups2
 from extract_thinker.models.splitting_strategy import SplittingStrategy
@@ -52,8 +53,10 @@ async def _classify_async(self, extractor: Extractor, file: str, classifications
         return await loop.run_in_executor(None, extractor.classify, file, classifications, image)
 
     def classify(self, file: str, classifications, strategy: ClassificationStrategy = ClassificationStrategy.CONSENSUS, threshold: int = 9, image: bool = False) -> Optional[Classification]:
+        if not isinstance(threshold, int) or threshold < 1 or threshold > 10:
+            raise ValueError("Threshold must be an integer between 1 and 10")
+        
         result = asyncio.run(self.classify_async(file, classifications, strategy, threshold, image))
-
         return result
 
     async def classify_async(
@@ -64,28 +67,43 @@ async def classify_async(
         threshold: int = 9,
         image: str = False
     ) -> Optional[Classification]:
+        if not isinstance(threshold, int) or threshold < 1 or threshold > 10:
+            raise ValueError("Threshold must be an integer between 1 and 10")
 
         if isinstance(classifications, ClassificationTree):
             return await self._classify_tree_async(file, classifications, threshold, image)
 
+        # Try each layer of extractors until we get a valid result
         for extractor_group in self.extractor_groups:
-            group_classifications = await asyncio.gather(*(self._classify_async(extractor, file, classifications, image) for extractor in extractor_group))
-
-        # Implement different strategies
-        if strategy == ClassificationStrategy.CONSENSUS:
-            # Check if all classifications in the group are the same
-            if len(set(group_classifications)) == 1:
-                return group_classifications[0]
-        elif strategy == ClassificationStrategy.HIGHER_ORDER:
-            # Pick the result with the highest confidence
-            return max(group_classifications, key=lambda c: c.confidence)
-        elif strategy == ClassificationStrategy.CONSENSUS_WITH_THRESHOLD:
-            if len(set(group_classifications)) == 1:
-                maxResult = max(group_classifications, key=lambda c: c.confidence)
-                if maxResult.confidence >= threshold:
-                    return maxResult
-
-        raise ValueError("No consensus could be reached on the classification of the document. Please try again with a different strategy or threshold.")
+            group_classifications = await asyncio.gather(*(
+                self._classify_async(extractor, file, classifications, image) 
+                for extractor in extractor_group
+            ))
+
+            try:
+                # Attempt to get result based on strategy
+                if strategy == ClassificationStrategy.CONSENSUS:
+                    if len(set(c.name for c in group_classifications)) == 1:
+                        return group_classifications[0]
+                        
+                elif strategy == ClassificationStrategy.HIGHER_ORDER:
+                    return max(group_classifications, key=lambda c: c.confidence)
+                    
+                elif strategy == ClassificationStrategy.CONSENSUS_WITH_THRESHOLD:
+                    if len(set(c.name for c in group_classifications)) == 1:
+                        if all(c.confidence >= threshold for c in group_classifications):
+                            return group_classifications[0]
+                            
+                # If we get here, current layer didn't meet criteria - continue to next layer
+                continue
+                
+            except Exception as e:
+                # If there's an error processing this layer, try the next one
+                print(f"Layer failed with error: {str(e)}")
+                continue
+
+        # If we've tried all layers and none worked
+        raise ValueError("No consensus could be reached on the classification of the document across any layer. Please try again with a different strategy or threshold.")
 
     async def _classify_tree_async(
         self, 
@@ -94,6 +112,9 @@ async def _classify_tree_async(
         threshold: float,
         image: bool
     ) -> Optional[Classification]:
+        if not isinstance(threshold, (int, float)) or threshold < 1 or threshold > 10:
+            raise ValueError("Threshold must be a number between 1 and 10")
+
         """
         Perform classification in a hierarchical, level-by-level approach.
         """
@@ -114,23 +135,23 @@ async def _classify_tree_async(
 
             if classification.confidence < threshold:
                 raise ValueError(
-                    f"Classification confidence {classification.confidence} "
-                    f"for '{classification.classification}' is below the threshold of {threshold}."
+                    f"Classification confidence {classification.confidence}"
+                    f"for '{classification.name}' is below the threshold of {threshold}."
                 )
 
-            best_classification = classification
+            best_classification: ClassificationResponse = classification
 
             matching_node = next(
                 (
-                    node for node in current_nodes 
+                    node for node in current_nodes
                     if node.classification.name == best_classification.name
                 ),
                 None
             )
 
             if matching_node is None:
                 raise ValueError(
-                    f"No matching node found for classification '{classification.classification}'."
+                    f"No matching node found for classification '{classification.name}'."
                 )
 
             if matching_node.children:
diff --git a/tests/test_classify.py b/tests/test_classify.py
@@ -2,6 +2,7 @@
 import asyncio
 from dotenv import load_dotenv
 from extract_thinker.document_loader.document_loader_aws_textract import DocumentLoaderAWSTextract
+from extract_thinker.document_loader.document_loader_txt import DocumentLoaderTxt
 from extract_thinker.extractor import Extractor
 from extract_thinker.models.classification_node import ClassificationNode
 from extract_thinker.models.classification_tree import ClassificationTree
@@ -116,7 +117,7 @@ def test_classify_consensus():
     assert result is not None
     assert isinstance(result, ClassificationResponse)
     assert result.name == "Invoice"
-
+ 
 
 def test_classify_higher_order():
     """Test classification using higher order strategy."""
@@ -229,7 +230,69 @@ def test_with_tree():
     current_dir = os.path.dirname(os.path.abspath(__file__))
     pdf_path = os.path.join(current_dir, 'files','invoice.pdf')
 
-    result = process.classify(pdf_path, classification_tree, threshold=0.8)
+    result = process.classify(pdf_path, classification_tree, threshold=7)
 
     assert result is not None
-    assert result.name == "Invoice"
+    assert result.name == "Invoice"
+
+def test_mom_classification_layers():
+    """Test Mixture of Models (MoM) classification with multiple layers."""
+    # Arrange
+    document_loader = DocumentLoaderTxt()
+    
+    # Get test file path 
+    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+    CREDIT_NOTE_PATH = os.path.join(CURRENT_DIR, "files", "ambiguous_credit_note.txt")
+    
+    # Create ambiguous classifications
+    test_classifications = [
+        Classification(
+            name="Receipt",
+            description="A document showing payment received for goods or services, typically including items purchased, amounts, and payment method",
+            contract=InvoiceContract
+        ),
+        Classification(
+            name="Credit Note",
+            description="A document issued to reverse a previous transaction, showing returned items and credit amount, usually referencing an original invoice",
+            contract=CreditNoteContract
+        )
+    ]
+    
+    # Initialize extractors with different models
+    # Layer 1: Small models that might disagree
+    gpt35_extractor = Extractor(document_loader)
+    gpt35_extractor.load_llm("gpt-3.5-turbo")
+    
+    claude_haiku_extractor = Extractor(document_loader)
+    claude_haiku_extractor.load_llm("claude-3-haiku-20240307")
+    
+    # Layer 2: More capable models for resolution
+    gpt4_extractor = Extractor(document_loader)
+    gpt4_extractor.load_llm("gpt-4o")
+    sonnet_extractor = Extractor(document_loader)
+    sonnet_extractor.load_llm("claude-3-5-sonnet-20241022")
+    
+    # Create process with multiple layers
+    process = Process()
+    process.add_classify_extractor([
+        [gpt35_extractor, claude_haiku_extractor],  # Layer 1: Small models
+        [gpt4_extractor, sonnet_extractor]          # Layer 2: Resolution model
+    ])
+    
+    # Test full MoM process (should resolve using Layer 2)
+    final_result = process.classify(
+        CREDIT_NOTE_PATH,
+        test_classifications,
+        strategy=ClassificationStrategy.CONSENSUS_WITH_THRESHOLD,
+        threshold=8
+    )
+    
+    # Print results for debugging
+    print("\nMoM Classification Results:")
+    print(f"Final Classification: {final_result.name}")
+    print(f"Confidence: {final_result.confidence}")
+    
+    # Assertions
+    assert final_result is not None, "MoM should produce a result"
+    assert final_result.name == "Credit Note", "Final classification should be Credit Note"
+    assert final_result.confidence >= 8, "Final confidence should be high"