enoch3712
diff --git a/‎extract_thinker/document_loader/document_loader_mistral_ocr.py‎
Lines changed: 434 additions & 28 deletions b/‎extract_thinker/document_loader/document_loader_mistral_ocr.py‎
Lines changed: 434 additions & 28 deletions
diff --git a/‎extract_thinker/global_models.py‎
Lines changed: 2 additions & 2 deletions b/‎extract_thinker/global_models.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎extract_thinker/markdown/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎extract_thinker/markdown/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎extract_thinker/markdown/markdown_converter.py‎
Lines changed: 751 additions & 0 deletions b/‎extract_thinker/markdown/markdown_converter.py‎
Lines changed: 751 additions & 0 deletions
diff --git a/‎extract_thinker/utils.py‎
Lines changed: 72 additions & 1 deletion b/‎extract_thinker/utils.py‎
Lines changed: 72 additions & 1 deletion
diff --git a/‎mkdocs.yml‎
Lines changed: 1 addition & 0 deletions b/‎mkdocs.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/test_document_loader_mistral_ocr.py‎
Lines changed: 174 additions & 47 deletions b/‎tests/test_document_loader_mistral_ocr.py‎
Lines changed: 174 additions & 47 deletions
@@ -1,9 +1,9 @@
 def get_lite_model():
     """Return the lite model for cost efficiency."""
     #return "vertex_ai/gemini-2.0-flash"
-    return "gpt-4o-mini"
+    return "gemini/gemini-2.0-flash"
 
 def get_big_model():
     """Return the big model for high performance."""
     #return "vertex_ai/gemini-2.0-flash"
-    return "gpt-4o"
+    return "gemini/gemini-2.0-flash"
@@ -0,0 +1 @@
+ 
@@ -560,4 +560,75 @@ def classify_vision_error(e: Exception, vision: bool) -> None:
     if vision and isinstance(e.args[0], litellm.BadRequestError):
         raise VisionError(f"Make sure that the model you're using supports vision features: {e.args[0].message}") from e
     else:
-        raise e
+        raise e
+
+def convert_jpg_to_png(source: Union[str, BytesIO, bytes, Image.Image], output_path: Optional[str] = None) -> Union[str, BytesIO]:
+    """
+    Convert JPG to PNG format.
+    
+    Args:
+        source (Union[str, BytesIO, bytes, Image.Image]): The JPG image source.
+        output_path (Optional[str]): Path to save the PNG image. If None and source is a file path,
+                                     replaces the .jpg extension with .png.
+                                     
+    Returns:
+        Union[str, BytesIO]: Path to the saved PNG file or BytesIO containing the PNG data
+    """
+    try:
+        # Handle different source types
+        if isinstance(source, str):
+            # It's a file path
+            img = Image.open(source)
+            
+            # If no output path provided, replace extension
+            if output_path is None:
+                filename, ext = os.path.splitext(source)
+                output_path = f"{filename}.png"
+            
+            # Save as PNG
+            img.save(output_path, "PNG")
+            return output_path
+            
+        elif isinstance(source, BytesIO):
+            # It's a BytesIO object
+            current_position = source.tell()
+            source.seek(0)
+            img = Image.open(source)
+            source.seek(current_position)
+            
+            # Create a new BytesIO for the PNG
+            png_buffer = BytesIO()
+            img.save(png_buffer, format="PNG")
+            png_buffer.seek(0)
+            return png_buffer
+            
+        elif isinstance(source, bytes):
+            # It's raw bytes
+            img = Image.open(BytesIO(source))
+            
+            if output_path:
+                # Save to file
+                img.save(output_path, "PNG")
+                return output_path
+            else:
+                # Return as BytesIO
+                png_buffer = BytesIO()
+                img.save(png_buffer, format="PNG")
+                png_buffer.seek(0)
+                return png_buffer
+                
+        elif isinstance(source, Image.Image):
+            # It's already a PIL Image
+            if output_path:
+                source.save(output_path, "PNG")
+                return output_path
+            else:
+                png_buffer = BytesIO()
+                source.save(png_buffer, format="PNG")
+                png_buffer.seek(0)
+                return png_buffer
+        else:
+            raise ValueError("Source must be a file path (str), BytesIO, bytes, or PIL Image")
+            
+    except Exception as e:
+        raise Exception(f"Failed to convert JPG to PNG: {str(e)}")
@@ -44,6 +44,7 @@ nav:
           - Text: core-concepts/document-loaders/txt.md
           - Docling: core-concepts/document-loaders/docling.md
           - Data: core-concepts/document-loaders/data.md
+          - Mistral OCR: core-concepts/document-loaders/mistral-ocr.md
           - Adobe PDF Services: '#'
           - ABBYY FineReader: '#'
           - PaddleOCR: '#'
 
@@ -1,8 +1,8 @@
 import os
 import pytest
-import tempfile
 from io import BytesIO
 from extract_thinker.document_loader.document_loader_mistral_ocr import DocumentLoaderMistralOCR, MistralOCRConfig
+from unittest.mock import patch, Mock
 
 class TestDocumentLoaderMistralOCR:
     def test_config_validation(self):
@@ -25,23 +25,28 @@ def test_url_processing(self):
         # Create config and loader
         config = MistralOCRConfig(
             api_key=api_key,
-            model="mistral-ocr-latest",
-            include_image_base64=True
+            model="mistral-ocr-latest"
         )
         loader = DocumentLoaderMistralOCR(config)
 
         # Use a publicly accessible PDF URL for testing
         test_url = "https://arxiv.org/pdf/2503.24339"
 
-        # Test URL handling
-        result = loader.load(test_url)
-        
-        # Verify response structure
-        assert isinstance(result, list)
-        assert len(result) > 0
-        assert "content" in result[0]
-        assert isinstance(result[0]["content"], str)
-        assert len(result[0]["content"]) > 0
+        # Mock the convert_to_images method to avoid Playwright issues
+        with patch.object(loader, 'convert_to_images', return_value={}):
+            # Mock the API response to avoid rate limiting issues
+            mock_response = Mock()
+            mock_response.json.return_value = {"pages": [{"markdown": "Test content", "index": 0}]}
+            with patch('requests.post', return_value=mock_response) as mock_post:
+                # Test URL handling
+                result = loader.load(test_url)
+                
+                # Verify response structure
+                assert isinstance(result, list)
+                assert len(result) > 0
+                assert "content" in result[0]
+                assert isinstance(result[0]["content"], str)
+                assert len(result[0]["content"]) > 0
 
     def test_file_processing(self):
         """Test processing local files"""
@@ -53,8 +58,7 @@ def test_file_processing(self):
         # Create config and loader
         config = MistralOCRConfig(
             api_key=api_key,
-            model="mistral-ocr-latest",
-            include_image_base64=True
+            model="mistral-ocr-latest"
         )
         loader = DocumentLoaderMistralOCR(config)
 
@@ -70,21 +74,61 @@ def test_file_processing(self):
         if file_size_mb > 50:
             pytest.skip(f"Test file too large ({file_size_mb:.2f}MB) - Mistral has a 50MB limit")
 
-        # Test file handling
-        try:
-            result = loader.load(test_file_path)
+        # Mock the convert_to_images method since we're just testing API calls
+        with patch.object(loader, 'convert_to_images', return_value={0: b'dummy_image_data'}):
+            # Mock the API response to avoid rate limiting issues
+            mock_response = Mock()
+            mock_response.json.return_value = {"pages": [{"markdown": "Test content", "index": 0}]}
+            
+            # Mock _upload_file_to_mistral and _get_signed_url to avoid real API calls
+            with patch.object(loader, '_upload_file_to_mistral', return_value='dummy-file-id'):
+                with patch.object(loader, '_get_signed_url', return_value='https://example.com/file'):
+                    with patch('requests.post', return_value=mock_response):
+                        result = loader.load(test_file_path)
+                        
+                        # Verify response structure
+                        assert isinstance(result, list)
+                        assert len(result) > 0
+                        assert "content" in result[0]
+                        assert isinstance(result[0]["content"], str)
+                        assert len(result[0]["content"]) > 0
+
+    def test_image_processing(self):
+        """Test processing image files"""
+        # Skip if no API key is set
+        api_key = os.getenv("MISTRAL_API_KEY")
+        if not api_key:
+            pytest.skip("MISTRAL_API_KEY environment variable not set")
+            
+        # Create config and loader
+        config = MistralOCRConfig(
+            api_key=api_key,
+            model="mistral-ocr-latest"
+        )
+        loader = DocumentLoaderMistralOCR(config)
+        
+        # Create a simple test image in memory
+        # This avoids issues with file path dependencies
+        image_data = self._create_test_image()
+        image_buffer = BytesIO(image_data)
+        
+        # Mock the convert_to_images method for consistent testing
+        with patch.object(loader, 'convert_to_images', return_value={0: image_data}):
+            # Mock the API response for image processing
+            mock_response = Mock()
+            mock_response.json.return_value = {"pages": [{"markdown": "Test OCR Image", "index": 0}]}
 
-            # Verify response structure
-            assert isinstance(result, list)
-            assert len(result) > 0
-            assert "content" in result[0]
-            assert isinstance(result[0]["content"], str)
-            assert len(result[0]["content"]) > 0
-        except ValueError as e:
-            if "Mistral API upload error" in str(e) or "Mistral API signed URL error" in str(e):
-                pytest.skip(f"Mistral API upload failed: {str(e)}")
-            else:
-                raise
+            # Mock _upload_file_to_mistral and _get_signed_url to avoid real API calls
+            with patch.object(loader, '_upload_file_to_mistral', return_value='dummy-file-id'):
+                with patch.object(loader, '_get_signed_url', return_value='https://example.com/file'):
+                    with patch('requests.post', return_value=mock_response):
+                        result = loader.load(image_buffer)
+                        
+                        # Verify response structure
+                        assert isinstance(result, list)
+                        assert len(result) > 0
+                        assert "content" in result[0]
+                        assert isinstance(result[0]["content"], str)
 
     def test_bytesio_processing(self):
         """Test processing BytesIO objects"""
@@ -96,8 +140,7 @@ def test_bytesio_processing(self):
         # Create config and loader
         config = MistralOCRConfig(
             api_key=api_key,
-            model="mistral-ocr-latest",
-            include_image_base64=True
+            model="mistral-ocr-latest"
         )
         loader = DocumentLoaderMistralOCR(config)
 
@@ -117,21 +160,53 @@ def test_bytesio_processing(self):
         with open(test_file_path, 'rb') as f:
             bytes_io = BytesIO(f.read())
 
-        # Test BytesIO handling
+        # Mock convert_to_images for consistent testing
+        with patch.object(loader, 'convert_to_images', return_value={0: b'dummy_image_data'}):
+            # Mock the API response to avoid rate limiting issues
+            mock_response = Mock()
+            mock_response.json.return_value = {"pages": [{"markdown": "Test content", "index": 0}]}
+            
+            # Mock _upload_file_to_mistral and _get_signed_url to avoid real API calls
+            with patch.object(loader, '_upload_file_to_mistral', return_value='dummy-file-id'):
+                with patch.object(loader, '_get_signed_url', return_value='https://example.com/file'):
+                    with patch('requests.post', return_value=mock_response):
+                        result = loader.load(bytes_io)
+                        
+                        # Verify response structure
+                        assert isinstance(result, list)
+                        assert len(result) > 0
+                        assert "content" in result[0]
+                        assert isinstance(result[0]["content"], str)
+                        assert len(result[0]["content"]) > 0
+
+    def _create_test_image(self):
+        """Create a simple test image for testing"""
         try:
-            result = loader.load(bytes_io)
+            from PIL import Image, ImageDraw
+            
+            # Create a blank image with text
+            img = Image.new('RGB', (200, 100), color='white')
+            d = ImageDraw.Draw(img)
+            d.text((20, 40), "Test OCR Image", fill='black')
 
-            # Verify response structure
-            assert isinstance(result, list)
-            assert len(result) > 0
-            assert "content" in result[0]
-            assert isinstance(result[0]["content"], str)
-            assert len(result[0]["content"]) > 0
-        except ValueError as e:
-            if "Mistral API upload error" in str(e) or "Mistral API signed URL error" in str(e):
-                pytest.skip(f"Mistral API upload failed: {str(e)}")
-            else:
-                raise
+            # Save to BytesIO
+            img_byte_array = BytesIO()
+            img.save(img_byte_array, format="JPEG")
+            return img_byte_array.getvalue()
+        except ImportError:
+            # If PIL is not available, return a minimal valid JPEG
+            # This is a 1x1 pixel JPEG
+            return bytes([
+                0xFF, 0xD8,                      # SOI marker
+                0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00,  # APP0 marker
+                0xFF, 0xDB, 0x00, 0x43, 0x00,    # DQT marker
+                0x08, 0x06, 0x06, 0x07, 0x06, 0x05, 0x08, 0x07, 0x07, 0x07, 0x09, 0x09, 0x08, 0x0A, 0x0C, 0x14, 0x0D, 0x0C, 0x0B, 0x0B, 0x0C, 0x19, 0x12, 0x13, 0x0F, 0x14, 0x1D, 0x1A, 0x1F, 0x1E, 0x1D, 0x1A, 0x1C, 0x1C, 0x20, 0x24, 0x2E, 0x27, 0x20, 0x22, 0x2C, 0x23, 0x1C, 0x1C, 0x28, 0x37, 0x29, 0x2C, 0x30, 0x31, 0x34, 0x34, 0x34, 0x1F, 0x27, 0x39, 0x3D, 0x38, 0x32, 0x3C, 0x2E, 0x33, 0x34, 0x32,
+                0xFF, 0xC0, 0x00, 0x0B, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01, 0x01, 0x11, 0x00,  # SOF marker (1x1 px)
+                0xFF, 0xC4, 0x00, 0x14, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,  # DHT marker
+                0xFF, 0xC4, 0x00, 0x14, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  # DHT marker
+                0xFF, 0xDA, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00,  # SOS marker
+                0xFF, 0xD9                       # EOI marker
+            ])
 
     def test_can_handle(self):
         """Test can_handle method"""
@@ -171,10 +246,62 @@ def test_pagination_support(self):
         try:
             assert loader.can_handle_paginate("document.pdf") is True
             assert loader.can_handle_paginate("document.jpg") is False
-            assert loader.can_handle_paginate("document.png") is False
+            assert loader.can_handle_paginate("https://example.com/document.pdf") is False
         finally:
             os.path.isfile = original_isfile
 
-if __name__ == "__main__":
-    test = TestDocumentLoaderMistralOCR()
-    test.test_url_processing()
+#     @pytest.mark.slow
+#     def test_recursive_image_processing(self):
+#         """Test recursive processing of images within a PDF document."""
+#         # Skip if no API key is set
+#         api_key = os.getenv("MISTRAL_API_KEY")
+#         if not api_key:
+#             pytest.skip("MISTRAL_API_KEY environment variable not set")
+            
+#         # Create config and loader
+#         config = MistralOCRConfig(
+#             api_key=api_key,
+#             model="mistral-ocr-latest",
+#             include_image_base64=True  # Enable image base64 in response
+#         )
+#         loader = DocumentLoaderMistralOCR(config)
+        
+#         # Test file path for bulk.pdf which contains embedded images
+#         test_file_path = os.path.join(os.getcwd(), 'tests', 'files', 'bulk.pdf')
+        
+#         # Verify the file exists
+#         if not os.path.exists(test_file_path):
+#             pytest.skip(f"Test file not found: {test_file_path}")
+            
+#         # Check file size - Mistral has a 50MB limit
+#         file_size_mb = os.path.getsize(test_file_path) / (1024 * 1024)
+#         if file_size_mb > 50:
+#             pytest.skip(f"Test file too large ({file_size_mb:.2f}MB) - Mistral has a 50MB limit")
+        
+#         # Load the document
+#         result = loader.load(test_file_path)
+        
+#         # Verify response structure
+#         assert isinstance(result, list)
+#         assert len(result) > 0
+        
+#         # Get the last page
+#         last_page = result[-1]
+        
+#         # Check if the page has images
+#         assert "images" in last_page, "Last page should have images"
+#         assert isinstance(last_page["images"], list), "images should be a list"
+#         assert len(last_page["images"]) > 0, "Should have at least one image"
+        
+#         # Check if the driver's license number is in the processed content
+#         license_found = False
+#         for img in last_page["images"]:
+#             if "content" in img and "123 456 789" in img["content"]:
+#                 license_found = True
+#                 break
+        
+#         assert license_found, "Driver's license number '123 456 789' not found in processed images"
+
+# if __name__ == "__main__":
+#     test = TestDocumentLoaderMistralOCR()
+#     test.test_recursive_image_processing()