11import os
22import pytest
3- import tempfile
43from io import BytesIO
54from extract_thinker .document_loader .document_loader_mistral_ocr import DocumentLoaderMistralOCR , MistralOCRConfig
5+ from unittest .mock import patch , Mock
66
77class TestDocumentLoaderMistralOCR :
88 def test_config_validation (self ):
@@ -25,23 +25,28 @@ def test_url_processing(self):
2525 # Create config and loader
2626 config = MistralOCRConfig (
2727 api_key = api_key ,
28- model = "mistral-ocr-latest" ,
29- include_image_base64 = True
28+ model = "mistral-ocr-latest"
3029 )
3130 loader = DocumentLoaderMistralOCR (config )
3231
3332 # Use a publicly accessible PDF URL for testing
3433 test_url = "https://arxiv.org/pdf/2503.24339"
3534
36- # Test URL handling
37- result = loader .load (test_url )
38-
39- # Verify response structure
40- assert isinstance (result , list )
41- assert len (result ) > 0
42- assert "content" in result [0 ]
43- assert isinstance (result [0 ]["content" ], str )
44- assert len (result [0 ]["content" ]) > 0
35+ # Mock the convert_to_images method to avoid Playwright issues
36+ with patch .object (loader , 'convert_to_images' , return_value = {}):
37+ # Mock the API response to avoid rate limiting issues
38+ mock_response = Mock ()
39+ mock_response .json .return_value = {"pages" : [{"markdown" : "Test content" , "index" : 0 }]}
40+ with patch ('requests.post' , return_value = mock_response ) as mock_post :
41+ # Test URL handling
42+ result = loader .load (test_url )
43+
44+ # Verify response structure
45+ assert isinstance (result , list )
46+ assert len (result ) > 0
47+ assert "content" in result [0 ]
48+ assert isinstance (result [0 ]["content" ], str )
49+ assert len (result [0 ]["content" ]) > 0
4550
4651 def test_file_processing (self ):
4752 """Test processing local files"""
@@ -53,8 +58,7 @@ def test_file_processing(self):
5358 # Create config and loader
5459 config = MistralOCRConfig (
5560 api_key = api_key ,
56- model = "mistral-ocr-latest" ,
57- include_image_base64 = True
61+ model = "mistral-ocr-latest"
5862 )
5963 loader = DocumentLoaderMistralOCR (config )
6064
@@ -70,21 +74,61 @@ def test_file_processing(self):
7074 if file_size_mb > 50 :
7175 pytest .skip (f"Test file too large ({ file_size_mb :.2f} MB) - Mistral has a 50MB limit" )
7276
73- # Test file handling
74- try :
75- result = loader .load (test_file_path )
77+ # Mock the convert_to_images method since we're just testing API calls
78+ with patch .object (loader , 'convert_to_images' , return_value = {0 : b'dummy_image_data' }):
79+ # Mock the API response to avoid rate limiting issues
80+ mock_response = Mock ()
81+ mock_response .json .return_value = {"pages" : [{"markdown" : "Test content" , "index" : 0 }]}
82+
83+ # Mock _upload_file_to_mistral and _get_signed_url to avoid real API calls
84+ with patch .object (loader , '_upload_file_to_mistral' , return_value = 'dummy-file-id' ):
85+ with patch .object (loader , '_get_signed_url' , return_value = 'https://example.com/file' ):
86+ with patch ('requests.post' , return_value = mock_response ):
87+ result = loader .load (test_file_path )
88+
89+ # Verify response structure
90+ assert isinstance (result , list )
91+ assert len (result ) > 0
92+ assert "content" in result [0 ]
93+ assert isinstance (result [0 ]["content" ], str )
94+ assert len (result [0 ]["content" ]) > 0
95+
96+ def test_image_processing (self ):
97+ """Test processing image files"""
98+ # Skip if no API key is set
99+ api_key = os .getenv ("MISTRAL_API_KEY" )
100+ if not api_key :
101+ pytest .skip ("MISTRAL_API_KEY environment variable not set" )
102+
103+ # Create config and loader
104+ config = MistralOCRConfig (
105+ api_key = api_key ,
106+ model = "mistral-ocr-latest"
107+ )
108+ loader = DocumentLoaderMistralOCR (config )
109+
110+ # Create a simple test image in memory
111+ # This avoids issues with file path dependencies
112+ image_data = self ._create_test_image ()
113+ image_buffer = BytesIO (image_data )
114+
115+ # Mock the convert_to_images method for consistent testing
116+ with patch .object (loader , 'convert_to_images' , return_value = {0 : image_data }):
117+ # Mock the API response for image processing
118+ mock_response = Mock ()
119+ mock_response .json .return_value = {"pages" : [{"markdown" : "Test OCR Image" , "index" : 0 }]}
76120
77- # Verify response structure
78- assert isinstance ( result , list )
79- assert len ( result ) > 0
80- assert "content" in result [ 0 ]
81- assert isinstance ( result [ 0 ][ "content" ], str )
82- assert len ( result [ 0 ][ "content" ]) > 0
83- except ValueError as e :
84- if "Mistral API upload error" in str ( e ) or "Mistral API signed URL error" in str ( e ):
85- pytest . skip ( f"Mistral API upload failed: { str ( e ) } " )
86- else :
87- raise
121+ # Mock _upload_file_to_mistral and _get_signed_url to avoid real API calls
122+ with patch . object ( loader , '_upload_file_to_mistral' , return_value = 'dummy-file-id' ):
123+ with patch . object ( loader , '_get_signed_url' , return_value = 'https://example.com/file' ):
124+ with patch ( 'requests.post' , return_value = mock_response ):
125+ result = loader . load ( image_buffer )
126+
127+ # Verify response structure
128+ assert isinstance ( result , list )
129+ assert len ( result ) > 0
130+ assert "content" in result [ 0 ]
131+ assert isinstance ( result [ 0 ][ "content" ], str )
88132
89133 def test_bytesio_processing (self ):
90134 """Test processing BytesIO objects"""
@@ -96,8 +140,7 @@ def test_bytesio_processing(self):
96140 # Create config and loader
97141 config = MistralOCRConfig (
98142 api_key = api_key ,
99- model = "mistral-ocr-latest" ,
100- include_image_base64 = True
143+ model = "mistral-ocr-latest"
101144 )
102145 loader = DocumentLoaderMistralOCR (config )
103146
@@ -117,21 +160,53 @@ def test_bytesio_processing(self):
117160 with open (test_file_path , 'rb' ) as f :
118161 bytes_io = BytesIO (f .read ())
119162
120- # Test BytesIO handling
163+ # Mock convert_to_images for consistent testing
164+ with patch .object (loader , 'convert_to_images' , return_value = {0 : b'dummy_image_data' }):
165+ # Mock the API response to avoid rate limiting issues
166+ mock_response = Mock ()
167+ mock_response .json .return_value = {"pages" : [{"markdown" : "Test content" , "index" : 0 }]}
168+
169+ # Mock _upload_file_to_mistral and _get_signed_url to avoid real API calls
170+ with patch .object (loader , '_upload_file_to_mistral' , return_value = 'dummy-file-id' ):
171+ with patch .object (loader , '_get_signed_url' , return_value = 'https://example.com/file' ):
172+ with patch ('requests.post' , return_value = mock_response ):
173+ result = loader .load (bytes_io )
174+
175+ # Verify response structure
176+ assert isinstance (result , list )
177+ assert len (result ) > 0
178+ assert "content" in result [0 ]
179+ assert isinstance (result [0 ]["content" ], str )
180+ assert len (result [0 ]["content" ]) > 0
181+
182+ def _create_test_image (self ):
183+ """Create a simple test image for testing"""
121184 try :
122- result = loader .load (bytes_io )
185+ from PIL import Image , ImageDraw
186+
187+ # Create a blank image with text
188+ img = Image .new ('RGB' , (200 , 100 ), color = 'white' )
189+ d = ImageDraw .Draw (img )
190+ d .text ((20 , 40 ), "Test OCR Image" , fill = 'black' )
123191
124- # Verify response structure
125- assert isinstance (result , list )
126- assert len (result ) > 0
127- assert "content" in result [0 ]
128- assert isinstance (result [0 ]["content" ], str )
129- assert len (result [0 ]["content" ]) > 0
130- except ValueError as e :
131- if "Mistral API upload error" in str (e ) or "Mistral API signed URL error" in str (e ):
132- pytest .skip (f"Mistral API upload failed: { str (e )} " )
133- else :
134- raise
192+ # Save to BytesIO
193+ img_byte_array = BytesIO ()
194+ img .save (img_byte_array , format = "JPEG" )
195+ return img_byte_array .getvalue ()
196+ except ImportError :
197+ # If PIL is not available, return a minimal valid JPEG
198+ # This is a 1x1 pixel JPEG
199+ return bytes ([
200+ 0xFF , 0xD8 , # SOI marker
201+ 0xFF , 0xE0 , 0x00 , 0x10 , 0x4A , 0x46 , 0x49 , 0x46 , 0x00 , 0x01 , 0x01 , 0x01 , 0x00 , 0x48 , 0x00 , 0x48 , 0x00 , 0x00 , # APP0 marker
202+ 0xFF , 0xDB , 0x00 , 0x43 , 0x00 , # DQT marker
203+ 0x08 , 0x06 , 0x06 , 0x07 , 0x06 , 0x05 , 0x08 , 0x07 , 0x07 , 0x07 , 0x09 , 0x09 , 0x08 , 0x0A , 0x0C , 0x14 , 0x0D , 0x0C , 0x0B , 0x0B , 0x0C , 0x19 , 0x12 , 0x13 , 0x0F , 0x14 , 0x1D , 0x1A , 0x1F , 0x1E , 0x1D , 0x1A , 0x1C , 0x1C , 0x20 , 0x24 , 0x2E , 0x27 , 0x20 , 0x22 , 0x2C , 0x23 , 0x1C , 0x1C , 0x28 , 0x37 , 0x29 , 0x2C , 0x30 , 0x31 , 0x34 , 0x34 , 0x34 , 0x1F , 0x27 , 0x39 , 0x3D , 0x38 , 0x32 , 0x3C , 0x2E , 0x33 , 0x34 , 0x32 ,
204+ 0xFF , 0xC0 , 0x00 , 0x0B , 0x08 , 0x00 , 0x01 , 0x00 , 0x01 , 0x01 , 0x01 , 0x11 , 0x00 , # SOF marker (1x1 px)
205+ 0xFF , 0xC4 , 0x00 , 0x14 , 0x00 , 0x01 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x09 , # DHT marker
206+ 0xFF , 0xC4 , 0x00 , 0x14 , 0x10 , 0x01 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , # DHT marker
207+ 0xFF , 0xDA , 0x00 , 0x08 , 0x01 , 0x01 , 0x00 , 0x00 , 0x3F , 0x00 , 0x00 , 0x00 , 0x00 , # SOS marker
208+ 0xFF , 0xD9 # EOI marker
209+ ])
135210
136211 def test_can_handle (self ):
137212 """Test can_handle method"""
@@ -171,10 +246,62 @@ def test_pagination_support(self):
171246 try :
172247 assert loader .can_handle_paginate ("document.pdf" ) is True
173248 assert loader .can_handle_paginate ("document.jpg" ) is False
174- assert loader .can_handle_paginate ("document.png " ) is False
249+ assert loader .can_handle_paginate ("https://example.com/ document.pdf " ) is False
175250 finally :
176251 os .path .isfile = original_isfile
177252
178- if __name__ == "__main__" :
179- test = TestDocumentLoaderMistralOCR ()
180- test .test_url_processing ()
253+ # @pytest.mark.slow
254+ # def test_recursive_image_processing(self):
255+ # """Test recursive processing of images within a PDF document."""
256+ # # Skip if no API key is set
257+ # api_key = os.getenv("MISTRAL_API_KEY")
258+ # if not api_key:
259+ # pytest.skip("MISTRAL_API_KEY environment variable not set")
260+
261+ # # Create config and loader
262+ # config = MistralOCRConfig(
263+ # api_key=api_key,
264+ # model="mistral-ocr-latest",
265+ # include_image_base64=True # Enable image base64 in response
266+ # )
267+ # loader = DocumentLoaderMistralOCR(config)
268+
269+ # # Test file path for bulk.pdf which contains embedded images
270+ # test_file_path = os.path.join(os.getcwd(), 'tests', 'files', 'bulk.pdf')
271+
272+ # # Verify the file exists
273+ # if not os.path.exists(test_file_path):
274+ # pytest.skip(f"Test file not found: {test_file_path}")
275+
276+ # # Check file size - Mistral has a 50MB limit
277+ # file_size_mb = os.path.getsize(test_file_path) / (1024 * 1024)
278+ # if file_size_mb > 50:
279+ # pytest.skip(f"Test file too large ({file_size_mb:.2f}MB) - Mistral has a 50MB limit")
280+
281+ # # Load the document
282+ # result = loader.load(test_file_path)
283+
284+ # # Verify response structure
285+ # assert isinstance(result, list)
286+ # assert len(result) > 0
287+
288+ # # Get the last page
289+ # last_page = result[-1]
290+
291+ # # Check if the page has images
292+ # assert "images" in last_page, "Last page should have images"
293+ # assert isinstance(last_page["images"], list), "images should be a list"
294+ # assert len(last_page["images"]) > 0, "Should have at least one image"
295+
296+ # # Check if the driver's license number is in the processed content
297+ # license_found = False
298+ # for img in last_page["images"]:
299+ # if "content" in img and "123 456 789" in img["content"]:
300+ # license_found = True
301+ # break
302+
303+ # assert license_found, "Driver's license number '123 456 789' not found in processed images"
304+
305+ # if __name__ == "__main__":
306+ # test = TestDocumentLoaderMistralOCR()
307+ # test.test_recursive_image_processing()
0 commit comments