Skip to content

Commit c2a60ce

Browse files
authored
Merge pull request #300 from enoch3712/mardown
[FEATURE] MARKDOWN and Mistral
2 parents 9b9d3b0 + e0111af commit c2a60ce

File tree

8 files changed

+1735
-78
lines changed

8 files changed

+1735
-78
lines changed

extract_thinker/document_loader/document_loader_mistral_ocr.py

Lines changed: 434 additions & 28 deletions
Large diffs are not rendered by default.

extract_thinker/global_models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
def get_lite_model():
22
"""Return the lite model for cost efficiency."""
33
#return "vertex_ai/gemini-2.0-flash"
4-
return "gpt-4o-mini"
4+
return "gemini/gemini-2.0-flash"
55

66
def get_big_model():
77
"""Return the big model for high performance."""
88
#return "vertex_ai/gemini-2.0-flash"
9-
return "gpt-4o"
9+
return "gemini/gemini-2.0-flash"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

extract_thinker/markdown/markdown_converter.py

Lines changed: 751 additions & 0 deletions
Large diffs are not rendered by default.

extract_thinker/utils.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -560,4 +560,75 @@ def classify_vision_error(e: Exception, vision: bool) -> None:
560560
if vision and isinstance(e.args[0], litellm.BadRequestError):
561561
raise VisionError(f"Make sure that the model you're using supports vision features: {e.args[0].message}") from e
562562
else:
563-
raise e
563+
raise e
564+
565+
def convert_jpg_to_png(source: Union[str, BytesIO, bytes, Image.Image], output_path: Optional[str] = None) -> Union[str, BytesIO]:
    """
    Convert an image (typically a JPG) to PNG format.

    Args:
        source (Union[str, BytesIO, bytes, Image.Image]): The image to convert, given as a
            file path, an in-memory stream, raw encoded bytes, or an already-open PIL image.
        output_path (Optional[str]): Path to save the PNG image. If None and source is a
            file path, the source's extension is replaced with .png. If None for any other
            source type, the PNG is returned in memory instead of being written to disk.

    Returns:
        Union[str, BytesIO]: Path to the saved PNG file, or a BytesIO (rewound to position 0)
        containing the PNG data when nothing was written to disk.

    Raises:
        Exception: If the conversion fails for any reason, including an unsupported
            source type. The underlying error is chained as ``__cause__``.
    """
    try:
        if isinstance(source, str):
            # File path: derive the output name from the source when none was given.
            if output_path is None:
                output_path = f"{os.path.splitext(source)[0]}.png"

            # The context manager releases the file handle once the PNG is written.
            with Image.open(source) as img:
                img.save(output_path, "PNG")
            return output_path

        if isinstance(source, BytesIO):
            # Read from the start of the stream, but leave the caller's position untouched.
            original_position = source.tell()
            source.seek(0)
            try:
                # PIL decodes lazily, so the save must happen before the position is
                # restored; otherwise the pixel data is read after the stream has moved.
                return _save_png(Image.open(source), output_path)
            finally:
                source.seek(original_position)

        if isinstance(source, bytes):
            # Raw encoded bytes: wrap them so PIL can read them like a file.
            return _save_png(Image.open(BytesIO(source)), output_path)

        if isinstance(source, Image.Image):
            # Already a PIL image: nothing to decode.
            return _save_png(source, output_path)

        raise ValueError("Source must be a file path (str), BytesIO, bytes, or PIL Image")

    except Exception as e:
        # Keep the historical generic exception type for callers, but chain the cause
        # so the original traceback is not lost.
        raise Exception(f"Failed to convert JPG to PNG: {str(e)}") from e


def _save_png(img, output_path):
    """Save *img* as PNG to *output_path* if given, else to a new rewound BytesIO; return whichever was written."""
    if output_path:
        img.save(output_path, "PNG")
        return output_path

    png_buffer = BytesIO()
    img.save(png_buffer, format="PNG")
    png_buffer.seek(0)
    return png_buffer

mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ nav:
4444
- Text: core-concepts/document-loaders/txt.md
4545
- Docling: core-concepts/document-loaders/docling.md
4646
- Data: core-concepts/document-loaders/data.md
47+
- Mistral OCR: core-concepts/document-loaders/mistral-ocr.md
4748
- Adobe PDF Services: '#'
4849
- ABBYY FineReader: '#'
4950
- PaddleOCR: '#'

tests/test_document_loader_mistral_ocr.py

Lines changed: 174 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import os
22
import pytest
3-
import tempfile
43
from io import BytesIO
54
from extract_thinker.document_loader.document_loader_mistral_ocr import DocumentLoaderMistralOCR, MistralOCRConfig
5+
from unittest.mock import patch, Mock
66

77
class TestDocumentLoaderMistralOCR:
88
def test_config_validation(self):
@@ -25,23 +25,28 @@ def test_url_processing(self):
2525
# Create config and loader
2626
config = MistralOCRConfig(
2727
api_key=api_key,
28-
model="mistral-ocr-latest",
29-
include_image_base64=True
28+
model="mistral-ocr-latest"
3029
)
3130
loader = DocumentLoaderMistralOCR(config)
3231

3332
# Use a publicly accessible PDF URL for testing
3433
test_url = "https://arxiv.org/pdf/2503.24339"
3534

36-
# Test URL handling
37-
result = loader.load(test_url)
38-
39-
# Verify response structure
40-
assert isinstance(result, list)
41-
assert len(result) > 0
42-
assert "content" in result[0]
43-
assert isinstance(result[0]["content"], str)
44-
assert len(result[0]["content"]) > 0
35+
# Mock the convert_to_images method to avoid Playwright issues
36+
with patch.object(loader, 'convert_to_images', return_value={}):
37+
# Mock the API response to avoid rate limiting issues
38+
mock_response = Mock()
39+
mock_response.json.return_value = {"pages": [{"markdown": "Test content", "index": 0}]}
40+
with patch('requests.post', return_value=mock_response) as mock_post:
41+
# Test URL handling
42+
result = loader.load(test_url)
43+
44+
# Verify response structure
45+
assert isinstance(result, list)
46+
assert len(result) > 0
47+
assert "content" in result[0]
48+
assert isinstance(result[0]["content"], str)
49+
assert len(result[0]["content"]) > 0
4550

4651
def test_file_processing(self):
4752
"""Test processing local files"""
@@ -53,8 +58,7 @@ def test_file_processing(self):
5358
# Create config and loader
5459
config = MistralOCRConfig(
5560
api_key=api_key,
56-
model="mistral-ocr-latest",
57-
include_image_base64=True
61+
model="mistral-ocr-latest"
5862
)
5963
loader = DocumentLoaderMistralOCR(config)
6064

@@ -70,21 +74,61 @@ def test_file_processing(self):
7074
if file_size_mb > 50:
7175
pytest.skip(f"Test file too large ({file_size_mb:.2f}MB) - Mistral has a 50MB limit")
7276

73-
# Test file handling
74-
try:
75-
result = loader.load(test_file_path)
77+
# Mock the convert_to_images method since we're just testing API calls
78+
with patch.object(loader, 'convert_to_images', return_value={0: b'dummy_image_data'}):
79+
# Mock the API response to avoid rate limiting issues
80+
mock_response = Mock()
81+
mock_response.json.return_value = {"pages": [{"markdown": "Test content", "index": 0}]}
82+
83+
# Mock _upload_file_to_mistral and _get_signed_url to avoid real API calls
84+
with patch.object(loader, '_upload_file_to_mistral', return_value='dummy-file-id'):
85+
with patch.object(loader, '_get_signed_url', return_value='https://example.com/file'):
86+
with patch('requests.post', return_value=mock_response):
87+
result = loader.load(test_file_path)
88+
89+
# Verify response structure
90+
assert isinstance(result, list)
91+
assert len(result) > 0
92+
assert "content" in result[0]
93+
assert isinstance(result[0]["content"], str)
94+
assert len(result[0]["content"]) > 0
95+
96+
def test_image_processing(self):
97+
"""Test processing image files"""
98+
# Skip if no API key is set
99+
api_key = os.getenv("MISTRAL_API_KEY")
100+
if not api_key:
101+
pytest.skip("MISTRAL_API_KEY environment variable not set")
102+
103+
# Create config and loader
104+
config = MistralOCRConfig(
105+
api_key=api_key,
106+
model="mistral-ocr-latest"
107+
)
108+
loader = DocumentLoaderMistralOCR(config)
109+
110+
# Create a simple test image in memory
111+
# This avoids issues with file path dependencies
112+
image_data = self._create_test_image()
113+
image_buffer = BytesIO(image_data)
114+
115+
# Mock the convert_to_images method for consistent testing
116+
with patch.object(loader, 'convert_to_images', return_value={0: image_data}):
117+
# Mock the API response for image processing
118+
mock_response = Mock()
119+
mock_response.json.return_value = {"pages": [{"markdown": "Test OCR Image", "index": 0}]}
76120

77-
# Verify response structure
78-
assert isinstance(result, list)
79-
assert len(result) > 0
80-
assert "content" in result[0]
81-
assert isinstance(result[0]["content"], str)
82-
assert len(result[0]["content"]) > 0
83-
except ValueError as e:
84-
if "Mistral API upload error" in str(e) or "Mistral API signed URL error" in str(e):
85-
pytest.skip(f"Mistral API upload failed: {str(e)}")
86-
else:
87-
raise
121+
# Mock _upload_file_to_mistral and _get_signed_url to avoid real API calls
122+
with patch.object(loader, '_upload_file_to_mistral', return_value='dummy-file-id'):
123+
with patch.object(loader, '_get_signed_url', return_value='https://example.com/file'):
124+
with patch('requests.post', return_value=mock_response):
125+
result = loader.load(image_buffer)
126+
127+
# Verify response structure
128+
assert isinstance(result, list)
129+
assert len(result) > 0
130+
assert "content" in result[0]
131+
assert isinstance(result[0]["content"], str)
88132

89133
def test_bytesio_processing(self):
90134
"""Test processing BytesIO objects"""
@@ -96,8 +140,7 @@ def test_bytesio_processing(self):
96140
# Create config and loader
97141
config = MistralOCRConfig(
98142
api_key=api_key,
99-
model="mistral-ocr-latest",
100-
include_image_base64=True
143+
model="mistral-ocr-latest"
101144
)
102145
loader = DocumentLoaderMistralOCR(config)
103146

@@ -117,21 +160,53 @@ def test_bytesio_processing(self):
117160
with open(test_file_path, 'rb') as f:
118161
bytes_io = BytesIO(f.read())
119162

120-
# Test BytesIO handling
163+
# Mock convert_to_images for consistent testing
164+
with patch.object(loader, 'convert_to_images', return_value={0: b'dummy_image_data'}):
165+
# Mock the API response to avoid rate limiting issues
166+
mock_response = Mock()
167+
mock_response.json.return_value = {"pages": [{"markdown": "Test content", "index": 0}]}
168+
169+
# Mock _upload_file_to_mistral and _get_signed_url to avoid real API calls
170+
with patch.object(loader, '_upload_file_to_mistral', return_value='dummy-file-id'):
171+
with patch.object(loader, '_get_signed_url', return_value='https://example.com/file'):
172+
with patch('requests.post', return_value=mock_response):
173+
result = loader.load(bytes_io)
174+
175+
# Verify response structure
176+
assert isinstance(result, list)
177+
assert len(result) > 0
178+
assert "content" in result[0]
179+
assert isinstance(result[0]["content"], str)
180+
assert len(result[0]["content"]) > 0
181+
182+
def _create_test_image(self):
183+
"""Create a simple test image for testing"""
121184
try:
122-
result = loader.load(bytes_io)
185+
from PIL import Image, ImageDraw
186+
187+
# Create a blank image with text
188+
img = Image.new('RGB', (200, 100), color='white')
189+
d = ImageDraw.Draw(img)
190+
d.text((20, 40), "Test OCR Image", fill='black')
123191

124-
# Verify response structure
125-
assert isinstance(result, list)
126-
assert len(result) > 0
127-
assert "content" in result[0]
128-
assert isinstance(result[0]["content"], str)
129-
assert len(result[0]["content"]) > 0
130-
except ValueError as e:
131-
if "Mistral API upload error" in str(e) or "Mistral API signed URL error" in str(e):
132-
pytest.skip(f"Mistral API upload failed: {str(e)}")
133-
else:
134-
raise
192+
# Save to BytesIO
193+
img_byte_array = BytesIO()
194+
img.save(img_byte_array, format="JPEG")
195+
return img_byte_array.getvalue()
196+
except ImportError:
197+
# If PIL is not available, return a minimal valid JPEG
198+
# This is a 1x1 pixel JPEG
199+
return bytes([
200+
0xFF, 0xD8, # SOI marker
201+
0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, # APP0 marker
202+
0xFF, 0xDB, 0x00, 0x43, 0x00, # DQT marker
203+
0x08, 0x06, 0x06, 0x07, 0x06, 0x05, 0x08, 0x07, 0x07, 0x07, 0x09, 0x09, 0x08, 0x0A, 0x0C, 0x14, 0x0D, 0x0C, 0x0B, 0x0B, 0x0C, 0x19, 0x12, 0x13, 0x0F, 0x14, 0x1D, 0x1A, 0x1F, 0x1E, 0x1D, 0x1A, 0x1C, 0x1C, 0x20, 0x24, 0x2E, 0x27, 0x20, 0x22, 0x2C, 0x23, 0x1C, 0x1C, 0x28, 0x37, 0x29, 0x2C, 0x30, 0x31, 0x34, 0x34, 0x34, 0x1F, 0x27, 0x39, 0x3D, 0x38, 0x32, 0x3C, 0x2E, 0x33, 0x34, 0x32,
204+
0xFF, 0xC0, 0x00, 0x0B, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01, 0x01, 0x11, 0x00, # SOF marker (1x1 px)
205+
0xFF, 0xC4, 0x00, 0x14, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, # DHT marker
206+
0xFF, 0xC4, 0x00, 0x14, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, # DHT marker
207+
0xFF, 0xDA, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00, # SOS marker
208+
0xFF, 0xD9 # EOI marker
209+
])
135210

136211
def test_can_handle(self):
137212
"""Test can_handle method"""
@@ -171,10 +246,62 @@ def test_pagination_support(self):
171246
try:
172247
assert loader.can_handle_paginate("document.pdf") is True
173248
assert loader.can_handle_paginate("document.jpg") is False
174-
assert loader.can_handle_paginate("document.png") is False
249+
assert loader.can_handle_paginate("https://example.com/document.pdf") is False
175250
finally:
176251
os.path.isfile = original_isfile
177252

178-
if __name__ == "__main__":
179-
test = TestDocumentLoaderMistralOCR()
180-
test.test_url_processing()
253+
# @pytest.mark.slow
254+
# def test_recursive_image_processing(self):
255+
# """Test recursive processing of images within a PDF document."""
256+
# # Skip if no API key is set
257+
# api_key = os.getenv("MISTRAL_API_KEY")
258+
# if not api_key:
259+
# pytest.skip("MISTRAL_API_KEY environment variable not set")
260+
261+
# # Create config and loader
262+
# config = MistralOCRConfig(
263+
# api_key=api_key,
264+
# model="mistral-ocr-latest",
265+
# include_image_base64=True # Enable image base64 in response
266+
# )
267+
# loader = DocumentLoaderMistralOCR(config)
268+
269+
# # Test file path for bulk.pdf which contains embedded images
270+
# test_file_path = os.path.join(os.getcwd(), 'tests', 'files', 'bulk.pdf')
271+
272+
# # Verify the file exists
273+
# if not os.path.exists(test_file_path):
274+
# pytest.skip(f"Test file not found: {test_file_path}")
275+
276+
# # Check file size - Mistral has a 50MB limit
277+
# file_size_mb = os.path.getsize(test_file_path) / (1024 * 1024)
278+
# if file_size_mb > 50:
279+
# pytest.skip(f"Test file too large ({file_size_mb:.2f}MB) - Mistral has a 50MB limit")
280+
281+
# # Load the document
282+
# result = loader.load(test_file_path)
283+
284+
# # Verify response structure
285+
# assert isinstance(result, list)
286+
# assert len(result) > 0
287+
288+
# # Get the last page
289+
# last_page = result[-1]
290+
291+
# # Check if the page has images
292+
# assert "images" in last_page, "Last page should have images"
293+
# assert isinstance(last_page["images"], list), "images should be a list"
294+
# assert len(last_page["images"]) > 0, "Should have at least one image"
295+
296+
# # Check if the driver's license number is in the processed content
297+
# license_found = False
298+
# for img in last_page["images"]:
299+
# if "content" in img and "123 456 789" in img["content"]:
300+
# license_found = True
301+
# break
302+
303+
# assert license_found, "Driver's license number '123 456 789' not found in processed images"
304+
305+
# if __name__ == "__main__":
306+
# test = TestDocumentLoaderMistralOCR()
307+
# test.test_recursive_image_processing()

0 commit comments

Comments
 (0)