Skip to content

Commit 681e235

Browse files
authored
Merge pull request #315 from enoch3712/314-bug-fix-init---add-markdown-elements
add elements to init
2 parents a2b3da6 + 1239414 commit 681e235

File tree

4 files changed

+29
-28
lines changed

4 files changed

+29
-28
lines changed

extract_thinker/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
)
3434
from .warning import filter_pydantic_v2_warnings
3535
from .document_loader.document_loader_mistral_ocr import DocumentLoaderMistralOCR, MistralOCRConfig
36+
from .markdown.markdown_converter import MarkdownConverter, PageContent
3637
filter_pydantic_v2_warnings()
3738

3839
__all__ = [
@@ -81,4 +82,6 @@
8182
'BatchJob',
8283
'DocumentLoaderMistralOCR',
8384
'MistralOCRConfig',
85+
'MarkdownConverter',
86+
'PageContent',
8487
]

extract_thinker/document_loader/document_loader_mistral_ocr.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class MistralOCRConfig:
3131
pages: Specific pages to process (optional)
3232
image_limit: Maximum number of images to extract (optional)
3333
image_min_size: Minimum image size to extract (optional)
34+
allow_image_recursive: Whether to allow recursive image extraction (default: False)
3435
"""
3536
api_key: str
3637
model: str = "mistral-ocr-latest"
@@ -40,7 +41,8 @@ class MistralOCRConfig:
4041
pages: Optional[List[int]] = None
4142
image_limit: Optional[int] = None
4243
image_min_size: Optional[int] = None
43-
44+
allow_image_recursive: bool = False
45+
4446
def __post_init__(self):
4547
"""Validate configuration after initialization."""
4648
if not self.api_key:
@@ -576,7 +578,7 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
576578
page_dict["dimensions"] = page["dimensions"]
577579

578580
image_extraction_results = {} # Store img_id -> extracted_text
579-
if "images" in page and page["images"]:
581+
if "images" in page and page["images"] and self.config.allow_image_recursive:
580582
futures = []
581583
with ThreadPoolExecutor() as executor:
582584
for img in page["images"]:

extract_thinker/markdown/markdown_converter.py

+21-25
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,21 @@ class MarkdownConverter:
4444
4545
Instructions:
4646
1. Create proper Markdown with headings, lists, formatting etc.
47+
- Format headings with # syntax (# for main headings, ## for sub-headings, etc.)
48+
- Format lists with proper bullet points (*, -) or numbers (1., 2., etc.)
49+
- Apply proper emphasis using **bold**, *italic*, or `code` where appropriate
50+
- Create proper links using [text](url) format if applicable
51+
- Format code blocks with triple backticks ``` if applicable
52+
- Format tables using proper Markdown table syntax if applicable
53+
- Use block quotes with > where appropriate
4754
2. Include ALL content from the image in your Markdown output.
4855
3. After the Markdown section, add a JSON block that follows the above schema.
4956
4. The JSON should break down the Markdown into logical sections with certainty scores.
5057
5. Make sure the certainty scores accurately reflect your confidence (1-10).
5158
6. Make sure you keep things like checkboxes, lists, etc. as is.
59+
7. IMPORTANT: If you see a placeholder for an image (e.g. [img-1.jpeg], <!-- image -->, etc.), NEVER include the placeholder directly in your output. ALWAYS replace it with the actual content or description of what the image shows.
60+
Example: Replace [img-1.jpeg] with "Signature of William Smith" or appropriate description of what's in the image.
61+
8. MAINTAIN ALL THE ORIGINAL CONTENT AND MEANING - do not add or remove information.
5262
5363
Your response format should be:
5464
@@ -77,33 +87,19 @@ class MarkdownConverter:
7787
Convert the image into well-formatted Markdown content.
7888
7989
Instructions:
80-
1. Create proper Markdown with headings, lists, formatting etc.
81-
2. Include ALL content from the image in your Markdown output.
82-
3. Format headings with # syntax (# for main headings, ## for sub-headings, etc.)
83-
4. Format lists with proper bullet points (*, -) or numbers (1., 2., etc.)
84-
5. Apply proper emphasis using **bold**, *italic*, or `code` where appropriate
85-
6. Create proper links using [text](url) format if applicable
86-
7. Format code blocks with triple backticks ``` if applicable
87-
8. Format tables using proper Markdown table syntax if applicable
88-
9. Make sure you keep things like checkboxes, lists, etc. as is.
90+
Create proper Markdown with headings, lists, formatting etc.
91+
Include ALL content from the image in your Markdown output.
92+
Format headings with # syntax (# for main headings, ## for sub-headings, etc.)
93+
Format lists with proper bullet points (*, -) or numbers (1., 2., etc.)
94+
Apply proper emphasis using **bold**, *italic*, or `code` where appropriate
95+
Create proper links using [text](url) format if applicable
96+
Format code blocks with triple backticks ``` if applicable
97+
Format tables using proper Markdown table syntax if applicable
98+
Make sure you keep things like checkboxes, lists, etc. as is.
99+
IMPORTANT: If you see a placeholder for an image (e.g. [img-1.jpeg], <!-- image -->, etc.), NEVER include the placeholder directly in your output. ALWAYS replace it with the actual content or description of what the image shows.
100+
Example: Replace [img-1.jpeg] with "Signature of William Smith" or appropriate description of what's in the image.
89101
90102
Your response should ONLY include the formatted Markdown content without any additional JSON structure or explanations.
91-
"""
92-
93-
MARKDOWN_VERIFICATION_PROMPT = """
94-
Look at the image provided and reformat the text content into well-structured Markdown.
95-
The text content is already accurate, but may lack proper Markdown formatting. Your task is to:
96-
97-
1. Format headings with # syntax (# for main headings, ## for sub-headings, etc.)
98-
2. Format lists with proper bullet points (*, -) or numbers (1., 2., etc.)
99-
3. Apply proper emphasis using **bold**, *italic*, or `code` where appropriate
100-
4. Create proper links using [text](url) format
101-
5. Format code blocks with triple backticks ```
102-
6. Format tables using proper Markdown table syntax if applicable
103-
7. Use block quotes with > where appropriate
104-
8. MAINTAIN ALL THE ORIGINAL CONTENT AND MEANING - do not add or remove information
105-
106-
Preserve all the original information while improving its readability through proper Markdown formatting.
107103
"""
108104

109105
PLACEHOLDER_INSTRUCTION = (

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "extract_thinker"
3-
version = "0.1.11"
3+
version = "0.1.12"
44
description = "Library to extract data from files and documents agnostically using LLMs"
55
authors = ["Júlio Almeida <[email protected]>"]
66
readme = "README.md"

0 commit comments

Comments
 (0)