Merge pull request #315 from enoch3712/314-bug-fix-init---add-markdown-elements

enoch3712 · web-flow · commit 681e235b954d · 2025-04-16T14:32:34.000+01:00
Add Markdown elements (MarkdownConverter, PageContent) to the package __init__ exports
diff --git a/extract_thinker/__init__.py b/extract_thinker/__init__.py
@@ -33,6 +33,7 @@
 )
 from .warning import filter_pydantic_v2_warnings
 from .document_loader.document_loader_mistral_ocr import DocumentLoaderMistralOCR, MistralOCRConfig
+from .markdown.markdown_converter import MarkdownConverter, PageContent
 filter_pydantic_v2_warnings()
 
 __all__ = [
@@ -81,4 +82,6 @@
     'BatchJob',
     'DocumentLoaderMistralOCR',
     'MistralOCRConfig',
+    'MarkdownConverter',
+    'PageContent',
 ]
diff --git a/extract_thinker/document_loader/document_loader_mistral_ocr.py b/extract_thinker/document_loader/document_loader_mistral_ocr.py
@@ -31,6 +31,7 @@ class MistralOCRConfig:
         pages: Specific pages to process (optional)
         image_limit: Maximum number of images to extract (optional)
         image_min_size: Minimum image size to extract (optional)
+        allow_image_recursive: Whether to allow recursive image extraction (default: False)
     """
     api_key: str
     model: str = "mistral-ocr-latest"
@@ -40,7 +41,8 @@ class MistralOCRConfig:
     pages: Optional[List[int]] = None
     image_limit: Optional[int] = None
     image_min_size: Optional[int] = None
-
+    allow_image_recursive: bool = False
+    
     def __post_init__(self):
         """Validate configuration after initialization."""
         if not self.api_key:
@@ -576,7 +578,7 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
                     page_dict["dimensions"] = page["dimensions"]
 
                 image_extraction_results = {} # Store img_id -> extracted_text
-                if "images" in page and page["images"]:
+                if "images" in page and page["images"] and self.config.allow_image_recursive:
                     futures = []
                     with ThreadPoolExecutor() as executor:
                         for img in page["images"]:
diff --git a/extract_thinker/markdown/markdown_converter.py b/extract_thinker/markdown/markdown_converter.py
@@ -44,11 +44,21 @@ class MarkdownConverter:
 
 Instructions:
 1. Create proper Markdown with headings, lists, formatting etc.
+   - Format headings with # syntax (# for main headings, ## for sub-headings, etc.)
+   - Format lists with proper bullet points (*, -) or numbers (1., 2., etc.)
+   - Apply proper emphasis using **bold**, *italic*, or `code` where appropriate
+   - Create proper links using [text](url) format if applicable
+   - Format code blocks with triple backticks ``` if applicable
+   - Format tables using proper Markdown table syntax if applicable
+   - Use block quotes with > where appropriate
 2. Include ALL content from the image in your Markdown output.
 3. After the Markdown section, add a JSON block that follows the above schema.
 4. The JSON should break down the Markdown into logical sections with certainty scores.
 5. Make sure the certainty scores accurately reflect your confidence (1-10).
 6. Make sure you keep things like checkboxes, lists, etc. as is.
+7. IMPORTANT: If you see a placeholder for an image (e.g. [img-1.jpeg], <!-- image -->, etc.), NEVER include the placeholder directly in your output. ALWAYS replace it with the actual content or description of what the image shows.
+   Example: Replace [img-1.jpeg] with "Signature of William Smith" or appropriate description of what's in the image.
+8. MAINTAIN ALL THE ORIGINAL CONTENT AND MEANING - do not add or remove information.
 
 Your response format should be:
 
@@ -77,33 +87,19 @@ class MarkdownConverter:
 Convert the image into well-formatted Markdown content.
 
 Instructions:
-1. Create proper Markdown with headings, lists, formatting etc.
-2. Include ALL content from the image in your Markdown output.
-3. Format headings with # syntax (# for main headings, ## for sub-headings, etc.)
-4. Format lists with proper bullet points (*, -) or numbers (1., 2., etc.)
-5. Apply proper emphasis using **bold**, *italic*, or `code` where appropriate
-6. Create proper links using [text](url) format if applicable
-7. Format code blocks with triple backticks ``` if applicable
-8. Format tables using proper Markdown table syntax if applicable
-9. Make sure you keep things like checkboxes, lists, etc. as is.
+Create proper Markdown with headings, lists, formatting etc.
+Include ALL content from the image in your Markdown output.
+Format headings with # syntax (# for main headings, ## for sub-headings, etc.)
+Format lists with proper bullet points (*, -) or numbers (1., 2., etc.)
+Apply proper emphasis using **bold**, *italic*, or `code` where appropriate
+Create proper links using [text](url) format if applicable
+Format code blocks with triple backticks ``` if applicable
+Format tables using proper Markdown table syntax if applicable
+Make sure you keep things like checkboxes, lists, etc. as is.
+IMPORTANT: If you see a placeholder for an image (e.g. [img-1.jpeg], <!-- image -->, etc.), NEVER include the placeholder directly in your output. ALWAYS replace it with the actual content or description of what the image shows.
+Example: Replace [img-1.jpeg] with "Signature of William Smith" or appropriate description of what's in the image.
 
 Your response should ONLY include the formatted Markdown content without any additional JSON structure or explanations.
-"""
-
-    MARKDOWN_VERIFICATION_PROMPT = """
-Look at the image provided and reformat the text content into well-structured Markdown. 
-The text content is already accurate, but may lack proper Markdown formatting. Your task is to:
-
-1. Format headings with # syntax (# for main headings, ## for sub-headings, etc.)
-2. Format lists with proper bullet points (*, -) or numbers (1., 2., etc.)
-3. Apply proper emphasis using **bold**, *italic*, or `code` where appropriate
-4. Create proper links using [text](url) format
-5. Format code blocks with triple backticks ```
-6. Format tables using proper Markdown table syntax if applicable
-7. Use block quotes with > where appropriate
-8. MAINTAIN ALL THE ORIGINAL CONTENT AND MEANING - do not add or remove information
-
-Preserve all the original information while improving its readability through proper Markdown formatting.
 """
 
     PLACEHOLDER_INSTRUCTION = (
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "extract_thinker"
-version = "0.1.11"
+version = "0.1.12"
 description = "Library to extract data from files and documents agnositicaly using LLMs"
 authors = ["Júlio Almeida <enoch3712@gmail.com>"]
 readme = "README.md"

Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,7 @@`
`33`	`33`	`)`
`34`	`34`	`from .warning import filter_pydantic_v2_warnings`
`35`	`35`	`from .document_loader.document_loader_mistral_ocr import DocumentLoaderMistralOCR, MistralOCRConfig`
	`36`	`+from .markdown.markdown_converter import MarkdownConverter, PageContent`
`36`	`37`	`filter_pydantic_v2_warnings()`
`37`	`38`
`38`	`39`	`__all__ = [`
`@@ -81,4 +82,6 @@`
`81`	`82`	`'BatchJob',`
`82`	`83`	`'DocumentLoaderMistralOCR',`
`83`	`84`	`'MistralOCRConfig',`
	`85`	`+ 'MarkdownConverter',`
	`86`	`+ 'PageContent',`
`84`	`87`	`]`