Skip to content

Commit bd47a1c

Browse files
authored
Merge pull request #201 from enoch3712/200-markitdown-refactor
markdown multi page fix
2 parents e11306b + 24167a8 commit bd47a1c

File tree

2 files changed

+88
-59
lines changed

2 files changed

+88
-59
lines changed

extract_thinker/document_loader/document_loader_markitdown.py

Lines changed: 67 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313

1414
@dataclass
1515
class MarkItDownConfig:
16-
"""Configuration for MarkItDown document loader.
17-
16+
"""
17+
Configuration for MarkItDown document loader.
18+
1819
Args:
1920
content: Initial content (optional)
2021
cache_ttl: Cache time-to-live in seconds (default: 300)
@@ -25,7 +26,6 @@ class MarkItDownConfig:
2526
page_separator: Character used to separate pages (default: form feed '\\f')
2627
preserve_whitespace: Whether to preserve whitespace in text (default: False)
2728
"""
28-
# Optional parameters
2929
content: Optional[Any] = None
3030
cache_ttl: int = 300
3131
llm_client: Optional[Any] = None
@@ -51,8 +51,11 @@ class DocumentLoaderMarkItDown(CachedDocumentLoader):
5151
"""
5252
Document loader that uses MarkItDown to extract content from various file formats.
5353
Supports text extraction and optional image/page rendering in vision mode.
54+
Produces a list of pages, each with:
55+
- "content": text from that page
56+
- "image": optional page/image bytes if vision_mode is True
5457
"""
55-
58+
5659
SUPPORTED_FORMATS = [
5760
"pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx",
5861
"csv", "tsv", "txt", "html", "xml", "json", "zip",
@@ -70,26 +73,25 @@ def __init__(
7073
page_separator: str = '\f',
7174
preserve_whitespace: bool = False
7275
):
73-
"""Initialize loader.
74-
76+
"""
77+
Initialize the loader.
78+
7579
Args:
76-
content_or_config: Either a MarkItDownConfig object or initial content
80+
content_or_config: Either a MarkItDownConfig object or the initial content
7781
cache_ttl: Cache time-to-live in seconds (only used if content_or_config is not MarkItDownConfig)
78-
llm_client: LLM client for enhanced text processing (only used if content_or_config is not MarkItDownConfig)
79-
llm_model: LLM model name to use (only used if content_or_config is not MarkItDownConfig)
80-
mime_type_detection: Whether to use magic for MIME type detection (only used if content_or_config is not MarkItDownConfig)
81-
default_extension: Default file extension when type cannot be determined (only used if content_or_config is not MarkItDownConfig)
82-
page_separator: Character used to separate pages (only used if content_or_config is not MarkItDownConfig)
83-
preserve_whitespace: Whether to preserve whitespace in text (only used if content_or_config is not MarkItDownConfig)
82+
llm_client: LLM client (only used if content_or_config is not MarkItDownConfig)
83+
llm_model: LLM model name (only used if content_or_config is not MarkItDownConfig)
84+
mime_type_detection: Whether to use magic for MIME type detection
85+
default_extension: Default extension if MIME type detection fails
86+
page_separator: Character used to separate pages
87+
preserve_whitespace: Whether to preserve whitespace
8488
"""
85-
# Check dependencies before initializing
8689
self._check_dependencies()
87-
88-
# Handle both config-based and old-style initialization
90+
91+
# Handle config object vs. old-style params
8992
if isinstance(content_or_config, MarkItDownConfig):
9093
self.config = content_or_config
9194
else:
92-
# Create config from individual parameters
9395
self.config = MarkItDownConfig(
9496
content=content_or_config,
9597
cache_ttl=cache_ttl,
@@ -100,8 +102,10 @@ def __init__(
100102
page_separator=page_separator,
101103
preserve_whitespace=preserve_whitespace
102104
)
103-
105+
104106
super().__init__(self.config.content, self.config.cache_ttl)
107+
108+
# MarkItDown object
105109
self.markitdown = self._get_markitdown()(
106110
llm_client=self.config.llm_client,
107111
llm_model=self.config.llm_model
@@ -114,82 +118,87 @@ def _check_dependencies():
114118
import markitdown
115119
except ImportError:
116120
raise ImportError(
117-
"Could not import markitdown package. "
121+
"Could not import the 'markitdown' package. "
118122
"Please install it with `pip install markitdown`."
119123
)
120124

121125
def _get_markitdown(self):
122-
"""Lazy load MarkItDown."""
123-
try:
124-
from markitdown import MarkItDown
125-
return MarkItDown
126-
except ImportError:
127-
raise ImportError(
128-
"Could not import markitdown python package. "
129-
"Please install it with `pip install markitdown`."
130-
)
126+
"""Lazy-import MarkItDown class."""
127+
from markitdown import MarkItDown
128+
return MarkItDown
131129

132130
def _process_text(self, text: str) -> str:
133-
"""Process text according to configuration."""
134-
if not self.config.preserve_whitespace:
135-
text = text.strip()
136-
return text
131+
"""Apply any additional text processing (e.g., strip whitespace)."""
132+
return text if self.config.preserve_whitespace else text.strip()
137133

138134
@cachedmethod(cache=attrgetter('cache'),
139-
key=lambda self, source: hashkey(source if isinstance(source, str) else source.getvalue(), self.vision_mode))
135+
key=lambda self, source: hashkey(
136+
source if isinstance(source, str) else source.getvalue(),
137+
self.vision_mode
138+
))
140139
def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
141140
"""
142-
Load and process content using MarkItDown.
143-
Returns a list of pages, each containing:
144-
- content: The text content
145-
- image: The page/image bytes if vision_mode is True
146-
141+
Load and process the source with MarkItDown, returning a list of pages.
142+
147143
Args:
148-
source: Either a file path or BytesIO stream
149-
144+
source: A file path or a BytesIO stream
145+
150146
Returns:
151-
List[Dict[str, Any]]: List of pages with content and optional images
147+
A list of dictionaries where each dict is one "page" of text.
148+
- "content": The text content (str)
149+
- "image": Page image bytes (key only present if vision_mode is True and an image exists for that page index)
152150
"""
153151
if not self.can_handle(source):
154152
raise ValueError(f"Cannot handle source: {source}")
155153

154+
# Basic check for vision mode
156155
if self.vision_mode and not self.can_handle_vision(source):
157156
raise ValueError(f"Cannot handle source in vision mode: {source}")
158157

159158
try:
160-
# Extract text content using MarkItDown
159+
# Convert the file or stream with MarkItDown
161160
if isinstance(source, str):
161+
# File path
162162
result = self.markitdown.convert(source)
163163
else:
164-
# For BytesIO, we need to determine the file type
164+
# BytesIO
165165
source.seek(0)
166166
if self.config.mime_type_detection:
167167
mime = magic.from_buffer(source.getvalue(), mime=True)
168-
ext = next((ext for ext, mime_types in MIME_TYPE_MAPPING.items()
169-
if mime in (mime_types if isinstance(mime_types, list) else [mime_types])),
170-
self.config.default_extension)
168+
# Attempt to deduce extension from MIME type
169+
ext = next(
170+
(
171+
e
172+
for e, mime_list in MIME_TYPE_MAPPING.items()
173+
if mime in (mime_list if isinstance(mime_list, list) else [mime_list])
174+
),
175+
self.config.default_extension
176+
)
171177
else:
172178
ext = self.config.default_extension
173179
result = self.markitdown.convert_stream(source, file_extension=f".{ext}")
174180
source.seek(0)
175181

182+
# Full text from MarkItDown
176183
text_content = result.text_content
184+
if not text_content:
185+
text_content = ""
177186

178-
# Split into pages if supported
179-
pages = []
180-
if self.can_handle_paginate(source):
181-
raw_pages = text_content.split(self.config.page_separator)
182-
for page_text in raw_pages:
183-
processed_text = self._process_text(page_text)
184-
if processed_text or self.config.preserve_whitespace:
185-
pages.append({"content": processed_text})
186-
else:
187-
processed_text = self._process_text(text_content)
188-
pages = [{"content": processed_text}]
187+
# Split text content into pages (based on config.page_separator)
188+
raw_pages = text_content.split(self.config.page_separator)
189189

190-
# Add images in vision mode
190+
pages = []
191+
for page_text in raw_pages:
192+
processed = self._process_text(page_text)
193+
# Always include the page if preserve_whitespace is True,
194+
# or if there's any non-empty text.
195+
if processed or self.config.preserve_whitespace:
196+
pages.append({"content": processed})
197+
198+
# In vision mode, attach rendered images if applicable
191199
if self.vision_mode:
192200
images_dict = self.convert_to_images(source)
201+
# Match up page images by index
193202
for idx, page_dict in enumerate(pages):
194203
if idx in images_dict:
195204
page_dict["image"] = images_dict[idx]

tests/test_document_loader_markitdown.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,4 +141,24 @@ def test_simple_initialization(self, test_file_path):
141141
pages = loader.load(stream)
142142
assert len(pages) > 0
143143
assert isinstance(pages[0]["content"], str)
144-
144+
145+
146+
def test_page_separator_splitting():
147+
"""
148+
Test that multiple pages are correctly separated when loading a multi-page PDF.
149+
Uses Regional_GDP_per_capita_2018_2.pdf, which should contain 2 distinct pages.
150+
"""
151+
# Get path to the multi-page PDF test file
152+
current_dir = os.path.dirname(os.path.abspath(__file__))
153+
bulk_pdf_path = os.path.join(current_dir, 'files', 'Regional_GDP_per_capita_2018_2.pdf')
154+
155+
# Test without MIME type detection
156+
config = MarkItDownConfig(
157+
mime_type_detection=False,
158+
default_extension='pdf'
159+
)
160+
loader = DocumentLoaderMarkItDown(config)
161+
pages = loader.load(bulk_pdf_path)
162+
163+
# Verify we get exactly 2 pages
164+
assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"

0 commit comments

Comments
 (0)