1313
1414@dataclass
1515class MarkItDownConfig :
16- """Configuration for MarkItDown document loader.
17-
16+ """
17+ Configuration for MarkItDown document loader.
18+
1819 Args:
1920 content: Initial content (optional)
2021 cache_ttl: Cache time-to-live in seconds (default: 300)
@@ -25,7 +26,6 @@ class MarkItDownConfig:
2526 page_separator: Character used to separate pages (default: form feed '\\ f')
2627 preserve_whitespace: Whether to preserve whitespace in text (default: False)
2728 """
28- # Optional parameters
2929 content : Optional [Any ] = None
3030 cache_ttl : int = 300
3131 llm_client : Optional [Any ] = None
@@ -51,8 +51,11 @@ class DocumentLoaderMarkItDown(CachedDocumentLoader):
5151 """
5252 Document loader that uses MarkItDown to extract content from various file formats.
5353 Supports text extraction and optional image/page rendering in vision mode.
54+ Produces a list of pages, each with:
55+ - "content": text from that page
56+ - "image": optional page/image bytes if vision_mode is True
5457 """
55-
58+
5659 SUPPORTED_FORMATS = [
5760 "pdf" , "doc" , "docx" , "ppt" , "pptx" , "xls" , "xlsx" ,
5861 "csv" , "tsv" , "txt" , "html" , "xml" , "json" , "zip" ,
@@ -70,26 +73,25 @@ def __init__(
7073 page_separator : str = '\f ' ,
7174 preserve_whitespace : bool = False
7275 ):
73- """Initialize loader.
74-
76+ """
77+ Initialize the loader.
78+
7579 Args:
76- content_or_config: Either a MarkItDownConfig object or initial content
80+ content_or_config: Either a MarkItDownConfig object or the initial content
7781 cache_ttl: Cache time-to-live in seconds (only used if content_or_config is not MarkItDownConfig)
78- llm_client: LLM client for enhanced text processing (only used if content_or_config is not MarkItDownConfig)
79- llm_model: LLM model name to use (only used if content_or_config is not MarkItDownConfig)
80- mime_type_detection: Whether to use magic for MIME type detection (only used if content_or_config is not MarkItDownConfig)
81- default_extension: Default file extension when type cannot be determined (only used if content_or_config is not MarkItDownConfig)
82- page_separator: Character used to separate pages (only used if content_or_config is not MarkItDownConfig)
83- preserve_whitespace: Whether to preserve whitespace in text (only used if content_or_config is not MarkItDownConfig)
82+ llm_client: LLM client (only used if content_or_config is not MarkItDownConfig)
83+ llm_model: LLM model name (only used if content_or_config is not MarkItDownConfig)
84+ mime_type_detection: Whether to use magic for MIME type detection
85+ default_extension: Default extension if MIME type detection fails
86+ page_separator: Character used to separate pages
87+ preserve_whitespace: Whether to preserve whitespace
8488 """
85- # Check dependencies before initializing
8689 self ._check_dependencies ()
87-
88- # Handle both config-based and old-style initialization
90+
91+ # Handle config object vs. old-style params
8992 if isinstance (content_or_config , MarkItDownConfig ):
9093 self .config = content_or_config
9194 else :
92- # Create config from individual parameters
9395 self .config = MarkItDownConfig (
9496 content = content_or_config ,
9597 cache_ttl = cache_ttl ,
@@ -100,8 +102,10 @@ def __init__(
100102 page_separator = page_separator ,
101103 preserve_whitespace = preserve_whitespace
102104 )
103-
105+
104106 super ().__init__ (self .config .content , self .config .cache_ttl )
107+
108+ # MarkItDown object
105109 self .markitdown = self ._get_markitdown ()(
106110 llm_client = self .config .llm_client ,
107111 llm_model = self .config .llm_model
@@ -114,82 +118,87 @@ def _check_dependencies():
114118 import markitdown
115119 except ImportError :
116120 raise ImportError (
117- "Could not import markitdown package. "
121+ "Could not import the ' markitdown' package. "
118122 "Please install it with `pip install markitdown`."
119123 )
120124
def _get_markitdown(self):
    """Return the ``MarkItDown`` class, importing it only when called.

    The import lives inside the method body, so the ``markitdown``
    package is loaded at call time rather than when this module is
    imported.

    Returns:
        The ``MarkItDown`` class object itself (not an instance); the
        caller is responsible for instantiating it.

    Raises:
        ImportError: Propagated unchanged if ``markitdown`` (or its
            ``MarkItDown`` attribute) cannot be imported.
    """
    from markitdown import MarkItDown as markitdown_cls

    return markitdown_cls
131129
132130 def _process_text (self , text : str ) -> str :
133- """Process text according to configuration."""
134- if not self .config .preserve_whitespace :
135- text = text .strip ()
136- return text
131+ """Apply any additional text processing (e.g., strip whitespace)."""
132+ return text if self .config .preserve_whitespace else text .strip ()
137133
138134 @cachedmethod (cache = attrgetter ('cache' ),
139- key = lambda self , source : hashkey (source if isinstance (source , str ) else source .getvalue (), self .vision_mode ))
135+ key = lambda self , source : hashkey (
136+ source if isinstance (source , str ) else source .getvalue (),
137+ self .vision_mode
138+ ))
140139 def load (self , source : Union [str , BytesIO ]) -> List [Dict [str , Any ]]:
141140 """
142- Load and process content using MarkItDown.
143- Returns a list of pages, each containing:
144- - content: The text content
145- - image: The page/image bytes if vision_mode is True
146-
141+ Load and process the source with MarkItDown, returning a list of pages.
142+
147143 Args:
148- source: Either a file path or BytesIO stream
149-
144+ source: A file path or a BytesIO stream
145+
150146 Returns:
151- List[Dict[str, Any]]: List of pages with content and optional images
147+ A list of dictionaries where each dict is one "page" of text.
148+ - "content": The text content (str)
149+ - "image": Optional bytes if vision mode is enabled (key only present if vision_mode is True)
152150 """
153151 if not self .can_handle (source ):
154152 raise ValueError (f"Cannot handle source: { source } " )
155153
154+ # Basic check for vision mode
156155 if self .vision_mode and not self .can_handle_vision (source ):
157156 raise ValueError (f"Cannot handle source in vision mode: { source } " )
158157
159158 try :
160- # Extract text content using MarkItDown
159+ # Convert the file or stream with MarkItDown
161160 if isinstance (source , str ):
161+ # File path
162162 result = self .markitdown .convert (source )
163163 else :
164- # For BytesIO, we need to determine the file type
164+ # BytesIO
165165 source .seek (0 )
166166 if self .config .mime_type_detection :
167167 mime = magic .from_buffer (source .getvalue (), mime = True )
168- ext = next ((ext for ext , mime_types in MIME_TYPE_MAPPING .items ()
169- if mime in (mime_types if isinstance (mime_types , list ) else [mime_types ])),
170- self .config .default_extension )
168+ # Attempt to deduce extension from MIME type
169+ ext = next (
170+ (
171+ e
172+ for e , mime_list in MIME_TYPE_MAPPING .items ()
173+ if mime in (mime_list if isinstance (mime_list , list ) else [mime_list ])
174+ ),
175+ self .config .default_extension
176+ )
171177 else :
172178 ext = self .config .default_extension
173179 result = self .markitdown .convert_stream (source , file_extension = f".{ ext } " )
174180 source .seek (0 )
175181
182+ # Full text from MarkItDown
176183 text_content = result .text_content
184+ if not text_content :
185+ text_content = ""
177186
178- # Split into pages if supported
179- pages = []
180- if self .can_handle_paginate (source ):
181- raw_pages = text_content .split (self .config .page_separator )
182- for page_text in raw_pages :
183- processed_text = self ._process_text (page_text )
184- if processed_text or self .config .preserve_whitespace :
185- pages .append ({"content" : processed_text })
186- else :
187- processed_text = self ._process_text (text_content )
188- pages = [{"content" : processed_text }]
187+ # Split text content into pages (based on config.page_separator)
188+ raw_pages = text_content .split (self .config .page_separator )
189189
190- # Add images in vision mode
190+ pages = []
191+ for page_text in raw_pages :
192+ processed = self ._process_text (page_text )
193+ # Always include the page if preserve_whitespace is True,
194+ # or if there's any non-empty text.
195+ if processed or self .config .preserve_whitespace :
196+ pages .append ({"content" : processed })
197+
198+ # In vision mode, attach rendered images if applicable
191199 if self .vision_mode :
192200 images_dict = self .convert_to_images (source )
201+ # Match up page images by index
193202 for idx , page_dict in enumerate (pages ):
194203 if idx in images_dict :
195204 page_dict ["image" ] = images_dict [idx ]
0 commit comments