Skip to content

Commit 96118b4

Browse files
authored
Merge pull request #187 from enoch3712/134-docling-documentloader
134 docling documentloader
2 parents ef81dd9 + cc91b3d commit 96118b4

File tree

4 files changed

+626
-145
lines changed

4 files changed

+626
-145
lines changed
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
from io import BytesIO
2+
from typing import Any, Dict, List, Union, Optional
3+
4+
from cachetools import cachedmethod
5+
from cachetools.keys import hashkey
6+
7+
# Import your DocumentLoader base
8+
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
9+
10+
class DocumentLoaderDocling(CachedDocumentLoader):
    """
    Document loader that uses Docling to extract content from various file formats.

    ``load`` produces a list of pages, each a dictionary with:
        - "content": Markdown export of that page
        - "image": page image bytes if vision_mode is True, otherwise None
    """

    SUPPORTED_FORMATS = [
        # Microsoft Word family
        "docx", "dotx", "docm", "dotm",
        # Microsoft PowerPoint family
        "pptx", "potx", "ppsx", "pptm", "potm", "ppsm",
        # Microsoft Excel family
        "xlsx",
        # PDF
        "pdf",
        # HTML variants
        "html", "htm", "xhtml",
        # Markdown
        "md",
        # AsciiDoc variants
        "adoc", "asciidoc", "asc",
        # Common image types
        "png", "jpg", "jpeg", "tif", "tiff", "bmp",
        # XML (including PubMed .nxml)
        "xml", "nxml",
        # Plain text
        "txt"
    ]

    def __init__(
        self,
        content: Any = None,
        cache_ttl: int = 300,
        format_options: Optional[Dict[str, Any]] = None,
    ):
        """Initialize loader.

        Args:
            content: Initial content
            cache_ttl: Cache time-to-live in seconds
            format_options: Dictionary mapping input formats to their FormatOption configurations
                Example:
                    {
                        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
                        InputFormat.IMAGE: ImageFormatOption(pipeline_options=image_options),
                        ...
                    }

        Raises:
            ImportError: If the docling packages cannot be imported.
        """
        # Check dependencies before any other docling import so that a missing
        # installation surfaces as the descriptive ImportError raised below.
        self._check_dependencies()

        super().__init__(content, cache_ttl)
        self.format_options = format_options
        # format_options must be set first: the converter is built from it.
        self.converter = self._init_docling_converter()

    @staticmethod
    def _check_dependencies():
        """Check if required dependencies are installed.

        Raises:
            ImportError: If any of the docling modules fails to import; the
                original error is chained as the cause.
        """
        try:
            import docling  # noqa: F401
            import docling.document_converter  # noqa: F401
            import docling.datamodel.document  # noqa: F401
            import docling.datamodel.pipeline_options  # noqa: F401
            import docling_core.types.doc  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "Could not import docling python package. "
                "Please install it with `pip install docling`."
            ) from err

    def _init_docling_converter(self):
        """Initialize the Docling document converter.

        Uses the format_options given at construction time; an empty or
        missing mapping is passed as None.
        """
        from docling.document_converter import DocumentConverter

        return DocumentConverter(format_options=self.format_options or None)

    @cachedmethod(cache=lambda self: self.cache,
                  key=lambda self, source: hashkey(
                      source if isinstance(source, str) else source.getvalue(),
                      self.vision_mode
                  ))
    def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
        """
        Load and parse the document using Docling.

        Results are cached, keyed on the source (the string itself, or the
        bytes of a BytesIO) together with the current vision_mode.

        Args:
            source: A string source or an in-memory BytesIO stream.

        Returns:
            A list of dictionaries, each representing a "page" with:
                - "content": Markdown export of that page
                - "image": image bytes if vision_mode is True, otherwise None
            If the conversion result exposes no pages, a single entry holding
            the Markdown export of the whole document is returned.

        Raises:
            ValueError: If can_handle rejects the source.
        """
        if not self.can_handle(source):
            raise ValueError(f"Cannot handle source: {source}")

        # Convert the source to a docling "ConversionResult"
        conv_result = self._docling_convert(source)
        document = conv_result.document
        pages = list(conv_result.pages)

        # Render page images once up front rather than once per page.
        images_dict = {}
        if self.vision_mode and pages:
            images_dict = self.convert_to_images(source)

        # Build the output list of page data
        pages_output = []
        for p in pages:
            pages_output.append({
                # NOTE(review): page_no is shifted by one for the export call
                # but used as-is for the image lookup; presumably the export
                # is 1-based and the image dict 0-based. Confirm.
                "content": document.export_to_markdown(page_no=p.page_no + 1),
                "image": images_dict.get(p.page_no) if self.vision_mode else None,
            })

        # Fallback for documents without explicit pages
        if not pages_output:
            pages_output = [{"content": document.export_to_markdown(), "image": None}]

        return pages_output

    def _docling_convert(self, source: Union[str, BytesIO]) -> Any:
        """
        Internal method that runs the docling convert pipeline.

        Reuses the converter built in __init__, which already carries the
        format_options supplied at construction time (or default settings).

        Args:
            source: A string passed to docling unchanged, or a BytesIO stream
                wrapped in a DocumentStream.

        Returns:
            The result object returned by the converter's convert call.

        Raises:
            ValueError: If source is neither a str nor a BytesIO.
        """
        from docling_core.types.io import DocumentStream
        import uuid

        # Handle different input types
        if isinstance(source, BytesIO):
            # Streams carry no filename, so generate a unique one using UUID.
            # NOTE(review): the ".pdf" suffix is hard-coded; confirm whether
            # docling relies on it when the stream holds a non-PDF format.
            unique_filename = f"{uuid.uuid4()}.pdf"
            doc_stream = DocumentStream(name=unique_filename, stream=source)
            return self.converter.convert(doc_stream, raises_on_error=True)

        if isinstance(source, str):
            # Handle string paths or URLs directly
            return self.converter.convert(source, raises_on_error=True)

        raise ValueError(f"Unsupported source type: {type(source)}")

    def _extract_page_text(self, page: Any) -> str:
        """
        Gather text from a docling Page object.

        Handles both text and table items. Elements with any other label are
        skipped.

        Args:
            page: Object exposing ``assembled`` (with ``elements``) and ``cells``.

        Returns:
            Newline-joined text of the assembled elements; when there are no
            assembled elements, the newline-joined non-empty cell texts, or
            an empty string if there are no cells either.
        """
        from docling_core.types.doc import DocItemLabel, TableItem

        assembled = page.assembled
        if not (assembled and assembled.elements):
            # If no "assembled" data, fallback to the raw text cells
            # or produce an empty string
            if page.cells:
                # Join cell texts. Not always great, but a fallback:
                return "\n".join(cell.text for cell in page.cells if cell.text)
            return ""

        lines = []
        for element in assembled.elements:
            # Normal text
            if element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]:
                lines.append(element.text or "")
            # Tables
            elif element.label == DocItemLabel.TABLE and isinstance(element, TableItem):
                lines.append(self.convert_table_to_text(element))

        return "\n".join(lines)

    def convert_table_to_text(self, table_item: Any) -> str:
        """
        Convert a TableItem to a Markdown table string.

        The first row is treated as the header and is followed by a
        ``---`` separator row; every later row becomes a body row.

        Args:
            table_item: Object exposing ``table_rows``, each row exposing
                ``table_cells`` whose items have a ``text`` attribute.

        Returns:
            The Markdown table, or an empty string if there are no rows.
        """
        rows = []

        # Assuming the first row is the header
        for idx, row in enumerate(table_item.table_rows):
            row_text = [cell.text.strip() if cell.text else "" for cell in row.table_cells]
            rows.append("| " + " | ".join(row_text) + " |")
            if idx == 0:
                # Separator row with one column per header cell.
                rows.append("| " + " | ".join(["---"] * len(row_text)) + " |")

        return "\n".join(rows)
213+

0 commit comments

Comments
 (0)