Skip to content

Commit e11306b

Browse files
authored
Merge pull request #199 from enoch3712/198-docling-support-simple-setup
Docling refactor
2 parents 6dd3cd9 + d845e92 commit e11306b

File tree

2 files changed: 173 additions and 111 deletions.

extract_thinker/document_loader/document_loader_docling.py

Lines changed: 53 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -12,70 +12,83 @@
1212
class DoclingConfig:
1313
"""Configuration for Docling document loader.
1414
15+
This class supports both simple and complex configurations:
16+
17+
Simple usage:
18+
config = DoclingConfig() # Uses default settings
19+
20+
Complex usage:
21+
config = DoclingConfig(
22+
format_options={
23+
InputFormat.PDF: PdfFormatOption(
24+
pipeline_options=PdfPipelineOptions(
25+
do_table_structure=True,
26+
do_ocr=True,
27+
table_structure_options=TableStructureOptions(
28+
do_cell_matching=True
29+
)
30+
)
31+
)
32+
}
33+
)
34+
1535
Args:
1636
content: Initial content (optional)
1737
cache_ttl: Cache time-to-live in seconds (default: 300)
18-
format_options: Dictionary mapping input formats to their FormatOption configurations
19-
Example:
38+
format_options: Dictionary mapping input formats to their FormatOption configurations.
39+
If None, default options will be created based on other parameters.
40+
For complex scenarios, you can provide your own format options:
2041
{
2142
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
2243
InputFormat.IMAGE: ImageFormatOption(pipeline_options=image_options),
2344
...
2445
}
25-
ocr_enabled: Whether to enable OCR processing (default: True)
46+
ocr_enabled: Whether to enable OCR processing (default: False)
2647
table_structure_enabled: Whether to enable table structure detection (default: True)
27-
tesseract_cmd: Path to tesseract executable (default: None)
2848
force_full_page_ocr: Whether to force OCR on entire pages (default: False)
2949
do_cell_matching: Whether to enable cell matching in tables (default: True)
3050
"""
3151
# Optional parameters
3252
content: Optional[Any] = None
3353
cache_ttl: int = 300
3454
format_options: Optional[Dict[str, Any]] = None
35-
ocr_enabled: bool = True
55+
ocr_enabled: bool = False # OCR disabled by default
3656
table_structure_enabled: bool = True
37-
tesseract_cmd: Optional[str] = None
3857
force_full_page_ocr: bool = False
3958
do_cell_matching: bool = True
4059

4160
def __post_init__(self):
42-
"""Initialize format options based on configuration."""
43-
if self.format_options is None:
44-
from docling.datamodel.pipeline_options import (
45-
PdfPipelineOptions,
46-
TesseractCliOcrOptions,
47-
TableStructureOptions,
61+
"""Initialize format options if not provided."""
62+
# If format_options are provided, use them as is (complex configuration)
63+
if self.format_options is not None:
64+
return
65+
66+
# Simple configuration: create default format options based on parameters
67+
from docling.datamodel.pipeline_options import (
68+
PdfPipelineOptions,
69+
TableStructureOptions,
70+
)
71+
from docling.datamodel.base_models import InputFormat
72+
from docling.document_converter import PdfFormatOption
73+
74+
# Set up table options
75+
table_options = None
76+
if self.table_structure_enabled:
77+
table_options = TableStructureOptions(
78+
do_cell_matching=self.do_cell_matching
4879
)
49-
from docling.datamodel.base_models import InputFormat
50-
from docling.document_converter import PdfFormatOption
51-
52-
# Set up OCR options
53-
ocr_options = None
54-
if self.ocr_enabled:
55-
ocr_options = TesseractCliOcrOptions(
56-
force_full_page_ocr=self.force_full_page_ocr,
57-
tesseract_cmd=self.tesseract_cmd
58-
)
59-
60-
# Set up table options
61-
table_options = None
62-
if self.table_structure_enabled:
63-
table_options = TableStructureOptions(
64-
do_cell_matching=self.do_cell_matching
65-
)
6680

67-
# Create pipeline options
68-
pipeline_options = PdfPipelineOptions(
69-
do_table_structure=self.table_structure_enabled,
70-
do_ocr=self.ocr_enabled,
71-
ocr_options=ocr_options,
72-
table_structure_options=table_options
73-
)
81+
# Create pipeline options
82+
pipeline_options = PdfPipelineOptions(
83+
do_table_structure=self.table_structure_enabled,
84+
do_ocr=self.ocr_enabled,
85+
table_structure_options=table_options
86+
)
7487

75-
# Create format options
76-
self.format_options = {
77-
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
78-
}
88+
# Create format options
89+
self.format_options = {
90+
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
91+
}
7992

8093

8194
class DocumentLoaderDocling(CachedDocumentLoader):

tests/test_document_loader_docling.py

Lines changed: 120 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -11,11 +11,21 @@
1111
TesseractCliOcrOptions,
1212
TableStructureOptions,
1313
)
14-
1514
from docling.datamodel.base_models import InputFormat
16-
from docling.document_converter import PdfFormatOption, ImageFormatOption
15+
from docling.document_converter import PdfFormatOption
16+
1717

1818
class TestDocumentLoaderDocling(BaseDocumentLoaderTest):
19+
@pytest.fixture
20+
def test_file_path(self):
21+
current_dir = os.path.dirname(os.path.abspath(__file__))
22+
return os.path.join(current_dir, 'files', 'invoice.pdf')
23+
24+
@pytest.fixture
25+
def loader(self):
26+
"""Required fixture from BaseDocumentLoaderTest - returns a basic loader instance"""
27+
return DocumentLoaderDocling()
28+
1929
@pytest.fixture
2030
def default_pipeline_options(self):
2131
"""Default pipeline options for testing"""
@@ -35,56 +45,72 @@ def default_pipeline_options(self):
3545
table_structure_options=table_options
3646
)
3747

38-
@pytest.fixture
39-
def docling_config(self, default_pipeline_options):
40-
"""Default Docling configuration for testing"""
41-
format_options = {
42-
InputFormat.PDF: PdfFormatOption(pipeline_options=default_pipeline_options)
43-
}
44-
return DoclingConfig(
45-
format_options=format_options,
46-
ocr_enabled=True,
48+
def test_simple_initialization(self, test_file_path, loader):
49+
"""Test simple initialization without any configuration"""
50+
# Basic load and verify
51+
pages = loader.load(test_file_path)
52+
assert isinstance(pages, list)
53+
assert len(pages) > 0
54+
assert "content" in pages[0]
55+
assert isinstance(pages[0]["content"], str)
56+
assert len(pages[0]["content"]) > 0
57+
58+
def test_simple_config(self, test_file_path):
59+
"""Test simple configuration with basic options"""
60+
config = DoclingConfig(
61+
ocr_enabled=False,
4762
table_structure_enabled=True,
48-
tesseract_cmd="/opt/homebrew/bin/tesseract",
49-
force_full_page_ocr=True,
5063
do_cell_matching=True
5164
)
65+
loader = DocumentLoaderDocling(config)
66+
67+
pages = loader.load(test_file_path)
68+
assert isinstance(pages, list)
69+
assert len(pages) > 0
70+
assert "content" in pages[0]
5271

53-
@pytest.fixture
54-
def loader(self, docling_config):
55-
return DocumentLoaderDocling(docling_config)
56-
57-
@pytest.fixture
58-
def loader_no_ocr(self):
59-
"""Loader instance with OCR disabled"""
60-
return DocumentLoaderDocling(
61-
DoclingConfig(
62-
ocr_enabled=False,
63-
table_structure_enabled=True
72+
def test_complex_config(self, test_file_path):
73+
"""Test complex configuration with custom format options"""
74+
# Set up pipeline options
75+
pipeline_options = PdfPipelineOptions(
76+
do_table_structure=True,
77+
do_ocr=False,
78+
table_structure_options=TableStructureOptions(
79+
do_cell_matching=True
6480
)
6581
)
66-
67-
@pytest.fixture
68-
def test_file_path(self):
69-
current_dir = os.path.dirname(os.path.abspath(__file__))
70-
return os.path.join(current_dir, 'files', 'invoice.pdf')
71-
72-
def test_docling_specific_content(self, loader, test_file_path):
73-
"""Test Docling-specific content extraction"""
74-
pages = loader.load(test_file_path)
7582

83+
# Create format options
84+
format_options = {
85+
InputFormat.PDF: PdfFormatOption(
86+
pipeline_options=pipeline_options
87+
)
88+
}
89+
90+
# Create config with format options
91+
config = DoclingConfig(format_options=format_options)
92+
loader = DocumentLoaderDocling(config)
93+
94+
pages = loader.load(test_file_path)
7695
assert isinstance(pages, list)
7796
assert len(pages) > 0
78-
79-
first_page = pages[0]
80-
assert "content" in first_page
81-
assert len(first_page["content"]) > 0
97+
assert "content" in pages[0]
8298

83-
def test_vision_mode(self, loader, test_file_path):
99+
def test_stream_loading(self, test_file_path, loader):
100+
"""Test loading from BytesIO stream"""
101+
with open(test_file_path, 'rb') as f:
102+
stream = BytesIO(f.read())
103+
pages = loader.load(stream)
104+
105+
assert isinstance(pages, list)
106+
assert len(pages) > 0
107+
assert "content" in pages[0]
108+
109+
def test_vision_mode(self, test_file_path, loader):
84110
"""Test vision mode functionality"""
85111
loader.set_vision_mode(True)
86-
pages = loader.load(test_file_path)
87112

113+
pages = loader.load(test_file_path)
88114
assert isinstance(pages, list)
89115
assert len(pages) > 0
90116

@@ -95,65 +121,88 @@ def test_vision_mode(self, loader, test_file_path):
95121
assert "image" in page
96122
assert isinstance(page["image"], bytes)
97123

98-
def test_stream_loading(self, loader, test_file_path):
99-
"""Test loading from BytesIO stream"""
100-
with open(test_file_path, 'rb') as f:
101-
stream = BytesIO(f.read())
102-
pages = loader.load(stream)
103-
104-
assert isinstance(pages, list)
105-
assert len(pages) > 0
106-
assert "content" in pages[0]
107-
108-
def test_pagination(self, loader, test_file_path):
124+
def test_pagination(self, test_file_path, loader):
109125
"""Test pagination functionality"""
110126
pages = loader.load(test_file_path)
111-
112127
assert isinstance(pages, list)
113128
if loader.can_handle_paginate(test_file_path):
114129
assert len(pages) > 0
115130
for page in pages:
116131
assert "content" in page
117132
assert isinstance(page["content"], str)
118133

119-
def test_no_ocr_loading(self, loader_no_ocr, test_file_path):
120-
"""Test loading with OCR disabled"""
121-
pages = loader_no_ocr.load(test_file_path)
134+
def test_supported_formats(self, loader):
135+
"""Test that supported formats are correctly defined"""
136+
assert isinstance(loader.SUPPORTED_FORMATS, list)
137+
assert "pdf" in loader.SUPPORTED_FORMATS
138+
assert "docx" in loader.SUPPORTED_FORMATS
139+
assert "txt" in loader.SUPPORTED_FORMATS
140+
141+
def test_ocr_disabled(self, test_file_path):
142+
"""Test that OCR is disabled by default"""
143+
config = DoclingConfig() # Default config
144+
loader = DocumentLoaderDocling(config)
122145

146+
pages = loader.load(test_file_path)
123147
assert isinstance(pages, list)
124148
assert len(pages) > 0
125149
assert "content" in pages[0]
126150

127-
def test_config_features(self, test_file_path):
128-
"""Test various configuration features"""
129-
# Test with custom OCR settings
151+
def test_ocr_enabled(self, test_file_path, default_pipeline_options):
152+
"""Test with OCR enabled using tesseract"""
153+
# Create format options with OCR
154+
format_options = {
155+
InputFormat.PDF: PdfFormatOption(
156+
pipeline_options=default_pipeline_options
157+
)
158+
}
159+
130160
config = DoclingConfig(
161+
format_options=format_options,
131162
ocr_enabled=True,
132-
tesseract_cmd="/opt/homebrew/bin/tesseract",
133163
force_full_page_ocr=True
134164
)
135165
loader = DocumentLoaderDocling(config)
166+
136167
pages = loader.load(test_file_path)
168+
assert isinstance(pages, list)
137169
assert len(pages) > 0
170+
assert "content" in pages[0]
138171

139-
# Test with custom table settings
172+
def test_custom_ocr_config(self, test_file_path):
173+
"""Test with custom OCR configuration"""
174+
# Set up OCR options
175+
ocr_options = TesseractCliOcrOptions(
176+
force_full_page_ocr=True,
177+
tesseract_cmd="/opt/homebrew/bin/tesseract"
178+
)
179+
180+
# Set up pipeline options with OCR
181+
pipeline_options = PdfPipelineOptions(
182+
do_table_structure=True,
183+
do_ocr=True,
184+
ocr_options=ocr_options,
185+
table_structure_options=TableStructureOptions(
186+
do_cell_matching=True
187+
)
188+
)
189+
190+
# Create format options
191+
format_options = {
192+
InputFormat.PDF: PdfFormatOption(
193+
pipeline_options=pipeline_options
194+
)
195+
}
196+
197+
# Create config with OCR enabled
140198
config = DoclingConfig(
141-
table_structure_enabled=True,
142-
do_cell_matching=False
199+
format_options=format_options,
200+
ocr_enabled=True,
201+
force_full_page_ocr=True
143202
)
144203
loader = DocumentLoaderDocling(config)
145-
pages = loader.load(test_file_path)
146-
assert len(pages) > 0
147-
148-
def test_simple_initialization(self, test_file_path):
149-
"""Test simple initialization and basic functionality without any special configurations"""
150-
# Simple initialization like before
151-
loader = DocumentLoaderDocling()
152204

153-
# Basic load and verify
154205
pages = loader.load(test_file_path)
155206
assert isinstance(pages, list)
156207
assert len(pages) > 0
157208
assert "content" in pages[0]
158-
assert isinstance(pages[0]["content"], str)
159-
assert len(pages[0]["content"]) > 0 # Should have extracted some text

0 commit comments

Comments
 (0)