1111 TesseractCliOcrOptions ,
1212 TableStructureOptions ,
1313)
14-
1514from docling .datamodel .base_models import InputFormat
16- from docling .document_converter import PdfFormatOption , ImageFormatOption
15+ from docling .document_converter import PdfFormatOption
16+
1717
1818class TestDocumentLoaderDocling (BaseDocumentLoaderTest ):
19+ @pytest .fixture
20+ def test_file_path (self ):
21+ current_dir = os .path .dirname (os .path .abspath (__file__ ))
22+ return os .path .join (current_dir , 'files' , 'invoice.pdf' )
23+
24+ @pytest .fixture
25+ def loader (self ):
26+ """Required fixture from BaseDocumentLoaderTest - returns a basic loader instance"""
27+ return DocumentLoaderDocling ()
28+
1929 @pytest .fixture
2030 def default_pipeline_options (self ):
2131 """Default pipeline options for testing"""
@@ -35,56 +45,72 @@ def default_pipeline_options(self):
3545 table_structure_options = table_options
3646 )
3747
38- @pytest .fixture
39- def docling_config (self , default_pipeline_options ):
40- """Default Docling configuration for testing"""
41- format_options = {
42- InputFormat .PDF : PdfFormatOption (pipeline_options = default_pipeline_options )
43- }
44- return DoclingConfig (
45- format_options = format_options ,
46- ocr_enabled = True ,
48+ def test_simple_initialization (self , test_file_path , loader ):
49+ """Test simple initialization without any configuration"""
50+ # Basic load and verify
51+ pages = loader .load (test_file_path )
52+ assert isinstance (pages , list )
53+ assert len (pages ) > 0
54+ assert "content" in pages [0 ]
55+ assert isinstance (pages [0 ]["content" ], str )
56+ assert len (pages [0 ]["content" ]) > 0
57+
58+ def test_simple_config (self , test_file_path ):
59+ """Test simple configuration with basic options"""
60+ config = DoclingConfig (
61+ ocr_enabled = False ,
4762 table_structure_enabled = True ,
48- tesseract_cmd = "/opt/homebrew/bin/tesseract" ,
49- force_full_page_ocr = True ,
5063 do_cell_matching = True
5164 )
65+ loader = DocumentLoaderDocling (config )
66+
67+ pages = loader .load (test_file_path )
68+ assert isinstance (pages , list )
69+ assert len (pages ) > 0
70+ assert "content" in pages [0 ]
5271
53- @pytest .fixture
54- def loader (self , docling_config ):
55- return DocumentLoaderDocling (docling_config )
56-
57- @pytest .fixture
58- def loader_no_ocr (self ):
59- """Loader instance with OCR disabled"""
60- return DocumentLoaderDocling (
61- DoclingConfig (
62- ocr_enabled = False ,
63- table_structure_enabled = True
72+ def test_complex_config (self , test_file_path ):
73+ """Test complex configuration with custom format options"""
74+ # Set up pipeline options
75+ pipeline_options = PdfPipelineOptions (
76+ do_table_structure = True ,
77+ do_ocr = False ,
78+ table_structure_options = TableStructureOptions (
79+ do_cell_matching = True
6480 )
6581 )
66-
67- @pytest .fixture
68- def test_file_path (self ):
69- current_dir = os .path .dirname (os .path .abspath (__file__ ))
70- return os .path .join (current_dir , 'files' , 'invoice.pdf' )
71-
72- def test_docling_specific_content (self , loader , test_file_path ):
73- """Test Docling-specific content extraction"""
74- pages = loader .load (test_file_path )
7582
83+ # Create format options
84+ format_options = {
85+ InputFormat .PDF : PdfFormatOption (
86+ pipeline_options = pipeline_options
87+ )
88+ }
89+
90+ # Create config with format options
91+ config = DoclingConfig (format_options = format_options )
92+ loader = DocumentLoaderDocling (config )
93+
94+ pages = loader .load (test_file_path )
7695 assert isinstance (pages , list )
7796 assert len (pages ) > 0
78-
79- first_page = pages [0 ]
80- assert "content" in first_page
81- assert len (first_page ["content" ]) > 0
97+ assert "content" in pages [0 ]
8298
83- def test_vision_mode (self , loader , test_file_path ):
99+ def test_stream_loading (self , test_file_path , loader ):
100+ """Test loading from BytesIO stream"""
101+ with open (test_file_path , 'rb' ) as f :
102+ stream = BytesIO (f .read ())
103+ pages = loader .load (stream )
104+
105+ assert isinstance (pages , list )
106+ assert len (pages ) > 0
107+ assert "content" in pages [0 ]
108+
109+ def test_vision_mode (self , test_file_path , loader ):
84110 """Test vision mode functionality"""
85111 loader .set_vision_mode (True )
86- pages = loader .load (test_file_path )
87112
113+ pages = loader .load (test_file_path )
88114 assert isinstance (pages , list )
89115 assert len (pages ) > 0
90116
@@ -95,65 +121,88 @@ def test_vision_mode(self, loader, test_file_path):
95121 assert "image" in page
96122 assert isinstance (page ["image" ], bytes )
97123
98- def test_stream_loading (self , loader , test_file_path ):
99- """Test loading from BytesIO stream"""
100- with open (test_file_path , 'rb' ) as f :
101- stream = BytesIO (f .read ())
102- pages = loader .load (stream )
103-
104- assert isinstance (pages , list )
105- assert len (pages ) > 0
106- assert "content" in pages [0 ]
107-
108- def test_pagination (self , loader , test_file_path ):
124+ def test_pagination (self , test_file_path , loader ):
109125 """Test pagination functionality"""
110126 pages = loader .load (test_file_path )
111-
112127 assert isinstance (pages , list )
113128 if loader .can_handle_paginate (test_file_path ):
114129 assert len (pages ) > 0
115130 for page in pages :
116131 assert "content" in page
117132 assert isinstance (page ["content" ], str )
118133
119- def test_no_ocr_loading (self , loader_no_ocr , test_file_path ):
120- """Test loading with OCR disabled"""
121- pages = loader_no_ocr .load (test_file_path )
134+ def test_supported_formats (self , loader ):
135+ """Test that supported formats are correctly defined"""
136+ assert isinstance (loader .SUPPORTED_FORMATS , list )
137+ assert "pdf" in loader .SUPPORTED_FORMATS
138+ assert "docx" in loader .SUPPORTED_FORMATS
139+ assert "txt" in loader .SUPPORTED_FORMATS
140+
141+ def test_ocr_disabled (self , test_file_path ):
142+ """Test that OCR is disabled by default"""
143+ config = DoclingConfig () # Default config
144+ loader = DocumentLoaderDocling (config )
122145
146+ pages = loader .load (test_file_path )
123147 assert isinstance (pages , list )
124148 assert len (pages ) > 0
125149 assert "content" in pages [0 ]
126150
127- def test_config_features (self , test_file_path ):
128- """Test various configuration features"""
129- # Test with custom OCR settings
151+ def test_ocr_enabled (self , test_file_path , default_pipeline_options ):
152+ """Test with OCR enabled using tesseract"""
153+ # Create format options with OCR
154+ format_options = {
155+ InputFormat .PDF : PdfFormatOption (
156+ pipeline_options = default_pipeline_options
157+ )
158+ }
159+
130160 config = DoclingConfig (
161+ format_options = format_options ,
131162 ocr_enabled = True ,
132- tesseract_cmd = "/opt/homebrew/bin/tesseract" ,
133163 force_full_page_ocr = True
134164 )
135165 loader = DocumentLoaderDocling (config )
166+
136167 pages = loader .load (test_file_path )
168+ assert isinstance (pages , list )
137169 assert len (pages ) > 0
170+ assert "content" in pages [0 ]
138171
139- # Test with custom table settings
172+ def test_custom_ocr_config (self , test_file_path ):
173+ """Test with custom OCR configuration"""
174+ # Set up OCR options
175+ ocr_options = TesseractCliOcrOptions (
176+ force_full_page_ocr = True ,
177+ tesseract_cmd = "/opt/homebrew/bin/tesseract"
178+ )
179+
180+ # Set up pipeline options with OCR
181+ pipeline_options = PdfPipelineOptions (
182+ do_table_structure = True ,
183+ do_ocr = True ,
184+ ocr_options = ocr_options ,
185+ table_structure_options = TableStructureOptions (
186+ do_cell_matching = True
187+ )
188+ )
189+
190+ # Create format options
191+ format_options = {
192+ InputFormat .PDF : PdfFormatOption (
193+ pipeline_options = pipeline_options
194+ )
195+ }
196+
197+ # Create config with OCR enabled
140198 config = DoclingConfig (
141- table_structure_enabled = True ,
142- do_cell_matching = False
199+ format_options = format_options ,
200+ ocr_enabled = True ,
201+ force_full_page_ocr = True
143202 )
144203 loader = DocumentLoaderDocling (config )
145- pages = loader .load (test_file_path )
146- assert len (pages ) > 0
147-
148- def test_simple_initialization (self , test_file_path ):
149- """Test simple initialization and basic functionality without any special configurations"""
150- # Simple initialization like before
151- loader = DocumentLoaderDocling ()
152204
153- # Basic load and verify
154205 pages = loader .load (test_file_path )
155206 assert isinstance (pages , list )
156207 assert len (pages ) > 0
157208 assert "content" in pages [0 ]
158- assert isinstance (pages [0 ]["content" ], str )
159- assert len (pages [0 ]["content" ]) > 0 # Should have extracted some text
0 commit comments