Merge pull request #199 from enoch3712/198-docling-support-simple-setup

enoch3712 · web-flow · commit e11306b09a4e · 2025-01-17T16:50:05.000+04:00
Docling refactor
diff --git a/extract_thinker/document_loader/document_loader_docling.py b/extract_thinker/document_loader/document_loader_docling.py
@@ -12,70 +12,83 @@
 class DoclingConfig:
     """Configuration for Docling document loader.
     
+    This class supports both simple and complex configurations:
+    
+    Simple usage:
+        config = DoclingConfig()  # Uses default settings
+        
+    Complex usage:
+        config = DoclingConfig(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=PdfPipelineOptions(
+                        do_table_structure=True,
+                        do_ocr=True,
+                        table_structure_options=TableStructureOptions(
+                            do_cell_matching=True
+                        )
+                    )
+                )
+            }
+        )
+    
     Args:
         content: Initial content (optional)
         cache_ttl: Cache time-to-live in seconds (default: 300)
-        format_options: Dictionary mapping input formats to their FormatOption configurations
-            Example:
+        format_options: Dictionary mapping input formats to their FormatOption configurations.
+            If None, default PDF options are built from ocr_enabled, table_structure_enabled and do_cell_matching.
+            For complex scenarios, provide your own format options (they are used as is; the simple flags are then not applied):
             {
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
                 InputFormat.IMAGE: ImageFormatOption(pipeline_options=image_options),
                 ...
             }
-        ocr_enabled: Whether to enable OCR processing (default: True)
+        ocr_enabled: Whether to enable OCR processing (default: False)
         table_structure_enabled: Whether to enable table structure detection (default: True)
-        tesseract_cmd: Path to tesseract executable (default: None)
         force_full_page_ocr: Whether to force OCR on entire pages (default: False)
         do_cell_matching: Whether to enable cell matching in tables (default: True)
     """
     # Optional parameters
     content: Optional[Any] = None
     cache_ttl: int = 300
     format_options: Optional[Dict[str, Any]] = None
-    ocr_enabled: bool = True
+    ocr_enabled: bool = False  # OCR disabled by default
     table_structure_enabled: bool = True
-    tesseract_cmd: Optional[str] = None
     force_full_page_ocr: bool = False
     do_cell_matching: bool = True
 
     def __post_init__(self):
-        """Initialize format options based on configuration."""
-        if self.format_options is None:
-            from docling.datamodel.pipeline_options import (
-                PdfPipelineOptions,
-                TesseractCliOcrOptions,
-                TableStructureOptions,
+        """Initialize format options if not provided."""
+        # If format_options are provided, use them as is (complex configuration)
+        if self.format_options is not None:
+            return
+
+        # Simple configuration: create default format options based on parameters
+        from docling.datamodel.pipeline_options import (
+            PdfPipelineOptions,
+            TableStructureOptions,
+        )
+        from docling.datamodel.base_models import InputFormat
+        from docling.document_converter import PdfFormatOption
+
+        # Set up table options
+        table_options = None
+        if self.table_structure_enabled:
+            table_options = TableStructureOptions(
+                do_cell_matching=self.do_cell_matching
             )
-            from docling.datamodel.base_models import InputFormat
-            from docling.document_converter import PdfFormatOption
-
-            # Set up OCR options
-            ocr_options = None
-            if self.ocr_enabled:
-                ocr_options = TesseractCliOcrOptions(
-                    force_full_page_ocr=self.force_full_page_ocr,
-                    tesseract_cmd=self.tesseract_cmd
-                )
-
-            # Set up table options
-            table_options = None
-            if self.table_structure_enabled:
-                table_options = TableStructureOptions(
-                    do_cell_matching=self.do_cell_matching
-                )
 
-            # Create pipeline options
-            pipeline_options = PdfPipelineOptions(
-                do_table_structure=self.table_structure_enabled,
-                do_ocr=self.ocr_enabled,
-                ocr_options=ocr_options,
-                table_structure_options=table_options
-            )
+        # Create pipeline options
+        pipeline_options = PdfPipelineOptions(
+            do_table_structure=self.table_structure_enabled,
+            do_ocr=self.ocr_enabled,
+            table_structure_options=table_options
+        )
 
-            # Create format options
-            self.format_options = {
-                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
-            }
+        # Create format options
+        self.format_options = {
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+        }
 
 
 class DocumentLoaderDocling(CachedDocumentLoader):
diff --git a/tests/test_document_loader_docling.py b/tests/test_document_loader_docling.py
@@ -11,11 +11,21 @@
     TesseractCliOcrOptions,
     TableStructureOptions,
 )
-
 from docling.datamodel.base_models import InputFormat
-from docling.document_converter import PdfFormatOption, ImageFormatOption
+from docling.document_converter import PdfFormatOption
+
 
 class TestDocumentLoaderDocling(BaseDocumentLoaderTest):
+    @pytest.fixture
+    def test_file_path(self):
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        return os.path.join(current_dir, 'files', 'invoice.pdf')
+
+    @pytest.fixture
+    def loader(self):
+        """Required fixture from BaseDocumentLoaderTest - returns a basic loader instance"""
+        return DocumentLoaderDocling()
+
     @pytest.fixture
     def default_pipeline_options(self):
         """Default pipeline options for testing"""
@@ -35,56 +45,72 @@ def default_pipeline_options(self):
             table_structure_options=table_options
         )
 
-    @pytest.fixture
-    def docling_config(self, default_pipeline_options):
-        """Default Docling configuration for testing"""
-        format_options = {
-            InputFormat.PDF: PdfFormatOption(pipeline_options=default_pipeline_options)
-        }
-        return DoclingConfig(
-            format_options=format_options,
-            ocr_enabled=True,
+    def test_simple_initialization(self, test_file_path, loader):
+        """Test simple initialization without any configuration"""
+        # Basic load and verify
+        pages = loader.load(test_file_path)
+        assert isinstance(pages, list)
+        assert len(pages) > 0
+        assert "content" in pages[0]
+        assert isinstance(pages[0]["content"], str)
+        assert len(pages[0]["content"]) > 0
+
+    def test_simple_config(self, test_file_path):
+        """Test simple configuration with basic options"""
+        config = DoclingConfig(
+            ocr_enabled=False,
             table_structure_enabled=True,
-            tesseract_cmd="/opt/homebrew/bin/tesseract",
-            force_full_page_ocr=True,
             do_cell_matching=True
         )
+        loader = DocumentLoaderDocling(config)
+        
+        pages = loader.load(test_file_path)
+        assert isinstance(pages, list)
+        assert len(pages) > 0
+        assert "content" in pages[0]
 
-    @pytest.fixture
-    def loader(self, docling_config):
-        return DocumentLoaderDocling(docling_config)
-
-    @pytest.fixture
-    def loader_no_ocr(self):
-        """Loader instance with OCR disabled"""
-        return DocumentLoaderDocling(
-            DoclingConfig(
-                ocr_enabled=False,
-                table_structure_enabled=True
+    def test_complex_config(self, test_file_path):
+        """Test complex configuration with custom format options"""
+        # Set up pipeline options
+        pipeline_options = PdfPipelineOptions(
+            do_table_structure=True,
+            do_ocr=False,
+            table_structure_options=TableStructureOptions(
+                do_cell_matching=True
             )
         )
-
-    @pytest.fixture
-    def test_file_path(self):
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-        return os.path.join(current_dir, 'files', 'invoice.pdf')
-
-    def test_docling_specific_content(self, loader, test_file_path):
-        """Test Docling-specific content extraction"""
-        pages = loader.load(test_file_path)
         
+        # Create format options
+        format_options = {
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options
+            )
+        }
+        
+        # Create config with format options
+        config = DoclingConfig(format_options=format_options)
+        loader = DocumentLoaderDocling(config)
+        
+        pages = loader.load(test_file_path)
         assert isinstance(pages, list)
         assert len(pages) > 0
-        
-        first_page = pages[0]
-        assert "content" in first_page
-        assert len(first_page["content"]) > 0
+        assert "content" in pages[0]
 
-    def test_vision_mode(self, loader, test_file_path):
+    def test_stream_loading(self, test_file_path, loader):
+        """Test loading from BytesIO stream"""
+        with open(test_file_path, 'rb') as f:
+            stream = BytesIO(f.read())
+            pages = loader.load(stream)
+            
+            assert isinstance(pages, list)
+            assert len(pages) > 0
+            assert "content" in pages[0]
+
+    def test_vision_mode(self, test_file_path, loader):
         """Test vision mode functionality"""
         loader.set_vision_mode(True)
-        pages = loader.load(test_file_path)
         
+        pages = loader.load(test_file_path)
         assert isinstance(pages, list)
         assert len(pages) > 0
         
@@ -95,65 +121,88 @@ def test_vision_mode(self, loader, test_file_path):
                 assert "image" in page
                 assert isinstance(page["image"], bytes)
 
-    def test_stream_loading(self, loader, test_file_path):
-        """Test loading from BytesIO stream"""
-        with open(test_file_path, 'rb') as f:
-            stream = BytesIO(f.read())
-            pages = loader.load(stream)
-            
-            assert isinstance(pages, list)
-            assert len(pages) > 0
-            assert "content" in pages[0]
-
-    def test_pagination(self, loader, test_file_path):
+    def test_pagination(self, test_file_path, loader):
         """Test pagination functionality"""
         pages = loader.load(test_file_path)
-        
         assert isinstance(pages, list)
         if loader.can_handle_paginate(test_file_path):
             assert len(pages) > 0
             for page in pages:
                 assert "content" in page
                 assert isinstance(page["content"], str)
 
-    def test_no_ocr_loading(self, loader_no_ocr, test_file_path):
-        """Test loading with OCR disabled"""
-        pages = loader_no_ocr.load(test_file_path)
+    def test_supported_formats(self, loader):
+        """Test that supported formats are correctly defined"""
+        assert isinstance(loader.SUPPORTED_FORMATS, list)
+        assert "pdf" in loader.SUPPORTED_FORMATS
+        assert "docx" in loader.SUPPORTED_FORMATS
+        assert "txt" in loader.SUPPORTED_FORMATS
+
+    def test_ocr_disabled(self, test_file_path):
+        """Test loading with the default config (ocr_enabled defaults to False)"""
+        config = DoclingConfig()  # Default config
+        loader = DocumentLoaderDocling(config)
         
+        pages = loader.load(test_file_path)
         assert isinstance(pages, list)
         assert len(pages) > 0
         assert "content" in pages[0]
 
-    def test_config_features(self, test_file_path):
-        """Test various configuration features"""
-        # Test with custom OCR settings
+    def test_ocr_enabled(self, test_file_path, default_pipeline_options):
+        """Test with OCR enabled using tesseract"""
+        # Create format options with OCR
+        format_options = {
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=default_pipeline_options
+            )
+        }
+        
         config = DoclingConfig(
+            format_options=format_options,
             ocr_enabled=True,
-            tesseract_cmd="/opt/homebrew/bin/tesseract",
             force_full_page_ocr=True
         )
         loader = DocumentLoaderDocling(config)
+        
         pages = loader.load(test_file_path)
+        assert isinstance(pages, list)
         assert len(pages) > 0
+        assert "content" in pages[0]
 
-        # Test with custom table settings
+    def test_custom_ocr_config(self, test_file_path):
+        """Test with custom OCR configuration"""
+        # Set up OCR options
+        ocr_options = TesseractCliOcrOptions(
+            force_full_page_ocr=True,
+            tesseract_cmd="/opt/homebrew/bin/tesseract"
+        )
+        
+        # Set up pipeline options with OCR
+        pipeline_options = PdfPipelineOptions(
+            do_table_structure=True,
+            do_ocr=True,
+            ocr_options=ocr_options,
+            table_structure_options=TableStructureOptions(
+                do_cell_matching=True
+            )
+        )
+        
+        # Create format options
+        format_options = {
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options
+            )
+        }
+        
+        # Create config; since format_options is given, __post_init__ returns early and does not read the OCR flags below
         config = DoclingConfig(
-            table_structure_enabled=True,
-            do_cell_matching=False
+            format_options=format_options,
+            ocr_enabled=True,
+            force_full_page_ocr=True
         )
         loader = DocumentLoaderDocling(config)
-        pages = loader.load(test_file_path)
-        assert len(pages) > 0
-
-    def test_simple_initialization(self, test_file_path):
-        """Test simple initialization and basic functionality without any special configurations"""
-        # Simple initialization like before
-        loader = DocumentLoaderDocling()
         
-        # Basic load and verify
         pages = loader.load(test_file_path)
         assert isinstance(pages, list)
         assert len(pages) > 0
         assert "content" in pages[0]
-        assert isinstance(pages[0]["content"], str)
-        assert len(pages[0]["content"]) > 0  # Should have extracted some text