|
| 1 | +from io import BytesIO |
| 2 | +from typing import Any, Dict, List, Union, Optional |
| 3 | + |
| 4 | +from cachetools import cachedmethod |
| 5 | +from cachetools.keys import hashkey |
| 6 | + |
| 7 | +# Import your DocumentLoader base |
| 8 | +from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader |
| 9 | + |
class DocumentLoaderDocling(CachedDocumentLoader):
    """
    Document loader that uses Docling to extract content from various file formats.

    ``load`` produces a list of page dictionaries, each with:
      - "content": Markdown export of that page
      - "image": the entry ``convert_to_images`` returned for that page when
        ``vision_mode`` is truthy, otherwise ``None``
    """

    SUPPORTED_FORMATS = [
        # Microsoft Word family
        "docx", "dotx", "docm", "dotm",
        # Microsoft PowerPoint family
        "pptx", "potx", "ppsx", "pptm", "potm", "ppsm",
        # Microsoft Excel family
        "xlsx",
        # PDF
        "pdf",
        # HTML variants
        "html", "htm", "xhtml",
        # Markdown
        "md",
        # AsciiDoc variants
        "adoc", "asciidoc", "asc",
        # Common image types
        "png", "jpg", "jpeg", "tif", "tiff", "bmp",
        # XML (including PubMed .nxml)
        "xml", "nxml",
        # Plain text
        "txt"
    ]

    def __init__(
        self,
        content: Any = None,
        cache_ttl: int = 300,
        format_options: Optional[Dict[str, Any]] = None,
    ):
        """Initialize loader.

        Args:
            content: Initial content
            cache_ttl: Cache time-to-live in seconds
            format_options: Dictionary mapping input formats to their FormatOption configurations
                Example:
                {
                    InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
                    InputFormat.IMAGE: ImageFormatOption(pipeline_options=image_options),
                    ...
                }

        Raises:
            ImportError: If the docling package (or one of its required
                submodules) cannot be imported.
        """
        # Check dependencies before touching any docling API so that a missing
        # installation surfaces the actionable message from _check_dependencies
        # instead of a bare ImportError.
        self._check_dependencies()

        super().__init__(content, cache_ttl)
        self.format_options = format_options
        self.converter = self._init_docling_converter()

    @staticmethod
    def _check_dependencies():
        """Check if required dependencies are installed.

        Raises:
            ImportError: With installation instructions, chained to the
                original import failure.
        """
        try:
            import docling  # noqa: F401
            import docling.document_converter  # noqa: F401
            import docling.datamodel.document  # noqa: F401
            import docling.datamodel.pipeline_options  # noqa: F401
            import docling_core.types.doc  # noqa: F401
        except ImportError as err:
            # Chain the original error so the failing module stays visible.
            raise ImportError(
                "Could not import docling python package. "
                "Please install it with `pip install docling`."
            ) from err

    def _init_docling_converter(self):
        """Initialize the Docling document converter.

        The converter is built with ``self.format_options`` when any were
        supplied; otherwise Docling's default settings are used.
        """
        from docling.document_converter import DocumentConverter

        return DocumentConverter(
            format_options=self.format_options if self.format_options else None
        )

    @cachedmethod(cache=lambda self: self.cache,
                  key=lambda self, source: hashkey(
                      source if isinstance(source, str) else source.getvalue(),
                      self.vision_mode
                  ))
    def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
        """
        Load and parse the document using Docling.

        Results are cached per (source, vision_mode); for a BytesIO source the
        cache key is the buffer's full byte content.

        Args:
            source: A string (passed to Docling as-is) or an in-memory BytesIO
                stream.

        Returns:
            A list of dictionaries, each representing a "page" with:
              - "content": Markdown export of that page
              - "image": the entry returned by ``convert_to_images`` for that
                page if vision_mode is truthy, otherwise None
            When the conversion result has no pages, a single-element list
            holding the Markdown export of the whole document is returned.

        Raises:
            ValueError: If ``can_handle`` rejects the source.
        """
        if not self.can_handle(source):
            raise ValueError(f"Cannot handle source: {source}")

        # Convert the source to a docling "ConversionResult"
        conv_result = self._docling_convert(source)
        document = conv_result.document
        pages = conv_result.pages

        # The image lookup does not depend on the page being processed, so it
        # is computed once instead of once per page. It is skipped entirely
        # when there are no pages or vision mode is off.
        images_dict = None
        if self.vision_mode and pages:
            images_dict = self.convert_to_images(source)

        # Build the output list of page data
        pages_output = []
        for p in pages:
            pages_output.append({
                # Page numbers from conv_result.pages are shifted by one for
                # the document export.
                "content": document.export_to_markdown(page_no=p.page_no + 1),
                "image": images_dict.get(p.page_no) if images_dict is not None else None,
            })

        # Fallback for documents without explicit pages
        if not pages_output:
            pages_output = [{"content": document.export_to_markdown(), "image": None}]

        return pages_output

    def _docling_convert(self, source: Union[str, BytesIO]) -> Any:
        """
        Internal method that runs the docling convert pipeline.

        Reuses the converter created during initialization, which already
        carries ``format_options`` when they were provided.

        Args:
            source: A string (handed to Docling directly) or a BytesIO stream.

        Returns:
            The conversion result returned by Docling.

        Raises:
            ValueError: If ``source`` is neither a ``str`` nor a ``BytesIO``.
        """
        from docling_core.types.io import DocumentStream
        import uuid

        # Handle different input types
        if isinstance(source, BytesIO):
            # Streams carry no filename, so a unique one is generated.
            # NOTE(review): the name always ends in ".pdf" regardless of the
            # actual content -- confirm how Docling detects the format of
            # non-PDF streams.
            unique_filename = f"{uuid.uuid4()}.pdf"
            doc_stream = DocumentStream(name=unique_filename, stream=source)
            return self.converter.convert(doc_stream, raises_on_error=True)

        if isinstance(source, str):
            # Handle string paths or URLs directly
            return self.converter.convert(source, raises_on_error=True)

        raise ValueError(f"Unsupported source type: {type(source)}")

    def _extract_page_text(self, page: Any) -> str:
        """
        Gather text from a docling Page object.

        Handles both text and table items. When the page has no assembled
        elements, falls back to joining the non-empty raw cell texts, or to an
        empty string if there are no cells.

        Args:
            page: Object exposing ``assembled`` (with ``elements``) and ``cells``.

        Returns:
            The collected text, one element (or cell) per line.
        """
        from docling_core.types.doc import DocItemLabel, TableItem

        if not (page.assembled and page.assembled.elements):
            # If no "assembled" data, fallback to the raw text cells
            # or produce an empty string
            if page.cells:
                # Join cell texts. Not always great, but a fallback:
                return "\n".join(cell.text for cell in page.cells if cell.text)
            return ""

        lines = []
        for element in page.assembled.elements:
            # Normal text
            if element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]:
                lines.append(element.text or "")
            # Tables
            elif element.label == DocItemLabel.TABLE and isinstance(element, TableItem):
                lines.append(self.convert_table_to_text(element))

        return "\n".join(lines)

    def convert_table_to_text(self, table_item: Any) -> str:
        """
        Convert a TableItem to a Markdown table string.

        The first row is treated as the header and is followed by a ``---``
        separator row; every cell text is stripped and missing text becomes an
        empty string.

        Args:
            table_item: Object exposing ``table_rows``, each row exposing
                ``table_cells`` whose items have a ``text`` attribute.

        Returns:
            The Markdown table, or an empty string when there are no rows.
        """
        rows = []

        # Assuming the first row is the header
        for idx, row in enumerate(table_item.table_rows):
            row_text = [cell.text.strip() if cell.text else "" for cell in row.table_cells]
            rows.append("| " + " | ".join(row_text) + " |")
            if idx == 0:
                # Separator row sized to the header.
                rows.append("| " + " | ".join(['---'] * len(row_text)) + " |")

        return "\n".join(rows)
| 213 | + |
0 commit comments