Skip to content

Commit 2a6ae3a

Browse files
feat: Add extensive docstrings to core modules
This commit adds comprehensive Google-style docstrings to a significant portion of the `docling` codebase to improve readability and maintainability. The primary goal of this effort is to document all public classes, methods, and functions, explaining their purpose, arguments, and return values.

The following modules have been documented:

- `docling/datamodel`: Documented all data model files.
- `docling/utils`: Documented all utility files.
- `docling/exceptions.py`: Documented custom exceptions.
- `docling/backend`: Documented all document parsing backends.
- `docling/models`: Documented the majority of the model files, including base classes, factories, and various model implementations.
- `docling/pipeline`: Started documentation of the pipeline infrastructure, covering `base_pipeline.py`, `asr_pipeline.py`, and `base_extraction_pipeline.py`.
1 parent 10bb0ae commit 2a6ae3a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+4427
-431
lines changed

docling/backend/abstract_backend.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,49 @@
1111

1212

1313
class AbstractDocumentBackend(ABC):
14+
"""An abstract base class for all document processing backends.
15+
16+
This class defines the common interface that all document backends must
17+
implement. A backend is responsible for parsing a specific document format
18+
and providing access to its content.
19+
20+
Attributes:
21+
file: The path to the document file.
22+
path_or_stream: The source of the document, either a path or a stream.
23+
document_hash: The hash of the document's content.
24+
input_format: The `InputFormat` of the document.
25+
"""
26+
1427
@abstractmethod
1528
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
29+
"""Initializes the document backend.
30+
31+
Args:
32+
in_doc: The `InputDocument` object representing the source document.
33+
path_or_stream: The path or stream of the document content.
34+
"""
1635
self.file = in_doc.file
1736
self.path_or_stream = path_or_stream
1837
self.document_hash = in_doc.document_hash
1938
self.input_format = in_doc.format
2039

2140
@abstractmethod
2241
def is_valid(self) -> bool:
42+
"""Checks if the document is valid and can be processed by this backend."""
2343
pass
2444

2545
@classmethod
2646
@abstractmethod
2747
def supports_pagination(cls) -> bool:
48+
"""Returns `True` if the backend supports paginated access to the document."""
2849
pass
2950

3051
def unload(self):
52+
"""Releases any resources held by the backend.
53+
54+
This method should be called when the backend is no longer needed to
55+
ensure that file handles and other resources are properly closed.
56+
"""
3157
if isinstance(self.path_or_stream, BytesIO):
3258
self.path_or_stream.close()
3359

@@ -36,28 +62,38 @@ def unload(self):
3662
@classmethod
3763
@abstractmethod
3864
def supported_formats(cls) -> Set["InputFormat"]:
65+
"""Returns a set of `InputFormat` enums that this backend supports."""
3966
pass
4067

4168

4269
class PaginatedDocumentBackend(AbstractDocumentBackend):
43-
"""DeclarativeDocumentBackend.
70+
"""An abstract base class for backends that support paginated documents.
4471
45-
A declarative document backend is a backend that can transform to DoclingDocument
46-
straight without a recognition pipeline.
72+
This class extends `AbstractDocumentBackend` with an abstract method for
73+
retrieving the total number of pages in a document.
4774
"""
4875

4976
@abstractmethod
5077
def page_count(self) -> int:
78+
"""Returns the total number of pages in the document."""
5179
pass
5280

5381

5482
class DeclarativeDocumentBackend(AbstractDocumentBackend):
55-
"""DeclarativeDocumentBackend.
83+
"""An abstract base class for backends that can directly convert to a `DoclingDocument`.
5684
57-
A declarative document backend is a backend that can transform to DoclingDocument
58-
straight without a recognition pipeline.
85+
This class is for backends that handle formats with a clear, declarative
86+
structure (like HTML, Markdown, or JATS XML). These backends can transform
87+
the source directly into a `DoclingDocument` without needing a complex
88+
recognition pipeline involving layout analysis or OCR.
5989
"""
6090

6191
@abstractmethod
6292
def convert(self) -> DoclingDocument:
93+
"""Converts the source document into a `DoclingDocument`.
94+
95+
Returns:
96+
A `DoclingDocument` object representing the content and structure
97+
of the source document.
98+
"""
6399
pass

docling/backend/asciidoc_backend.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,26 @@
2727

2828

2929
class AsciiDocBackend(DeclarativeDocumentBackend):
30+
"""A backend for parsing AsciiDoc files.
31+
32+
This class implements the `DeclarativeDocumentBackend` interface to provide
33+
a parser for documents written in the AsciiDoc format. It converts the
34+
AsciiDoc source into a `DoclingDocument` object.
35+
"""
36+
3037
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
38+
"""Initializes the AsciiDocBackend.
39+
40+
This reads the content of the AsciiDoc file from the given path or stream
41+
and prepares it for parsing.
42+
43+
Args:
44+
in_doc: The `InputDocument` object representing the source document.
45+
path_or_stream: The path or stream of the AsciiDoc content.
46+
47+
Raises:
48+
RuntimeError: If the backend cannot be initialized.
49+
"""
3150
super().__init__(in_doc, path_or_stream)
3251

3352
self.path_or_stream = path_or_stream
@@ -48,22 +67,31 @@ def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
4867
return
4968

5069
def is_valid(self) -> bool:
70+
"""Checks if the backend was initialized successfully."""
5171
return self.valid
5272

5373
@classmethod
5474
def supports_pagination(cls) -> bool:
75+
"""AsciiDoc is not a paginated format."""
5576
return False
5677

5778
def unload(self):
79+
"""No resources to unload for this backend."""
5880
return
5981

6082
@classmethod
6183
def supported_formats(cls) -> Set[InputFormat]:
84+
"""Returns the set of supported formats, which is just ASCIIDOC."""
6285
return {InputFormat.ASCIIDOC}
6386

6487
def convert(self) -> DoclingDocument:
65-
"""
66-
Parses the ASCII into a structured document model.
88+
"""Parses the AsciiDoc content into a `DoclingDocument`.
89+
90+
This method orchestrates the parsing of the AsciiDoc source and builds
91+
a `DoclingDocument` object representing its structure and content.
92+
93+
Returns:
94+
A `DoclingDocument` object.
6795
"""
6896

6997
origin = DocumentOrigin(

docling/backend/csv_backend.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,32 @@
1515

1616

1717
class CsvDocumentBackend(DeclarativeDocumentBackend):
18+
"""A backend for parsing CSV (Comma-Separated Values) files.
19+
20+
This class implements the `DeclarativeDocumentBackend` interface to provide
21+
a parser for CSV data. It automatically detects the CSV dialect (e.g.,
22+
delimiter) and converts the entire file into a single table within a
23+
`DoclingDocument`.
24+
25+
Attributes:
26+
content: A `StringIO` object containing the content of the CSV file.
27+
"""
28+
1829
content: StringIO
1930

2031
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
32+
"""Initializes the CsvDocumentBackend.
33+
34+
This reads the content of the CSV file from the given path or stream
35+
and prepares it for parsing.
36+
37+
Args:
38+
in_doc: The `InputDocument` object representing the source document.
39+
path_or_stream: The path or stream of the CSV content.
40+
41+
Raises:
42+
RuntimeError: If the backend cannot be initialized.
43+
"""
2144
super().__init__(in_doc, path_or_stream)
2245

2346
# Load content
@@ -34,24 +57,33 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
3457
return
3558

3659
def is_valid(self) -> bool:
60+
"""Checks if the backend was initialized successfully."""
3761
return self.valid
3862

3963
@classmethod
4064
def supports_pagination(cls) -> bool:
65+
"""CSV is not a paginated format."""
4166
return False
4267

4368
def unload(self):
69+
"""Closes the underlying stream if it's a `BytesIO` object and resets `path_or_stream` to `None`."""
4470
if isinstance(self.path_or_stream, BytesIO):
4571
self.path_or_stream.close()
4672
self.path_or_stream = None
4773

4874
@classmethod
4975
def supported_formats(cls) -> Set[InputFormat]:
76+
"""Returns the set of supported formats, which is just CSV."""
5077
return {InputFormat.CSV}
5178

5279
def convert(self) -> DoclingDocument:
53-
"""
54-
Parses the CSV data into a structured document model.
80+
"""Parses the CSV data into a `DoclingDocument`.
81+
82+
This method detects the CSV dialect, reads the data, and converts it
83+
into a single table within a new `DoclingDocument`.
84+
85+
Returns:
86+
A `DoclingDocument` object containing the CSV data as a table.
5587
"""
5688

5789
# Detect CSV dialect

docling/backend/docling_parse_backend.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,26 @@
2424

2525

2626
class DoclingParsePageBackend(PdfPageBackend):
27+
"""A page-level backend that uses the `docling-parse` library to process a single PDF page.
28+
29+
This class handles the extraction of text, images, and other content from a
30+
single page of a PDF file, leveraging the `docling-parse` v1 parser.
31+
32+
Attributes:
33+
valid: A boolean indicating whether the page was parsed successfully.
34+
"""
35+
2736
def __init__(
2837
self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
2938
):
39+
"""Initializes the DoclingParsePageBackend.
40+
41+
Args:
42+
parser: An instance of the `pdf_parser_v1` from `docling-parse`.
43+
document_hash: The hash of the parent document.
44+
page_no: The zero-based page index (the same value `load_page` uses to index the `pypdfium2` document).
45+
page_obj: The `pypdfium2.PdfPage` object for this page.
46+
"""
3047
self._ppage = page_obj
3148
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
3249

@@ -39,10 +56,11 @@ def __init__(
3956
)
4057

4158
def is_valid(self) -> bool:
59+
"""Checks if the page was parsed successfully by `docling-parse`."""
4260
return self.valid
4361

4462
def _compute_text_cells(self) -> List[TextCell]:
45-
"""Compute text cells from docling-parse data."""
63+
"""Computes a list of `TextCell` objects from the parsed data."""
4664
cells: List[TextCell] = []
4765
cell_counter = 0
4866

@@ -87,6 +105,15 @@ def _compute_text_cells(self) -> List[TextCell]:
87105
return cells
88106

89107
def get_text_in_rect(self, bbox: BoundingBox) -> str:
108+
"""Extracts text from a given rectangular area of the page.
109+
110+
Args:
111+
bbox: The `BoundingBox` defining the area to extract text from.
112+
113+
Returns:
114+
A string containing the concatenated text of all cells that
115+
significantly overlap with the bounding box.
116+
"""
90117
if not self.valid:
91118
return ""
92119
# Find intersecting cells on the page
@@ -120,6 +147,15 @@ def get_text_in_rect(self, bbox: BoundingBox) -> str:
120147
return text_piece
121148

122149
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
150+
"""Constructs a `SegmentedPdfPage` object from the parsed data.
151+
152+
This method combines the text cells extracted by `docling-parse` with
153+
the page geometry information from `pypdfium2` to create a complete
154+
representation of the segmented page.
155+
156+
Returns:
157+
A `SegmentedPdfPage` object, or `None` if the page is not valid.
158+
"""
123159
if not self.valid:
124160
return None
125161

@@ -140,9 +176,18 @@ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
140176
)
141177

142178
def get_text_cells(self) -> Iterable[TextCell]:
179+
"""Returns an iterable of all text cells on the page."""
143180
return self._compute_text_cells()
144181

145182
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
183+
"""Yields the bounding boxes of bitmap images on the page.
184+
185+
Args:
186+
scale: A scaling factor to apply to the bounding box coordinates.
187+
188+
Yields:
189+
A `BoundingBox` for each bitmap image on the page.
190+
"""
146191
AREA_THRESHOLD = 0 # 32 * 32
147192

148193
for i in range(len(self._dpage["images"])):
@@ -159,6 +204,18 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
159204
def get_page_image(
160205
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
161206
) -> Image.Image:
207+
"""Renders an image of the page.
208+
209+
This method uses `pypdfium2` to render the page as a PIL Image,
210+
allowing for scaling and cropping.
211+
212+
Args:
213+
scale: The scaling factor for the rendered image.
214+
cropbox: An optional `BoundingBox` to crop the image to.
215+
216+
Returns:
217+
A `PIL.Image.Image` object of the page.
218+
"""
162219
page_size = self.get_size()
163220

164221
if not cropbox:
@@ -190,15 +247,33 @@ def get_page_image(
190247
return image
191248

192249
def get_size(self) -> Size:
250+
"""Returns the size of the page in points."""
193251
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
194252

195253
def unload(self):
254+
"""Releases the page objects to free up memory."""
196255
self._ppage = None
197256
self._dpage = None
198257

199258

200259
class DoclingParseDocumentBackend(PdfDocumentBackend):
260+
"""A document-level backend that uses the `docling-parse` library to process a PDF.
261+
262+
This class orchestrates the processing of a PDF file by loading it into both
263+
`pypdfium2` and the `docling-parse` v1 parser. It provides a method to load
264+
individual pages, which are then handled by the `DoclingParsePageBackend`.
265+
"""
266+
201267
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
268+
"""Initializes the DoclingParseDocumentBackend.
269+
270+
Args:
271+
in_doc: The `InputDocument` object representing the source PDF.
272+
path_or_stream: The path or stream of the PDF content.
273+
274+
Raises:
275+
RuntimeError: If `docling-parse` fails to load the document.
276+
"""
202277
super().__init__(in_doc, path_or_stream)
203278

204279
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
@@ -220,17 +295,28 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
220295
)
221296

222297
def page_count(self) -> int:
298+
"""Returns the total number of pages in the document."""
223299
return len(self._pdoc) # To be replaced with docling-parse API
224300

225301
def load_page(self, page_no: int) -> DoclingParsePageBackend:
302+
"""Loads a single page and returns a `DoclingParsePageBackend` for it.
303+
304+
Args:
305+
page_no: The page number to load (0-indexed).
306+
307+
Returns:
308+
A `DoclingParsePageBackend` instance for the specified page.
309+
"""
226310
return DoclingParsePageBackend(
227311
self.parser, self.document_hash, page_no, self._pdoc[page_no]
228312
)
229313

230314
def is_valid(self) -> bool:
315+
"""Checks if the document is valid (i.e., has at least one page)."""
231316
return self.page_count() > 0
232317

233318
def unload(self):
319+
"""Runs the base-class unload, unloads the document from `docling-parse`, and closes the `pypdfium2` document."""
234320
super().unload()
235321
self.parser.unload_document(self.document_hash)
236322
self._pdoc.close()

0 commit comments

Comments
 (0)