xxmikexx1
diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py
Lines changed: 42 additions & 6 deletions
diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py
Lines changed: 30 additions & 2 deletions
diff --git a/docling/backend/csv_backend.py b/docling/backend/csv_backend.py
Lines changed: 34 additions & 2 deletions
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
Lines changed: 87 additions & 1 deletion
@@ -11,23 +11,49 @@
 
 
 class AbstractDocumentBackend(ABC):
+    """An abstract base class for all document processing backends.
+
+    This class defines the common interface that all document backends must
+    implement. A backend is responsible for parsing a specific document format
+    and providing access to its content.
+
+    Attributes:
+        file: The path to the document file.
+        path_or_stream: The source of the document, either a path or a stream.
+        document_hash: The hash of the document's content.
+        input_format: The `InputFormat` of the document.
+    """
+
     @abstractmethod
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Initializes the document backend.
+
+        Args:
+            in_doc: The `InputDocument` object representing the source document.
+            path_or_stream: The path or stream of the document content.
+        """
         self.file = in_doc.file
         self.path_or_stream = path_or_stream
         self.document_hash = in_doc.document_hash
         self.input_format = in_doc.format
 
     @abstractmethod
     def is_valid(self) -> bool:
+        """Checks if the document is valid and can be processed by this backend."""
         pass
 
     @classmethod
     @abstractmethod
     def supports_pagination(cls) -> bool:
+        """Returns `True` if the backend supports paginated access to the document."""
         pass
 
     def unload(self):
+        """Releases any resources held by the backend.
+
+        This method should be called when the backend is no longer needed to
+        ensure that file handles and other resources are properly closed.
+        """
         if isinstance(self.path_or_stream, BytesIO):
             self.path_or_stream.close()
 
@@ -36,28 +62,38 @@ def unload(self):
     @classmethod
     @abstractmethod
     def supported_formats(cls) -> Set["InputFormat"]:
+        """Returns a set of `InputFormat` enums that this backend supports."""
         pass
 
 
 class PaginatedDocumentBackend(AbstractDocumentBackend):
-    """DeclarativeDocumentBackend.
+    """An abstract base class for backends that support paginated documents.
 
-    A declarative document backend is a backend that can transform to DoclingDocument
-    straight without a recognition pipeline.
+    This class extends `AbstractDocumentBackend` with an abstract method for
+    retrieving the total number of pages in a document.
     """
 
     @abstractmethod
     def page_count(self) -> int:
+        """Returns the total number of pages in the document."""
         pass
 
 
 class DeclarativeDocumentBackend(AbstractDocumentBackend):
-    """DeclarativeDocumentBackend.
+    """An abstract base class for backends that can directly convert to a `DoclingDocument`.
 
-    A declarative document backend is a backend that can transform to DoclingDocument
-    straight without a recognition pipeline.
+    This class is for backends that handle formats with a clear, declarative
+    structure (like HTML, Markdown, or JATS XML). These backends can transform
+    the source directly into a `DoclingDocument` without needing a complex
+    recognition pipeline involving layout analysis or OCR.
     """
 
     @abstractmethod
     def convert(self) -> DoclingDocument:
+        """Converts the source document into a `DoclingDocument`.
+
+        Returns:
+            A `DoclingDocument` object representing the content and structure
+            of the source document.
+        """
         pass
@@ -27,7 +27,26 @@
 
 
 class AsciiDocBackend(DeclarativeDocumentBackend):
+    """A backend for parsing AsciiDoc files.
+
+    This class implements the `DeclarativeDocumentBackend` interface to provide
+    a parser for documents written in the AsciiDoc format. It converts the
+    AsciiDoc source into a `DoclingDocument` object.
+    """
+
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+        """Initializes the AsciiDocBackend.
+
+        This reads the content of the AsciiDoc file from the given path or stream
+        and prepares it for parsing.
+
+        Args:
+            in_doc: The `InputDocument` object representing the source document.
+            path_or_stream: The path or stream of the AsciiDoc content.
+
+        Raises:
+            RuntimeError: If the backend cannot be initialized.
+        """
         super().__init__(in_doc, path_or_stream)
 
         self.path_or_stream = path_or_stream
@@ -48,22 +67,31 @@ def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         return
 
     def is_valid(self) -> bool:
+        """Checks if the backend was initialized successfully."""
         return self.valid
 
     @classmethod
     def supports_pagination(cls) -> bool:
+        """Returns `False`, because AsciiDoc is not a paginated format."""
         return False
 
     def unload(self):
+        """Does nothing; this override does not close the input stream."""
         return
 
     @classmethod
     def supported_formats(cls) -> Set[InputFormat]:
+        """Returns the set of supported formats, which is just ASCIIDOC."""
         return {InputFormat.ASCIIDOC}
 
     def convert(self) -> DoclingDocument:
-        """
-        Parses the ASCII into a structured document model.
+        """Parses the AsciiDoc content into a `DoclingDocument`.
+
+        This method orchestrates the parsing of the AsciiDoc source and builds
+        a `DoclingDocument` object representing its structure and content.
+
+        Returns:
+            A `DoclingDocument` object.
         """
 
         origin = DocumentOrigin(
 
@@ -15,9 +15,32 @@
 
 
 class CsvDocumentBackend(DeclarativeDocumentBackend):
+    """A backend for parsing CSV (Comma-Separated Values) files.
+
+    This class implements the `DeclarativeDocumentBackend` interface to provide
+    a parser for CSV data. It automatically detects the CSV dialect (e.g.,
+    delimiter) and converts the entire file into a single table within a
+    `DoclingDocument`.
+
+    Attributes:
+        content: A `StringIO` object containing the content of the CSV file.
+    """
+
     content: StringIO
 
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Initializes the CsvDocumentBackend.
+
+        This reads the content of the CSV file from the given path or stream
+        and prepares it for parsing.
+
+        Args:
+            in_doc: The `InputDocument` object representing the source document.
+            path_or_stream: The path or stream of the CSV content.
+
+        Raises:
+            RuntimeError: If the backend cannot be initialized.
+        """
         super().__init__(in_doc, path_or_stream)
 
         # Load content
@@ -34,24 +57,33 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
         return
 
     def is_valid(self) -> bool:
+        """Checks if the backend was initialized successfully."""
         return self.valid
 
     @classmethod
     def supports_pagination(cls) -> bool:
+        """Returns `False`, because CSV is not a paginated format."""
         return False
 
     def unload(self):
+        """Closes the stream if it is a `BytesIO`, then clears `path_or_stream`."""
         if isinstance(self.path_or_stream, BytesIO):
             self.path_or_stream.close()
         self.path_or_stream = None
 
     @classmethod
     def supported_formats(cls) -> Set[InputFormat]:
+        """Returns the set of supported formats, which is just CSV."""
         return {InputFormat.CSV}
 
     def convert(self) -> DoclingDocument:
-        """
-        Parses the CSV data into a structured document model.
+        """Parses the CSV data into a `DoclingDocument`.
+
+        This method detects the CSV dialect, reads the data, and converts it
+        into a single table within a new `DoclingDocument`.
+
+        Returns:
+            A `DoclingDocument` object containing the CSV data as a table.
         """
 
         # Detect CSV dialect
 
@@ -24,9 +24,26 @@
 
 
 class DoclingParsePageBackend(PdfPageBackend):
+    """A page-level backend that uses the `docling-parse` library to process a single PDF page.
+
+    This class handles the extraction of text, images, and other content from a
+    single page of a PDF file, leveraging the `docling-parse` v1 parser.
+
+    Attributes:
+        valid: A boolean indicating whether the page was parsed successfully.
+    """
+
     def __init__(
         self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
     ):
+        """Initializes the DoclingParsePageBackend.
+
+        Args:
+            parser: An instance of the `pdf_parser_v1` from `docling-parse`.
+            document_hash: The hash of the parent document.
+            page_no: The 0-based page index (the same value `load_page`
+                uses to index the `pypdfium2` document).
+            page_obj: The `pypdfium2.PdfPage` object for this page.
+        """
         self._ppage = page_obj
         parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
 
@@ -39,10 +56,11 @@ def __init__(
             )
 
     def is_valid(self) -> bool:
+        """Checks if the page was parsed successfully by `docling-parse`."""
         return self.valid
 
     def _compute_text_cells(self) -> List[TextCell]:
-        """Compute text cells from docling-parse data."""
+        """Computes a list of `TextCell` objects from the parsed data."""
         cells: List[TextCell] = []
         cell_counter = 0
 
@@ -87,6 +105,15 @@ def _compute_text_cells(self) -> List[TextCell]:
         return cells
 
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        """Extracts text from a given rectangular area of the page.
+
+        Args:
+            bbox: The `BoundingBox` defining the area to extract text from.
+
+        Returns:
+            A string containing the concatenated text of all cells that
+            significantly overlap with the bounding box.
+        """
         if not self.valid:
             return ""
         # Find intersecting cells on the page
@@ -120,6 +147,15 @@ def get_text_in_rect(self, bbox: BoundingBox) -> str:
         return text_piece
 
     def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        """Constructs a `SegmentedPdfPage` object from the parsed data.
+
+        This method combines the text cells extracted by `docling-parse` with
+        the page geometry information from `pypdfium2` to create a complete
+        representation of the segmented page.
+
+        Returns:
+            A `SegmentedPdfPage` object, or `None` if the page is not valid.
+        """
         if not self.valid:
             return None
 
@@ -140,9 +176,18 @@ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
         )
 
     def get_text_cells(self) -> Iterable[TextCell]:
+        """Returns an iterable of all text cells on the page."""
         return self._compute_text_cells()
 
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        """Yields the bounding boxes of bitmap images on the page.
+
+        Args:
+            scale: A scaling factor to apply to the bounding box coordinates.
+
+        Yields:
+            A `BoundingBox` for each bitmap image on the page.
+        """
         AREA_THRESHOLD = 0  # 32 * 32
 
         for i in range(len(self._dpage["images"])):
@@ -159,6 +204,18 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
+        """Renders an image of the page.
+
+        This method uses `pypdfium2` to render the page as a PIL Image,
+        allowing for scaling and cropping.
+
+        Args:
+            scale: The scaling factor for the rendered image.
+            cropbox: An optional `BoundingBox` to crop the image to.
+
+        Returns:
+            A `PIL.Image.Image` object of the page.
+        """
         page_size = self.get_size()
 
         if not cropbox:
@@ -190,15 +247,33 @@ def get_page_image(
         return image
 
     def get_size(self) -> Size:
+        """Returns the size of the page in points."""
         return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
 
     def unload(self):
+        """Releases the page objects to free up memory."""
         self._ppage = None
         self._dpage = None
 
 
 class DoclingParseDocumentBackend(PdfDocumentBackend):
+    """A document-level backend that uses the `docling-parse` library to process a PDF.
+
+    This class orchestrates the processing of a PDF file by loading it into both
+    `pypdfium2` and the `docling-parse` v1 parser. It provides a method to load
+    individual pages, which are then handled by the `DoclingParsePageBackend`.
+    """
+
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Initializes the DoclingParseDocumentBackend.
+
+        Args:
+            in_doc: The `InputDocument` object representing the source PDF.
+            path_or_stream: The path or stream of the PDF content.
+
+        Raises:
+            RuntimeError: If `docling-parse` fails to load the document.
+        """
         super().__init__(in_doc, path_or_stream)
 
         self._pdoc = pdfium.PdfDocument(self.path_or_stream)
@@ -220,17 +295,28 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
             )
 
     def page_count(self) -> int:
+        """Returns the total number of pages in the document."""
         return len(self._pdoc)  # To be replaced with docling-parse API
 
     def load_page(self, page_no: int) -> DoclingParsePageBackend:
+        """Loads a single page and returns a `DoclingParsePageBackend` for it.
+
+        Args:
+            page_no: The page number to load (0-indexed).
+
+        Returns:
+            A `DoclingParsePageBackend` instance for the specified page.
+        """
         return DoclingParsePageBackend(
             self.parser, self.document_hash, page_no, self._pdoc[page_no]
         )
 
     def is_valid(self) -> bool:
+        """Checks if the document is valid (i.e., has at least one page)."""
         return self.page_count() > 0
 
     def unload(self):
+        """Unloads the document from `docling-parse` and closes the `pypdfium2` document."""
         super().unload()
         self.parser.unload_document(self.document_hash)
         self._pdoc.close()