67 changes: 62 additions & 5 deletions docling/backend/msexcel_backend.py
@@ -290,6 +290,53 @@ def _find_tables_in_sheet(

return doc

def _find_true_data_bounds(self, sheet: Worksheet) -> tuple[int, int, int, int]:
"""Find the true data boundaries (min/max rows and columns) in an Excel worksheet.

This function scans all cells to find the smallest rectangular region that contains
all non-empty cells or merged cell ranges. It returns the minimal and maximal
row/column indices that bound the actual data region.

Args:
sheet: The Excel worksheet to analyze.

Returns:
A tuple of four integers:
(min_row, max_row, min_col, max_col)
representing the smallest rectangle that covers all data and merged cells.
If the sheet is empty, returns (1, 1, 1, 1) by default.
"""
min_row, min_col = None, None
max_row, max_col = 0, 0

# Check all cells for non-empty values
for row in sheet.iter_rows(values_only=False):
for cell in row:
if cell.value is not None:
r, c = cell.row, cell.column
min_row = r if min_row is None else min(min_row, r)
min_col = c if min_col is None else min(min_col, c)
max_row = max(max_row, r)
max_col = max(max_col, c)

# Expand bounds to include merged cells
for merged in sheet.merged_cells.ranges:
min_row = (
merged.min_row if min_row is None else min(min_row, merged.min_row)
)
min_col = (
merged.min_col if min_col is None else min(min_col, merged.min_col)
)
max_row = max(max_row, merged.max_row)
max_col = max(max_col, merged.max_col)

# If no data found, default to (1, 1, 1, 1)
if min_row is None or min_col is None:
min_row = min_col = max_row = max_col = 1

return min_row, max_row, min_col, max_col
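Editor's note (not part of the diff): the helper above can be exercised standalone with openpyxl. The sketch below builds a tiny workbook with a single value at C3 and an otherwise empty merged range E5:F6, then replicates the same bounds computation; the workbook contents are made up purely for illustration.

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws["C3"] = "value"        # the only non-empty cell
    ws.merge_cells("E5:F6")   # empty merged range that still extends the bounds

    min_row = min_col = None
    max_row = max_col = 0
    # Pass 1: scan non-empty cells, exactly as _find_true_data_bounds does
    for row in ws.iter_rows(values_only=False):
        for cell in row:
            if cell.value is not None:
                min_row = cell.row if min_row is None else min(min_row, cell.row)
                min_col = cell.column if min_col is None else min(min_col, cell.column)
                max_row = max(max_row, cell.row)
                max_col = max(max_col, cell.column)
    # Pass 2: expand the rectangle to cover merged ranges
    for rng in ws.merged_cells.ranges:
        min_row = rng.min_row if min_row is None else min(min_row, rng.min_row)
        min_col = rng.min_col if min_col is None else min(min_col, rng.min_col)
        max_row = max(max_row, rng.max_row)
        max_col = max(max_col, rng.max_col)

    print(min_row, max_row, min_col, max_col)  # 3 6 3 6: C3 plus the merged E5:F6 block

The merged-range pass is what keeps a header merged across otherwise empty cells inside the scanned rectangle, even when none of its cells carry a value.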

# Example integration in MsExcelDocumentBackend._find_data_tables
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
"""Find all compact rectangular data tables in an Excel worksheet.

@@ -299,13 +346,24 @@ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
Returns:
A list of ExcelTable objects representing the data tables.
"""
min_row, max_row, min_col, max_col = self._find_true_data_bounds(
sheet
) # The true data boundaries
tables: list[ExcelTable] = [] # List to store found tables
visited: set[tuple[int, int]] = set() # Track already visited cells

# Iterate over all cells in the sheet
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
for rj, cell in enumerate(row):
# Skip empty or already visited cells
# Limit scan to actual data bounds
for ri, row in enumerate(
sheet.iter_rows(
min_row=min_row,
max_row=max_row,
min_col=min_col,
max_col=max_col,
values_only=False,
),
start=min_row - 1,
):
for rj, cell in enumerate(row, start=min_col - 1):
if cell.value is None or (ri, rj) in visited:
continue

@@ -314,7 +372,6 @@ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:

visited.update(visited_cells) # Mark these cells as visited
tables.append(table_bounds)

return tables

def _find_table_bounds(
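A note on the start= offsets in the bounded scan above: enumerate(sheet.iter_rows(min_row=..., ...), start=min_row - 1) keeps (ri, rj) equal to the 0-based absolute coordinates that the old full-sheet scan produced, so _find_table_bounds and the visited set see unchanged indices. A minimal sketch of that equivalence, using hypothetical bounds values:

    min_row, min_col = 3, 2  # hypothetical result of _find_true_data_bounds
    abs_rows = range(min_row, 6)  # stands in for sheet.iter_rows(min_row=3, max_row=5, ...)
    for ri, abs_row in enumerate(abs_rows, start=min_row - 1):
        assert ri == abs_row - 1  # same 0-based row index as the old enumerate from row 1
    abs_cols = range(min_col, 5)  # stands in for the cells of one bounded row
    for rj, abs_col in enumerate(abs_cols, start=min_col - 1):
        assert rj == abs_col - 1  # same 0-based column index as before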
11 changes: 11 additions & 0 deletions tests/data/groundtruth/docling_v2/test-02.xlsx.itxt
@@ -0,0 +1,11 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group sheet: Sheet1
item-2 at level 2: table with [7x3]
item-3 at level 1: section: group sheet: Sheet2
item-4 at level 2: table with [9x4]
item-5 at level 2: table with [5x3]
item-6 at level 2: table with [5x3]
item-7 at level 1: section: group sheet: Sheet3
item-8 at level 2: table with [7x3]
item-9 at level 2: table with [7x3]
item-10 at level 2: picture
3,775 changes: 3,775 additions & 0 deletions tests/data/groundtruth/docling_v2/test-02.xlsx.json

Large diffs are not rendered by default.

53 changes: 53 additions & 0 deletions tests/data/groundtruth/docling_v2/test-02.xlsx.md
@@ -0,0 +1,53 @@
| first | second | third |
|----------|-----------|---------|
| 1 | 5 | 9 |
| 2 | 4 | 6 |
| 3 | 3 | 3 |
| 4 | 2 | 0 |
| 5 | 1 | -3 |
| 6 | 0 | -6 |

| col-1 | col-2 | col-3 | col-4 |
|---------|---------|---------|---------|
| 1 | 2 | 3 | 4 |
| 2 | 4 | 6 | 8 |
| 3 | 6 | 9 | 12 |
| 4 | 8 | 12 | 16 |
| 5 | 10 | 15 | 20 |
| 6 | 12 | 18 | 24 |
| 7 | 14 | 21 | 28 |
| 8 | 16 | 24 | 32 |

| col-1 | col-2 | col-3 |
|---------|---------|---------|
| 1 | 2 | 3 |
| 2 | 4 | 6 |
| 3 | 6 | 9 |
| 4 | 8 | 12 |

| col-1 | col-2 | col-3 |
|---------|---------|---------|
| 1 | 2 | 3 |
| 2 | 4 | 6 |
| 3 | 6 | 9 |
| 4 | 8 | 12 |

| first | header | header |
|----------|----------|----------|
| first | second | third |
| 1 | 2 | 3 |
| 3 | 4 | 5 |
| 3 | 6 | 7 |
| 8 | 9 | 9 |
| 10 | 9 | 9 |

| first (f) | header (f) | header (f) |
|-------------|--------------|--------------|
| first (f) | second | third |
| 1 | 2 | 3 |
| 3 | 4 | 5 |
| 3 | 6 | 7 |
| 8 | 9 | 9 |
| 10 | 9 | 9 |

<!-- image -->
Binary file added tests/data/xlsx/test-02.xlsx
Binary file not shown.
100 changes: 0 additions & 100 deletions tests/test_backend_msexcel.py
@@ -1,100 +0,0 @@
import logging
from pathlib import Path

import pytest

from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument
from docling.document_converter import DocumentConverter

from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export

_log = logging.getLogger(__name__)

GENERATE = GEN_TEST_DATA


def get_excel_paths():
# Define the directory you want to search
directory = Path("./tests/data/xlsx/")

# List all Excel files in the directory and its subdirectories
excel_files = sorted(directory.rglob("*.xlsx")) + sorted(directory.rglob("*.xlsm"))
return excel_files


def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])

return converter


@pytest.fixture(scope="module")
def documents() -> list[tuple[Path, DoclingDocument]]:
documents: list[tuple[Path, DoclingDocument]] = []

excel_paths = get_excel_paths()
converter = get_converter()

for excel_path in excel_paths:
_log.debug(f"converting {excel_path}")

gt_path = (
excel_path.parent.parent / "groundtruth" / "docling_v2" / excel_path.name
)

conv_result: ConversionResult = converter.convert(excel_path)

doc: DoclingDocument = conv_result.document

assert doc, f"Failed to convert document from file {gt_path}"
documents.append((gt_path, doc))

return documents


def test_e2e_excel_conversions(documents) -> None:
for gt_path, doc in documents:
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"

pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
"export to indented-text"
)

assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
"document document"
)


def test_pages(documents) -> None:
"""Test the page count and page size of converted documents.

Args:
documents: The paths and converted documents.
"""
# number of pages from the backend method
path = next(item for item in get_excel_paths() if item.stem == "test-01")
in_doc = InputDocument(
path_or_stream=path,
format=InputFormat.XLSX,
filename=path.stem,
backend=MsExcelDocumentBackend,
)
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
assert backend.page_count() == 4

# number of pages from the converted document
doc = next(item for path, item in documents if path.stem == "test-01")
assert len(doc.pages) == 4

# page sizes as number of cells
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)
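With the module-level Excel tests removed in this diff, here is an editor's sketch (not part of the PR) of how the newly added test-02.xlsx fixture could still be checked against its groundtruth markdown, reusing the converter API from the deleted test; the strict string comparison is a simplification of verify_export.

    from pathlib import Path

    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import DocumentConverter

    # Convert the new fixture and compare its markdown export with the groundtruth file
    converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])
    result = converter.convert(Path("tests/data/xlsx/test-02.xlsx"))
    pred_md = result.document.export_to_markdown()
    gt_md = Path("tests/data/groundtruth/docling_v2/test-02.xlsx.md").read_text()
    assert pred_md.strip() == gt_md.strip(), "markdown export should match groundtruth"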