67 changes: 62 additions & 5 deletions docling/backend/msexcel_backend.py
@@ -290,6 +290,53 @@ def _find_tables_in_sheet(

return doc

def _find_true_data_bounds(self, sheet: Worksheet) -> tuple[int, int, int, int]:
"""Find the true data boundaries (min/max rows and columns) in an Excel worksheet.

This function scans all cells to find the smallest rectangular region that contains
all non-empty cells or merged cell ranges. It returns the minimal and maximal
row/column indices that bound the actual data region.

Args:
sheet: The Excel worksheet to analyze.

Returns:
A tuple of four integers:
(min_row, max_row, min_col, max_col)
representing the smallest rectangle that covers all data and merged cells.
If the sheet is empty, returns (1, 1, 1, 1) by default.
"""
min_row, min_col = None, None
max_row, max_col = 0, 0

# Check all cells for non-empty values
for row in sheet.iter_rows(values_only=False):
for cell in row:
if cell.value is not None:
r, c = cell.row, cell.column
min_row = r if min_row is None else min(min_row, r)
min_col = c if min_col is None else min(min_col, c)
max_row = max(max_row, r)
max_col = max(max_col, c)

# Expand bounds to include merged cells
for merged in sheet.merged_cells.ranges:
min_row = (
merged.min_row if min_row is None else min(min_row, merged.min_row)
)
min_col = (
merged.min_col if min_col is None else min(min_col, merged.min_col)
)
max_row = max(max_row, merged.max_row)
max_col = max(max_col, merged.max_col)

# If no data found, default to (1, 1, 1, 1)
if min_row is None or min_col is None:
min_row = min_col = max_row = max_col = 1

return min_row, max_row, min_col, max_col
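Editor's note (not part of the diff): the helper above can be exercised standalone with openpyxl. The sketch below builds a tiny workbook with a single value at C3 and an otherwise empty merged range E5:F6, then replicates the same bounds computation; the workbook contents are made up purely for illustration.

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws["C3"] = "value"        # the only non-empty cell
    ws.merge_cells("E5:F6")   # empty merged range that still extends the bounds

    min_row = min_col = None
    max_row = max_col = 0
    # Pass 1: scan non-empty cells, exactly as _find_true_data_bounds does
    for row in ws.iter_rows(values_only=False):
        for cell in row:
            if cell.value is not None:
                min_row = cell.row if min_row is None else min(min_row, cell.row)
                min_col = cell.column if min_col is None else min(min_col, cell.column)
                max_row = max(max_row, cell.row)
                max_col = max(max_col, cell.column)
    # Pass 2: expand the rectangle to cover merged ranges
    for rng in ws.merged_cells.ranges:
        min_row = rng.min_row if min_row is None else min(min_row, rng.min_row)
        min_col = rng.min_col if min_col is None else min(min_col, rng.min_col)
        max_row = max(max_row, rng.max_row)
        max_col = max(max_col, rng.max_col)

    print(min_row, max_row, min_col, max_col)  # 3 6 3 6: C3 plus the merged E5:F6 block

The merged-range pass is what keeps a header merged across otherwise empty cells inside the scanned rectangle, even when none of its cells carry a value.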

# Example integration in MsExcelDocumentBackend._find_data_tables
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
"""Find all compact rectangular data tables in an Excel worksheet.

@@ -299,13 +346,24 @@ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
Returns:
A list of ExcelTable objects representing the data tables.
"""
min_row, max_row, min_col, max_col = self._find_true_data_bounds(
sheet
) # The true data boundaries
tables: list[ExcelTable] = [] # List to store found tables
visited: set[tuple[int, int]] = set() # Track already visited cells

# Iterate over all cells in the sheet
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
for rj, cell in enumerate(row):
# Skip empty or already visited cells
# Limit scan to actual data bounds
for ri, row in enumerate(
sheet.iter_rows(
min_row=min_row,
max_row=max_row,
min_col=min_col,
max_col=max_col,
values_only=False,
),
start=min_row - 1,
):
for rj, cell in enumerate(row, start=min_col - 1):
if cell.value is None or (ri, rj) in visited:
continue

@@ -314,7 +372,6 @@ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:

visited.update(visited_cells) # Mark these cells as visited
tables.append(table_bounds)

return tables

def _find_table_bounds(
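A note on the start= offsets in the bounded scan above: enumerate(sheet.iter_rows(min_row=..., ...), start=min_row - 1) keeps (ri, rj) equal to the 0-based absolute coordinates that the old full-sheet scan produced, so _find_table_bounds and the visited set see unchanged indices. A minimal sketch of that equivalence, using hypothetical bounds values:

    min_row, min_col = 3, 2  # hypothetical result of _find_true_data_bounds
    abs_rows = range(min_row, 6)  # stands in for sheet.iter_rows(min_row=3, max_row=5, ...)
    for ri, abs_row in enumerate(abs_rows, start=min_row - 1):
        assert ri == abs_row - 1  # same 0-based row index as the old enumerate from row 1
    abs_cols = range(min_col, 5)  # stands in for the cells of one bounded row
    for rj, abs_col in enumerate(abs_cols, start=min_col - 1):
        assert rj == abs_col - 1  # same 0-based column index as before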
11 changes: 11 additions & 0 deletions tests/data/groundtruth/docling_v2/test-02.xlsx.itxt
@@ -0,0 +1,11 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group sheet: Sheet1
item-2 at level 2: table with [7x3]
item-3 at level 1: section: group sheet: Sheet2
item-4 at level 2: table with [9x4]
item-5 at level 2: table with [5x3]
item-6 at level 2: table with [5x3]
item-7 at level 1: section: group sheet: Sheet3
item-8 at level 2: table with [7x3]
item-9 at level 2: table with [7x3]
item-10 at level 2: picture
3,775 changes: 3,775 additions & 0 deletions tests/data/groundtruth/docling_v2/test-02.xlsx.json

Large diffs are not rendered by default.

53 changes: 53 additions & 0 deletions tests/data/groundtruth/docling_v2/test-02.xlsx.md
@@ -0,0 +1,53 @@
| first | second | third |
|----------|-----------|---------|
| 1 | 5 | 9 |
| 2 | 4 | 6 |
| 3 | 3 | 3 |
| 4 | 2 | 0 |
| 5 | 1 | -3 |
| 6 | 0 | -6 |

| col-1 | col-2 | col-3 | col-4 |
|---------|---------|---------|---------|
| 1 | 2 | 3 | 4 |
| 2 | 4 | 6 | 8 |
| 3 | 6 | 9 | 12 |
| 4 | 8 | 12 | 16 |
| 5 | 10 | 15 | 20 |
| 6 | 12 | 18 | 24 |
| 7 | 14 | 21 | 28 |
| 8 | 16 | 24 | 32 |

| col-1 | col-2 | col-3 |
|---------|---------|---------|
| 1 | 2 | 3 |
| 2 | 4 | 6 |
| 3 | 6 | 9 |
| 4 | 8 | 12 |

| col-1 | col-2 | col-3 |
|---------|---------|---------|
| 1 | 2 | 3 |
| 2 | 4 | 6 |
| 3 | 6 | 9 |
| 4 | 8 | 12 |

| first | header | header |
|----------|----------|----------|
| first | second | third |
| 1 | 2 | 3 |
| 3 | 4 | 5 |
| 3 | 6 | 7 |
| 8 | 9 | 9 |
| 10 | 9 | 9 |

| first (f) | header (f) | header (f) |
|-------------|--------------|--------------|
| first (f) | second | third |
| 1 | 2 | 3 |
| 3 | 4 | 5 |
| 3 | 6 | 7 |
| 8 | 9 | 9 |
| 10 | 9 | 9 |

<!-- image -->
Binary file added tests/data/xlsx/test-02.xlsx
Binary file not shown.
100 changes: 0 additions & 100 deletions tests/test_backend_msexcel.py
@@ -1,100 +0,0 @@
import logging
from pathlib import Path

import pytest

from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument
from docling.document_converter import DocumentConverter

from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export

_log = logging.getLogger(__name__)

GENERATE = GEN_TEST_DATA


def get_excel_paths():
# Define the directory you want to search
directory = Path("./tests/data/xlsx/")

# List all Excel files in the directory and its subdirectories
excel_files = sorted(directory.rglob("*.xlsx")) + sorted(directory.rglob("*.xlsm"))
return excel_files


def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])

return converter


@pytest.fixture(scope="module")
def documents() -> list[tuple[Path, DoclingDocument]]:
documents: list[tuple[Path, DoclingDocument]] = []

excel_paths = get_excel_paths()
converter = get_converter()

for excel_path in excel_paths:
_log.debug(f"converting {excel_path}")

gt_path = (
excel_path.parent.parent / "groundtruth" / "docling_v2" / excel_path.name
)

conv_result: ConversionResult = converter.convert(excel_path)

doc: DoclingDocument = conv_result.document

assert doc, f"Failed to convert document from file {gt_path}"
documents.append((gt_path, doc))

return documents


def test_e2e_excel_conversions(documents) -> None:
for gt_path, doc in documents:
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"

pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
"export to indented-text"
)

assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
"document document"
)


def test_pages(documents) -> None:
"""Test the page count and page size of converted documents.

Args:
documents: The paths and converted documents.
"""
# number of pages from the backend method
path = next(item for item in get_excel_paths() if item.stem == "test-01")
in_doc = InputDocument(
path_or_stream=path,
format=InputFormat.XLSX,
filename=path.stem,
backend=MsExcelDocumentBackend,
)
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
assert backend.page_count() == 4

# number of pages from the converted document
doc = next(item for path, item in documents if path.stem == "test-01")
assert len(doc.pages) == 4

# page sizes as number of cells
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)
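With the module-level Excel tests removed in this diff, here is an editor's sketch (not part of the PR) of how the newly added test-02.xlsx fixture could still be checked against its groundtruth markdown, reusing the converter API from the deleted test; the strict string comparison is a simplification of verify_export.

    from pathlib import Path

    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import DocumentConverter

    # Convert the new fixture and compare its markdown export with the groundtruth file
    converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])
    result = converter.convert(Path("tests/data/xlsx/test-02.xlsx"))
    pred_md = result.document.export_to_markdown()
    gt_md = Path("tests/data/groundtruth/docling_v2/test-02.xlsx.md").read_text()
    assert pred_md.strip() == gt_md.strip(), "markdown export should match groundtruth"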