From 5c4366a0132206760435e672dfeef88419d9fd11 Mon Sep 17 00:00:00 2001
From: Aarif Ansari
Date: Fri, 19 Jun 2026 13:13:01 +0530
Subject: [PATCH] Fix parsing of tables inside SDT content controls

The DOCX backend handled a block-level content control (w:sdt) by
collecting every descendant w:p of its w:sdtContent and emitting each
one as text. A table placed inside a content control was therefore
never passed to _handle_tables; its cell paragraphs were flattened
into plain text items instead.

Walk the direct children of w:sdtContent instead:

  * w:tbl is dispatched to _handle_tables (a failure is logged at
    debug level with the traceback and the table is skipped),
  * w:p is dispatched to _handle_text_elements,
  * any other child, for example a nested w:sdt, keeps the previous
    behaviour: every paragraph below it is emitted as text, so content
    that was extracted before this change is still extracted.

Known limitation: a table inside a *nested* content control is still
flattened to text by the fallback branch.

The tests build their documents with python-docx and insert the w:sdt
wrapper at the position of the wrapped element, so that the body's
trailing w:sectPr remains the last child.

Signed-off-by: Aarif Ansari
---
Review notes (ignored by git am):
  * Dropped the two whitespace-only hunks in _handle_tables and the
    re-wrapping of the element.find(...) call; they were unrelated to
    the fix.
  * This patch was edited by hand during review: the post-image blob
    ids on the "index" lines below are stale.

 docling/backend/msword_backend.py | 33 ++++++++++++++++++-----
 tests/test_backend_msword.py      | 58 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index d05d932ebb..45cc04f864 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -489,13 +489,32 @@ def _in_textbox(elem):
                     ".//w:sdtContent", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
                 )
                 if sdt_content is not None:
-                    # Iterate paragraphs, runs, or text inside <w:sdtContent>.
-                    paragraphs = sdt_content.findall(
-                        ".//w:p", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
-                    )
-                    for p in paragraphs:
-                        te = self._handle_text_elements(p, doc)
-                        added_elements.extend(te)
+                    # Walk the direct children of <w:sdtContent> so that a table
+                    # is parsed as a table instead of being flattened to text.
+                    for child in sdt_content:
+                        child_tag = etree.QName(child).localname
+                        if child_tag == "tbl":
+                            try:
+                                t = self._handle_tables(child, doc)
+                                added_elements.extend(t)
+                            except Exception:
+                                _log.debug(
+                                    "could not parse table inside sdt",
+                                    exc_info=True,
+                                )
+                        elif child_tag == "p":
+                            te = self._handle_text_elements(child, doc)
+                            added_elements.extend(te)
+                        else:
+                            # Other containers (e.g. a nested <w:sdt>) keep the
+                            # previous behaviour: emit every paragraph inside.
+                            nested_paragraphs = child.findall(
+                                ".//w:p",
+                                namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
+                            )
+                            for p in nested_paragraphs:
+                                te = self._handle_text_elements(p, doc)
+                                added_elements.extend(te)
             # Check for Text
             elif tag_name == "p":
                 # "tcPr", "sectPr"
diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py
index 10932f39be..3d0727800f 100644
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -851,3 +851,61 @@ def test_text_after_drawingml_images(documents):
             "Skipping DrawingML text extraction test."
         )
         pytest.skip(f"Test document '{name}' not available")
+
+
+def _wrap_in_sdt(element):
+    """Wrap *element* in a block-level content control at its current position."""
+    from docx.oxml import OxmlElement
+
+    sdt = OxmlElement("w:sdt")
+    sdt_content = OxmlElement("w:sdtContent")
+    sdt.append(OxmlElement("w:sdtPr"))
+    sdt.append(sdt_content)
+    # Insert the wrapper where the element sits so that the trailing <w:sectPr>
+    # of the body remains the last child, then move the element inside.
+    element.addprevious(sdt)
+    sdt_content.append(element)
+    return sdt
+
+
+def test_sdt_table_parsing(tmp_path):
+    """A table inside a content control is converted to a table item."""
+    from docx import Document
+
+    doc = Document()
+    table = doc.add_table(rows=2, cols=2)
+    table.cell(0, 0).text = "Feature"
+    table.cell(0, 1).text = "Action"
+    table.cell(1, 0).text = "Test"
+    table.cell(1, 1).text = "Verify"
+    _wrap_in_sdt(table._tbl)
+
+    docx_path = tmp_path / "sdt_table.docx"
+    doc.save(docx_path)
+
+    conv_result = get_converter().convert(docx_path)
+
+    tables = conv_result.document.tables
+    assert len(tables) == 1
+    assert tables[0].data.num_rows == 2
+    assert tables[0].data.num_cols == 2
+    cell_texts = {cell.text.strip() for cell in tables[0].data.table_cells}
+    assert {"Feature", "Action", "Test", "Verify"} <= cell_texts
+
+
+def test_sdt_nested_paragraph_parsing(tmp_path):
+    """Text inside a nested content control is still extracted."""
+    from docx import Document
+
+    doc = Document()
+    paragraph = doc.add_paragraph("Nested SDT text")
+    # Two levels of wrapping: the paragraph is not a direct child of the
+    # outer <w:sdtContent>.
+    _wrap_in_sdt(_wrap_in_sdt(paragraph._p))
+
+    docx_path = tmp_path / "sdt_nested.docx"
+    doc.save(docx_path)
+
+    conv_result = get_converter().convert(docx_path)
+
+    assert "Nested SDT text" in conv_result.document.export_to_markdown()