diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index f25f356681..080b1652fa 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -906,21 +906,55 @@ def _handle_equations_in_text(self, element, text): only_texts = [] only_equations = [] texts_and_equations = [] - for subt in element.iter(): - tag_name = etree.QName(subt).localname - if tag_name == "t" and "math" not in subt.tag: - if isinstance(subt.text, str): - only_texts.append(subt.text) - texts_and_equations.append(subt.text) - elif "oMath" in subt.tag and "oMathPara" not in subt.tag: - latex_equation = str(oMath2Latex(subt)).strip() - if len(latex_equation) > 0: - only_equations.append( - self.equation_bookends.format(EQ=latex_equation) - ) - texts_and_equations.append( - self.equation_bookends.format(EQ=latex_equation) - ) + + # Collect oMath elements and text runs from the paragraph. + # Use direct children iteration first; fall back to deep iteration + # only if no oMath elements are found at the direct level. + direct_omaths = [ + child + for child in element + if "oMath" in child.tag and "oMathPara" not in child.tag + ] + + if direct_omaths: + # Iterate direct children to preserve sibling order and avoid + # processing nested oMath descendants of an already-converted node. + for child in element: + if "oMath" in child.tag and "oMathPara" not in child.tag: + latex_equation = str(oMath2Latex(child)).strip() + if len(latex_equation) > 0: + only_equations.append( + self.equation_bookends.format(EQ=latex_equation) + ) + texts_and_equations.append( + self.equation_bookends.format(EQ=latex_equation) + ) + else: + # Collect text from non-math children (e.g. 
runs) + for t_elem in child.iter(): + t_tag = etree.QName(t_elem).localname + if t_tag == "t" and "math" not in t_elem.tag: + if isinstance(t_elem.text, str): + only_texts.append(t_elem.text) + texts_and_equations.append(t_elem.text) + else: + # Original deep-iteration fallback for nested oMath (e.g. + # inside oMathPara or other wrapper elements). + for subt in element.iter(): + tag_name = etree.QName(subt).localname + if tag_name == "t" and "math" not in subt.tag: + if isinstance(subt.text, str): + only_texts.append(subt.text) + texts_and_equations.append(subt.text) + elif "oMath" in subt.tag and "oMathPara" not in subt.tag: + latex_equation = str(oMath2Latex(subt)).strip() + if len(latex_equation) > 0: + only_equations.append( + self.equation_bookends.format(EQ=latex_equation) + ) + texts_and_equations.append( + self.equation_bookends.format(EQ=latex_equation) + ) if len(only_equations) < 1: return text, [] @@ -1055,15 +1089,28 @@ def _handle_text_elements( if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len( text ) > 0: - # Standalone equation + # Standalone equation(s) — emit each as a separate formula level = self._get_level() - t1 = doc.add_text( - label=DocItemLabel.FORMULA, - parent=self.parents[level - 1], - text=text.replace("<eq>", "").replace("</eq>", ""), - content_layer=self.content_layer, - ) - elem_ref.append(t1.get_ref()) + parent = self.parents[level - 1] + if len(equations) > 1: + for eq in equations: + eq_text = eq.replace("<eq>", "").replace("</eq>", "").strip() + if len(eq_text) > 0: + t1 = doc.add_text( + label=DocItemLabel.FORMULA, + parent=parent, + text=eq_text, + content_layer=self.content_layer, + ) + elem_ref.append(t1.get_ref()) + else: + t1 = doc.add_text( + label=DocItemLabel.FORMULA, + parent=parent, + text=text.replace("<eq>", "").replace("</eq>", ""), + content_layer=self.content_layer, + ) + elem_ref.append(t1.get_ref()) else: # Inline equation level = self._get_level() diff --git a/tests/data/docx/omml_multi_equation_paragraph.docx 
b/tests/data/docx/omml_multi_equation_paragraph.docx new file mode 100644 index 0000000000..0629e858fd Binary files /dev/null and b/tests/data/docx/omml_multi_equation_paragraph.docx differ diff --git a/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.itxt b/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.itxt new file mode 100644 index 0000000000..bbda4b82cf --- /dev/null +++ b/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.itxt @@ -0,0 +1,7 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-0 + item-2 at level 2: section_header: Issue 3: Concatenated equation blocks + item-3 at level 3: text: The paragraph below contains thr ... ts are siblings inside a single <w:p>. + item-4 at level 3: formula: a=b + item-5 at level 3: formula: c=d + item-6 at level 3: formula: e=f \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.json b/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.json new file mode 100644 index 0000000000..3839623ee5 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.json @@ -0,0 +1,132 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.9.0", + "name": "omml_multi_equation_paragraph", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 17520448227351822398, + "filename": "omml_multi_equation_paragraph.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + } + ], + "content_layer": 
"body", + "name": "header-0", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Issue 3: Concatenated equation blocks", + "text": "Issue 3: Concatenated equation blocks", + "level": 1 + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "The paragraph below contains three separate <m:oMath> elements.\nExpected: three separate $$ blocks ($$a = b$$, $$c = d$$, $$e = f$$)\nDocling produces: one $$ block with all equations concatenated.\n\nAll three <m:oMath> elements are siblings inside a single <w:p>.", + "text": "The paragraph below contains three separate <m:oMath> elements.\nExpected: three separate $$ blocks ($$a = b$$, $$c = d$$, $$e = f$$)\nDocling produces: one $$ block with all equations concatenated.\n\nAll three <m:oMath> elements are siblings inside a single <w:p>.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "a=b", + "text": "a=b" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "c=d", + "text": "c=d" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "e=f", + "text": "e=f" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No 
newline at end of file diff --git 
a/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.md b/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.md new file mode 100644 index 0000000000..dd2f3f1e28 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/omml_multi_equation_paragraph.docx.md @@ -0,0 +1,13 @@ +## Issue 3: Concatenated equation blocks + +The paragraph below contains three separate <m:oMath> elements. +Expected: three separate $$ blocks ($$a = b$$, $$c = d$$, $$e = f$$) +Docling produces: one $$ block with all equations concatenated. + +All three <m:oMath> elements are siblings inside a single <w:p>. + +$$a=b$$ + +$$c=d$$ + +$$e=f$$ \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 06df14cb9f..9a175ca86b 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -2,10 +2,13 @@ import os import warnings from pathlib import Path +from types import SimpleNamespace import pytest from docling_core.types.doc import GroupItem +from lxml import etree +import docling.backend.msword_backend as msword_backend_module from docling.backend.docx.drawingml.utils import get_libreoffice_cmd from docling.backend.msword_backend import MsWordDocumentBackend from docling.datamodel.base_models import InputFormat @@ -44,6 +47,17 @@ def get_converter(): return converter +@pytest.fixture(scope="module") +def backend(docx_paths) -> MsWordDocumentBackend: + docx_path = docx_paths[0] + in_doc = InputDocument( + path_or_stream=docx_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + return in_doc._backend + + @pytest.fixture(scope="module") def documents(docx_paths) -> list[tuple[Path, DoclingDocument]]: documents: list[dict[Path, DoclingDocument]] = [] @@ -464,3 +478,119 @@ def test_list_counter_and_enum_marker(docx_paths): assert backend.list_counters[(1, 0)] == 0 assert backend.list_counters[(1, 1)] == 0 assert backend.list_counters[(2, 0)] == 1 # unaffected + + +def 
test_handle_equations_in_text_returns_original_text_on_mismatch( + backend, monkeypatch +): + element = etree.Element("p") + run = etree.SubElement(element, "r") + text_elem = etree.SubElement(run, "t") + text_elem.text = "alpha" + etree.SubElement(element, "oMath") + + monkeypatch.setattr(msword_backend_module, "oMath2Latex", lambda _: "x") + + text, equations = backend._handle_equations_in_text(element=element, text="beta") + + assert text == "beta" + assert equations == [] + + +def test_handle_equations_in_text_skips_empty_substrings(backend, monkeypatch): + equation = backend.equation_bookends.format(EQ="x") + + element = etree.Element("p") + empty_run = etree.SubElement(element, "r") + empty_text = etree.SubElement(empty_run, "t") + empty_text.text = "" + etree.SubElement(element, "oMath") + tail_run = etree.SubElement(element, "r") + tail_text = etree.SubElement(tail_run, "t") + tail_text.text = "tail" + + monkeypatch.setattr(msword_backend_module, "oMath2Latex", lambda _: "x") + + text, equations = backend._handle_equations_in_text(element=element, text="tail") + + assert equations == [equation] + assert text == f"{equation}tail" + + +def test_handle_text_elements_returns_empty_refs_when_text_is_none( + backend, monkeypatch +): + element = backend.docx_obj.paragraphs[0]._element + + monkeypatch.setattr( + backend, "_handle_equations_in_text", lambda element, text: (None, []) + ) + + refs = backend._handle_text_elements(element, DoclingDocument(name="test")) + + assert refs == [] + + +def test_handle_text_elements_heading_defaults_to_non_numbered_when_style_missing( + backend, monkeypatch +): + captured: dict[str, tuple[int, str, bool]] = {} + + class FakeParagraph: + def __init__(self, element, docx_obj): + self.text = "Heading text" + self.style = SimpleNamespace() + + monkeypatch.setattr(msword_backend_module, "Paragraph", FakeParagraph) + monkeypatch.setattr(backend, "_get_paragraph_elements", lambda paragraph: []) + monkeypatch.setattr( + backend, 
"_handle_equations_in_text", lambda element, text: (text, []) + ) + monkeypatch.setattr(backend, "_get_comment_ids_for_element", lambda element: []) + monkeypatch.setattr( + backend, "_get_label_and_level", lambda paragraph: ("Heading", 1) + ) + monkeypatch.setattr(backend, "_get_numId_and_ilvl", lambda paragraph: (None, None)) + + def fake_add_heading(doc, level, text, is_numbered_style): + captured["heading"] = (level, text, is_numbered_style) + return [] + + monkeypatch.setattr(backend, "_add_heading", fake_add_heading) + + refs = backend._handle_text_elements(object(), DoclingDocument(name="test")) + + assert refs == [] + assert captured["heading"] == (1, "Heading text", False) + + +def test_handle_text_elements_inline_equations_stop_when_text_is_consumed( + backend, monkeypatch +): + equation_one = backend.equation_bookends.format(EQ="a") + equation_two = backend.equation_bookends.format(EQ="b") + + class FakeParagraph: + def __init__(self, element, docx_obj): + self.text = "inline eq" + self.style = SimpleNamespace() + + monkeypatch.setattr(msword_backend_module, "Paragraph", FakeParagraph) + monkeypatch.setattr(backend, "_get_paragraph_elements", lambda paragraph: []) + monkeypatch.setattr( + backend, + "_handle_equations_in_text", + lambda element, text: (equation_one, [equation_one, equation_two]), + ) + monkeypatch.setattr(backend, "_get_comment_ids_for_element", lambda element: []) + monkeypatch.setattr( + backend, "_get_label_and_level", lambda paragraph: ("Normal", None) + ) + monkeypatch.setattr(backend, "_get_numId_and_ilvl", lambda paragraph: (None, None)) + monkeypatch.setattr(backend, "_prev_numid", lambda: None) + monkeypatch.setattr(backend, "_get_level", lambda: 1) + backend.parents[0] = None + + refs = backend._handle_text_elements(object(), DoclingDocument(name="test")) + + assert len(refs) == 2