Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 70 additions & 23 deletions docling/backend/msword_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -906,21 +906,55 @@ def _handle_equations_in_text(self, element, text):
only_texts = []
only_equations = []
texts_and_equations = []
for subt in element.iter():
tag_name = etree.QName(subt).localname
if tag_name == "t" and "math" not in subt.tag:
if isinstance(subt.text, str):
only_texts.append(subt.text)
texts_and_equations.append(subt.text)
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
latex_equation = str(oMath2Latex(subt)).strip()
if len(latex_equation) > 0:
only_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
texts_and_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)

# Collect oMath elements and text runs from the paragraph.
# Use direct children iteration first; fall back to deep iteration
# only if no oMath elements are found at the direct level.
direct_omaths = [
child
for child in element
if "oMath" in child.tag and "oMathPara" not in child.tag
]

if direct_omaths:
# Iterate direct children to preserve sibling order and avoid
# processing nested oMath descendants of an already-converted node.
for child in element:
if "oMath" in child.tag and "oMathPara" not in child.tag:
latex_equation = str(oMath2Latex(child)).strip()
if len(latex_equation) > 0:
only_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
texts_and_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
else:
# Collect text from non-math children (e.g. <w:r> runs)
for t_elem in child.iter():
t_tag = etree.QName(t_elem).localname
if t_tag == "t" and "math" not in t_elem.tag:
if isinstance(t_elem.text, str):
only_texts.append(t_elem.text)
texts_and_equations.append(t_elem.text)
else:
# Original deep-iteration fallback for nested oMath (e.g.
# inside oMathPara or other wrapper elements).
for subt in element.iter():
tag_name = etree.QName(subt).localname
if tag_name == "t" and "math" not in subt.tag:
if isinstance(subt.text, str):
only_texts.append(subt.text)
texts_and_equations.append(subt.text)
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
latex_equation = str(oMath2Latex(subt)).strip()
if len(latex_equation) > 0:
only_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
texts_and_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)

if len(only_equations) < 1:
return text, []
Expand Down Expand Up @@ -1055,15 +1089,28 @@ def _handle_text_elements(
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
text
) > 0:
# Standalone equation
# Standalone equation(s) — emit each as a separate formula
level = self._get_level()
t1 = doc.add_text(
label=DocItemLabel.FORMULA,
parent=self.parents[level - 1],
text=text.replace("<eq>", "").replace("</eq>", ""),
content_layer=self.content_layer,
)
elem_ref.append(t1.get_ref())
parent = self.parents[level - 1]
if len(equations) > 1:
for eq in equations:
eq_text = eq.replace("<eq>", "").replace("</eq>", "").strip()
if len(eq_text) > 0:
t1 = doc.add_text(
label=DocItemLabel.FORMULA,
parent=parent,
text=eq_text,
content_layer=self.content_layer,
)
elem_ref.append(t1.get_ref())
else:
t1 = doc.add_text(
label=DocItemLabel.FORMULA,
parent=parent,
text=text.replace("<eq>", "").replace("</eq>", ""),
content_layer=self.content_layer,
)
elem_ref.append(t1.get_ref())
else:
# Inline equation
level = self._get_level()
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-0
item-2 at level 2: section_header: Issue 3: Concatenated equation blocks
item-3 at level 3: text: The paragraph below contains thr ... ts are siblings inside a single <w:p>.
item-4 at level 3: formula: a=b
item-5 at level 3: formula: c=d
item-6 at level 3: formula: e=f
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{
"schema_name": "DoclingDocument",
"version": "1.9.0",
"name": "omml_multi_equation_paragraph",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 17520448227351822398,
"filename": "omml_multi_equation_paragraph.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "header-0",
"label": "section"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Issue 3: Concatenated equation blocks",
"text": "Issue 3: Concatenated equation blocks",
"level": 1
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "The paragraph below contains three separate <m:oMath> elements.\nExpected: three separate $$ blocks ($$a = b$$, $$c = d$$, $$e = f$$)\nDocling produces: one $$ block with all equations concatenated.\n\nAll three <m:oMath> elements are siblings inside a single <w:p>.",
"text": "The paragraph below contains three separate <m:oMath> elements.\nExpected: three separate $$ blocks ($$a = b$$, $$c = d$$, $$e = f$$)\nDocling produces: one $$ block with all equations concatenated.\n\nAll three <m:oMath> elements are siblings inside a single <w:p>.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "a=b",
"text": "a=b"
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "c=d",
"text": "c=d"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "e=f",
"text": "e=f"
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Issue 3: Concatenated equation blocks

The paragraph below contains three separate &lt;m:oMath&gt; elements.
Expected: three separate $$ blocks ($$a = b$$, $$c = d$$, $$e = f$$)
Docling produces: one $$ block with all equations concatenated.

All three &lt;m:oMath&gt; elements are siblings inside a single &lt;w:p&gt;.

$$a=b$$

$$c=d$$

$$e=f$$
Loading
Loading