Skip to content

Commit 23ba52f

Browse files
author
Maksym Lysak
committed
Fixes to address PR reviews
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
1 parent 57d2dfb commit 23ba52f

16 files changed

Lines changed: 6329 additions & 6161 deletions

docling/backend/html_backend.py

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,13 +1301,21 @@ def _is_rich_table_cell(self, table_cell: Tag) -> bool:
13011301
has_custom_checkbox = any(
13021302
self._is_custom_checkbox_tag(child) for child in children
13031303
)
1304+
has_line_break = any(child.name == "br" for child in children)
1305+
direct_block_text_children = [
1306+
child
1307+
for child in table_cell.find_all(recursive=False)
1308+
if isinstance(child, Tag) and child.name in {"p", "div", "li"}
1309+
]
13041310
has_nested_form_semantic_id = any(
13051311
self._is_form_semantic_tag(child)
13061312
for child in children
13071313
if isinstance(child, Tag)
13081314
)
13091315
if has_nested_form_semantic_id:
13101316
return True
1317+
if has_line_break or len(direct_block_text_children) > 1:
1318+
return True
13111319
if not children:
13121320
content = [
13131321
item
@@ -1528,6 +1536,7 @@ def _flush_buffer() -> None:
15281536

15291537
for node in element.contents:
15301538
if isinstance(node, Tag):
1539+
name = node.name.lower()
15311540
if form_field := self._consume_form_field_for_tag(node):
15321541
_flush_buffer()
15331542
added_refs.extend(
@@ -1539,8 +1548,11 @@ def _flush_buffer() -> None:
15391548
)
15401549
continue
15411550
if self._is_suppressed_tag(node):
1551+
if name == "br":
1552+
# Keep explicit line breaks as text boundaries even when
1553+
# the <br> tag itself has no rendered bbox.
1554+
_flush_buffer()
15421555
continue
1543-
name = node.name.lower()
15441556
has_block_descendants = bool(
15451557
node.find(_BLOCK_TAGS)
15461558
or node.find("input")
@@ -1610,16 +1622,28 @@ def _flush_buffer() -> None:
16101622
_flush_buffer()
16111623
wk4 = self._walk(node, doc)
16121624
added_refs.extend(wk4)
1613-
else:
1625+
elif self._should_buffer_tag_text_inline(node):
16141626
buffer.extend(
16151627
self._extract_text_and_hyperlink_recursively(
16161628
node, find_parent_annotation=True, keep_newlines=False
16171629
)
16181630
)
1631+
else:
1632+
_flush_buffer()
1633+
wk5 = self._walk(node, doc)
1634+
added_refs.extend(wk5)
16191635
elif isinstance(node, NavigableString) and not isinstance(
16201636
node, PreformattedString
16211637
):
1622-
if str(node).strip("\n\r") == "":
1638+
node_text = str(node)
1639+
if node_text.strip("\n\r") == "":
1640+
parent_tag = node.parent if isinstance(node.parent, Tag) else None
1641+
if (
1642+
parent_tag is not None
1643+
and parent_tag.name in {"td", "th"}
1644+
and "\n" in node_text
1645+
):
1646+
_flush_buffer()
16231647
continue
16241648
else:
16251649
buffer.extend(
@@ -2675,6 +2699,26 @@ def _get_tag_classes(tag: Tag) -> set[str]:
26752699
return {classes}
26762700
return {str(value) for value in classes if isinstance(value, str)}
26772701

2702+
@staticmethod
2703+
def _has_inline_display_style(tag: Tag) -> bool:
2704+
style_attr = tag.get("style")
2705+
if not isinstance(style_attr, str):
2706+
return False
2707+
display_match = re.search(r"display\s*:\s*([^;]+)", style_attr, flags=re.I)
2708+
if display_match is None:
2709+
return False
2710+
display_value = display_match.group(1).strip().lower()
2711+
return display_value.startswith("inline") or display_value == "contents"
2712+
2713+
def _should_buffer_tag_text_inline(self, tag: Tag) -> bool:
    """Decide whether a tag's text is buffered inline rather than walked.

    Args:
        tag: Element being considered.

    Returns:
        True when the lower-cased tag name is in ``_INLINE_HTML_TAGS``, or
        when the tag is a ``div`` for which ``_has_inline_display_style``
        returns True; False otherwise.
    """
    lowered_name = tag.name.lower()
    # A div explicitly styled as inline is treated like an inline wrapper.
    return lowered_name in _INLINE_HTML_TAGS or (
        lowered_name == "div" and self._has_inline_display_style(tag)
    )
2721+
26782722
@staticmethod
26792723
def _is_input_checkbox_or_radio_tag(tag: Tag) -> bool:
26802724
if tag.name != "input":
@@ -3610,6 +3654,7 @@ def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]:
36103654
key_prov = values[0].prov
36113655

36123656
component_tag_obj_ids: set[int] = {id(value.tag) for value in values}
3657+
component_tag_obj_ids.update(id(tag) for tag in value_tags)
36133658
if key_tag is not None:
36143659
component_tag_obj_ids.add(id(key_tag))
36153660
if marker is not None:
@@ -3872,6 +3917,7 @@ def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem
38723917
consumed_tag_obj_ids: set[int] = set()
38733918
fields_by_key_id: dict[str, _ExtractedFormField] = {}
38743919
if form_region is not None:
3920+
consumed_tag_ids.update(form_region.consumed_tag_ids)
38753921
for field in form_region.fields:
38763922
key_tag_id = self._get_html_id(field.key_tag)
38773923
if not field.values:
Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
item-0 at level 0: unspecified: group _root_
2-
item-1 at level 1: text: This is a div with text. This is another div with text.
3-
item-2 at level 1: text: This is a regular paragraph.
4-
item-3 at level 1: text: This is a third div with a new line.
5-
item-4 at level 1: section: group details
6-
item-5 at level 2: text: Heading for the details element
7-
item-6 at level 2: text: Description of the details element.
8-
item-7 at level 1: inline: group group
9-
item-8 at level 2: text: This is a fourth div with a
10-
item-9 at level 2: text: bold
11-
item-10 at level 2: text: paragraph.
2+
item-1 at level 1: text: This is a div with text.
3+
item-2 at level 1: text: This is another div with text.
4+
item-3 at level 1: text: This is a regular paragraph.
5+
item-4 at level 1: text: This is a third div with a new line.
6+
item-5 at level 1: section: group details
7+
item-6 at level 2: text: Heading for the details element
8+
item-7 at level 2: text: Description of the details element.
9+
item-8 at level 1: inline: group group
10+
item-9 at level 2: text: This is a fourth div with a
11+
item-10 at level 2: text: bold
12+
item-11 at level 2: text: paragraph.

tests/data/groundtruth/docling_v2/example_06.html.json

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
{
3030
"$ref": "#/texts/3"
3131
},
32+
{
33+
"$ref": "#/texts/4"
34+
},
3235
{
3336
"$ref": "#/groups/0"
3437
},
@@ -48,10 +51,10 @@
4851
},
4952
"children": [
5053
{
51-
"$ref": "#/texts/4"
54+
"$ref": "#/texts/5"
5255
},
5356
{
54-
"$ref": "#/texts/5"
57+
"$ref": "#/texts/6"
5558
}
5659
],
5760
"content_layer": "body",
@@ -64,14 +67,14 @@
6467
"$ref": "#/body"
6568
},
6669
"children": [
67-
{
68-
"$ref": "#/texts/6"
69-
},
7070
{
7171
"$ref": "#/texts/7"
7272
},
7373
{
7474
"$ref": "#/texts/8"
75+
},
76+
{
77+
"$ref": "#/texts/9"
7578
}
7679
],
7780
"content_layer": "body",
@@ -101,8 +104,8 @@
101104
"content_layer": "body",
102105
"label": "text",
103106
"prov": [],
104-
"orig": "This is a div with text. This is another div with text.",
105-
"text": "This is a div with text. This is another div with text."
107+
"orig": "This is a div with text.",
108+
"text": "This is a div with text."
106109
},
107110
{
108111
"self_ref": "#/texts/2",
@@ -113,11 +116,23 @@
113116
"content_layer": "body",
114117
"label": "text",
115118
"prov": [],
119+
"orig": "This is another div with text.",
120+
"text": "This is another div with text."
121+
},
122+
{
123+
"self_ref": "#/texts/3",
124+
"parent": {
125+
"$ref": "#/body"
126+
},
127+
"children": [],
128+
"content_layer": "body",
129+
"label": "text",
130+
"prov": [],
116131
"orig": "This is a regular paragraph.",
117132
"text": "This is a regular paragraph."
118133
},
119134
{
120-
"self_ref": "#/texts/3",
135+
"self_ref": "#/texts/4",
121136
"parent": {
122137
"$ref": "#/body"
123138
},
@@ -129,7 +144,7 @@
129144
"text": "This is a third div with a new line."
130145
},
131146
{
132-
"self_ref": "#/texts/4",
147+
"self_ref": "#/texts/5",
133148
"parent": {
134149
"$ref": "#/groups/0"
135150
},
@@ -141,7 +156,7 @@
141156
"text": "Heading for the details element"
142157
},
143158
{
144-
"self_ref": "#/texts/5",
159+
"self_ref": "#/texts/6",
145160
"parent": {
146161
"$ref": "#/groups/0"
147162
},
@@ -153,7 +168,7 @@
153168
"text": "Description of the details element."
154169
},
155170
{
156-
"self_ref": "#/texts/6",
171+
"self_ref": "#/texts/7",
157172
"parent": {
158173
"$ref": "#/groups/1"
159174
},
@@ -165,7 +180,7 @@
165180
"text": "This is a fourth div with a"
166181
},
167182
{
168-
"self_ref": "#/texts/7",
183+
"self_ref": "#/texts/8",
169184
"parent": {
170185
"$ref": "#/groups/1"
171186
},
@@ -184,7 +199,7 @@
184199
}
185200
},
186201
{
187-
"self_ref": "#/texts/8",
202+
"self_ref": "#/texts/9",
188203
"parent": {
189204
"$ref": "#/groups/1"
190205
},

tests/data/groundtruth/docling_v2/example_06.html.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
This is a div with text. This is another div with text.
1+
This is a div with text.
2+
3+
This is another div with text.
24

35
This is a regular paragraph.
46

tests/data/groundtruth/docling_v2/html_rich_table_cells.html.itxt

Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -20,30 +20,33 @@ item-0 at level 0: unspecified: group _root_
2020
item-19 at level 5: list_item: Build nest on ground
2121
item-20 at level 2: table with [4x2]
2222
item-21 at level 3: unspecified: group rich_cell_group_2_0_1
23-
item-22 at level 4: text: Aythya (Diving ducks)
24-
item-23 at level 3: unspecified: group rich_cell_group_2_0_2
25-
item-24 at level 4: text: Lophonetta (Pintail group)
26-
item-25 at level 3: unspecified: group rich_cell_group_2_0_3
27-
item-26 at level 4: text: Oxyura (Benthic ducks)
28-
item-27 at level 2: table with [4x2]
29-
item-28 at level 3: unspecified: group rich_cell_group_3_0_1
30-
item-29 at level 4: text: Swim
31-
item-30 at level 3: unspecified: group rich_cell_group_3_0_1
32-
item-31 at level 4: text: Gracefully glide on H
33-
item-32 at level 4: text: 2
34-
item-33 at level 4: text: O surfaces.
35-
item-34 at level 3: unspecified: group rich_cell_group_3_0_2
36-
item-35 at level 4: text: Fly
37-
item-36 at level 3: unspecified: group rich_cell_group_3_0_3
38-
item-37 at level 4: text: Quack
39-
item-38 at level 3: unspecified: group rich_cell_group_4_0_3
40-
item-39 at level 4: table with [3x2]
41-
item-40 at level 2: table with [5x3]
42-
item-41 at level 3: unspecified: group rich_cell_group_5_1_1
43-
item-42 at level 4: text: View PNG
44-
item-43 at level 3: unspecified: group rich_cell_group_5_1_2
45-
item-44 at level 4: picture
46-
item-44 at level 5: caption: White-headed duck thumbnail
47-
item-45 at level 3: unspecified: group rich_cell_group_5_1_3
48-
item-46 at level 4: text: View Full-Size Image
49-
item-47 at level 1: caption: White-headed duck thumbnail
23+
item-22 at level 4: text: Aythya
24+
item-23 at level 4: text: (Diving ducks)
25+
item-24 at level 3: unspecified: group rich_cell_group_2_0_2
26+
item-25 at level 4: text: Lophonetta
27+
item-26 at level 4: text: (Pintail group)
28+
item-27 at level 3: unspecified: group rich_cell_group_2_0_3
29+
item-28 at level 4: text: Oxyura
30+
item-29 at level 4: text: (Benthic ducks)
31+
item-30 at level 2: table with [4x2]
32+
item-31 at level 3: unspecified: group rich_cell_group_3_0_1
33+
item-32 at level 4: text: Swim
34+
item-33 at level 3: unspecified: group rich_cell_group_3_0_1
35+
item-34 at level 4: text: Gracefully glide on H
36+
item-35 at level 4: text: 2
37+
item-36 at level 4: text: O surfaces.
38+
item-37 at level 3: unspecified: group rich_cell_group_3_0_2
39+
item-38 at level 4: text: Fly
40+
item-39 at level 3: unspecified: group rich_cell_group_3_0_3
41+
item-40 at level 4: text: Quack
42+
item-41 at level 3: unspecified: group rich_cell_group_4_0_3
43+
item-42 at level 4: table with [3x2]
44+
item-43 at level 2: table with [5x3]
45+
item-44 at level 3: unspecified: group rich_cell_group_5_1_1
46+
item-45 at level 4: text: View PNG
47+
item-46 at level 3: unspecified: group rich_cell_group_5_1_2
48+
item-47 at level 4: picture
49+
item-47 at level 5: caption: White-headed duck thumbnail
50+
item-48 at level 3: unspecified: group rich_cell_group_5_1_3
51+
item-49 at level 4: text: View Full-Size Image
52+
item-50 at level 1: caption: White-headed duck thumbnail

0 commit comments

Comments
 (0)