Skip to content

Commit 211b82a

Browse files
committed
feat: Visualize reading-order inside pictures
Signed-off-by: Christoph Auer <[email protected]>
1 parent 1e2ff30 commit 211b82a

16 files changed

+88
-52
lines changed

docling_core/transforms/visualizer/reading_order_visualizer.py

Lines changed: 64 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@
1010
from typing_extensions import override
1111

1212
from docling_core.transforms.visualizer.base import BaseVisualizer
13-
from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
13+
from docling_core.types.doc.document import (
14+
ContentLayer,
15+
DocItem,
16+
DoclingDocument,
17+
PictureItem,
18+
)
1419

1520

1621
class _NumberDrawingData(BaseModel):
@@ -33,6 +38,20 @@ class Params(BaseModel):
3338
base_visualizer: Optional[BaseVisualizer] = None
3439
params: Params = Params()
3540

41+
def _get_picture_context(
    self, elem: DocItem, doc: DoclingDocument
) -> Optional[str]:
    """Return the ``self_ref`` of the picture that encloses ``elem``, if any.

    The parent chain of ``elem`` is followed upwards, resolving each parent
    reference against ``doc``. The walk ends as soon as one of the following
    holds:

    * the resolved ancestor is a ``PictureItem`` -> its ``self_ref`` is
      returned;
    * the resolved ancestor is not a ``DocItem`` -> ``None`` is returned
      (the walk does not continue past such an ancestor);
    * there is no further parent reference -> ``None`` is returned.
    """
    parent_ref = elem.parent
    while parent_ref is not None:
        ancestor = parent_ref.resolve(doc)
        # The picture check comes first, so the nearest enclosing picture wins.
        if isinstance(ancestor, PictureItem):
            return ancestor.self_ref
        # Only DocItem ancestors are climbed through; anything else ends the
        # search without a picture context.
        if not isinstance(ancestor, DocItem):
            return None
        parent_ref = ancestor.parent
    return None
54+
3655
def _draw_arrow(
3756
self,
3857
draw: ImageDraw.ImageDraw,
@@ -55,10 +74,10 @@ def _draw_arrow(
5574
# Calculate the arrowhead points
5675
dx = end_point[0] - start_point[0]
5776
dy = end_point[1] - start_point[1]
58-
angle = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft
77+
distance = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft
5978

6079
# Normalized direction vector for the arrow shaft
61-
ux, uy = dx / angle, dy / angle
80+
ux, uy = dx / distance, dy / distance
6281

6382
# Base of the arrowhead
6483
base_x = end_point[0] - ux * arrowhead_length
@@ -89,16 +108,34 @@ def _draw_doc_reading_order(
89108
except OSError:
90109
# Fallback to default font if arial is not available
91110
font = ImageFont.load_default()
92-
x0, y0 = None, None
93-
number_data_to_draw: dict[Optional[int], list[_NumberDrawingData]] = {}
94-
my_images: dict[Optional[int], Image] = images or {}
95-
prev_page = None
96-
i = 0
111+
112+
# Separate reading order paths for outside vs inside pictures
113+
# Key: (page_no, picture_ref_or_None) -> (x0, y0, element_index)
114+
# picture_ref is None for elements outside any picture, otherwise the picture's self_ref
115+
reading_order_state: dict[
116+
tuple[int, Optional[str]], tuple[float, float, int]
117+
] = {}
118+
number_data_to_draw: dict[int, list[_NumberDrawingData]] = {}
119+
# Only int keys are used (from prov.page_no), even if input images has Optional[int] keys
120+
my_images: dict[int, Image] = {
121+
k: v for k, v in (images or {}).items() if k is not None
122+
}
123+
prev_page: Optional[int] = None
124+
element_index = 0
125+
97126
for elem, _ in doc.iterate_items(
98127
included_content_layers=self.params.content_layers,
128+
traverse_pictures=True,
99129
):
100130
if not isinstance(elem, DocItem):
101131
continue
132+
133+
picture_ref = self._get_picture_context(elem, doc)
134+
# Include all elements in reading order:
135+
# - Top-level PictureItems are part of the outer reading order (picture_ref is None)
136+
# - Nested PictureItems are part of their parent picture's reading order (picture_ref is not None)
137+
# - Other elements follow the same pattern
138+
102139
if len(elem.prov) == 0:
103140
continue # Skip elements without provenances
104141

@@ -110,9 +147,9 @@ def _draw_doc_reading_order(
110147
number_data_to_draw[page_no] = []
111148

112149
if image is None or prev_page is None or page_no != prev_page:
113-
# new page begins
150+
# new page begins - reset all reading order paths
114151
prev_page = page_no
115-
x0 = y0 = None
152+
reading_order_state.clear()
116153

117154
if image is None:
118155
page_image = doc.pages[page_no].image
@@ -140,35 +177,34 @@ def _draw_doc_reading_order(
140177
if ro_bbox.b > ro_bbox.t:
141178
ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
142179

143-
if x0 is None and y0 is None:
144-
# is_root= True
145-
x0 = (ro_bbox.l + ro_bbox.r) / 2.0
146-
y0 = (ro_bbox.b + ro_bbox.t) / 2.0
180+
path_key = (page_no, picture_ref)
181+
state = reading_order_state.get(path_key)
147182

183+
x1 = (ro_bbox.l + ro_bbox.r) / 2.0
184+
y1 = (ro_bbox.b + ro_bbox.t) / 2.0
185+
186+
if state is None:
187+
# Start of a new reading order path (outside or inside picture)
188+
reading_order_state[path_key] = (x1, y1, element_index)
148189
number_data_to_draw[page_no].append(
149190
_NumberDrawingData(
150-
xy=(x0, y0),
151-
text=f"{i}",
191+
xy=(x1, y1),
192+
text=f"{element_index}",
152193
)
153194
)
154-
i += 1
155-
195+
element_index += 1
156196
else:
157-
# is_root = False
158-
assert x0 is not None
159-
assert y0 is not None
160-
161-
x1 = (ro_bbox.l + ro_bbox.r) / 2.0
162-
y1 = (ro_bbox.b + ro_bbox.t) / 2.0
163-
197+
# Continue existing reading order path
198+
x0, y0, _ = state
199+
# Use different color for picture-internal paths
200+
arrow_color = "blue" if picture_ref is not None else "red"
164201
draw = self._draw_arrow(
165202
draw=draw,
166203
arrow_coords=(x0, y0, x1, y1),
167204
line_width=2,
168-
color="red",
205+
color=arrow_color,
169206
)
170-
171-
x0, y0 = x1, y1
207+
reading_order_state[path_key] = (x1, y1, state[2])
172208

173209
if self.params.show_branch_numbering:
174210
# post-drawing the numbers to ensure they are rendered on top-layer

test/data/doc/2408.09869v3_enriched.out.dt.json

Lines changed: 19 additions & 19 deletions
Large diffs are not rendered by default.

test/data/doc/doc_with_kv.dt.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

test/data/doc/page_with_pic.dt.json

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

test/data/doc/page_with_pic_from_files.dt.json

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.
-133 Bytes
Loading
-18 Bytes
Loading
46 Bytes
Loading
-75 Bytes
Loading
-128 Bytes
Loading

0 commit comments

Comments
 (0)