Skip to content

Commit 13ebdb2

Browse files
author
Maksym Lysak
committed
Cleaned up bbox helper functions, reusing more of the existing code
Signed-off-by: Maksym Lysak <[email protected]>
1 parent d506650 commit 13ebdb2

File tree

1 file changed

+18
-54
lines changed

1 file changed

+18
-54
lines changed

docling_core/types/doc/regions.py

Lines changed: 18 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -11,55 +11,19 @@
1111
from docling_core.types.doc.document import TableCell, TableData
1212

1313

14-
def bbox_fraction_inside(
15-
inner: BoundingBox, outer: BoundingBox, *, eps: float = 1.0e-9
16-
) -> float:
17-
"""Return the fraction of ``inner`` area that lies inside ``outer``."""
18-
area = inner.area()
19-
if area <= eps:
20-
return 0.0
21-
intersection = inner.intersection_area_with(outer)
22-
return intersection / max(area, eps)
23-
24-
25-
def bbox_contains(
26-
inner: BoundingBox, outer: BoundingBox, *, threshold: float, eps: float = 1.0e-9
27-
) -> bool:
28-
"""Return ``True`` when ``inner`` is contained in ``outer`` above ``threshold``."""
29-
return bbox_fraction_inside(inner, outer, eps=eps) >= threshold
30-
31-
32-
def bbox_iou(a: BoundingBox, b: BoundingBox, *, eps: float = 1.0e-6) -> float:
33-
"""Return the intersection over union between two bounding boxes."""
34-
return a.intersection_over_union(b, eps=eps)
35-
36-
37-
class HasBoundingBox(Protocol):
38-
"""Protocol for objects exposing a bounding box."""
39-
40-
bbox: BoundingBox
41-
42-
43-
def dedupe_bboxes(
14+
def _dedupe_bboxes(
4415
elements: Sequence[BoundingBox],
4516
*,
4617
iou_threshold: float = 0.9,
4718
) -> list[BoundingBox]:
4819
"""Return elements whose bounding boxes are unique within ``iou_threshold``."""
4920
deduped: list[BoundingBox] = []
5021
for element in elements:
51-
if all(bbox_iou(element, kept) < iou_threshold for kept in deduped):
22+
if all(element.intersection_over_union(kept) < iou_threshold for kept in deduped):
5223
deduped.append(element)
5324
return deduped
5425

5526

56-
def is_bbox_within(
57-
bbox_a: BoundingBox, bbox_b: BoundingBox, threshold: float = 0.5
58-
) -> bool:
59-
"""Return ``True`` when ``bbox_b`` lies within ``bbox_a`` above ``threshold``."""
60-
return bbox_contains(bbox_b, bbox_a, threshold=threshold)
61-
62-
6327
def _process_table_headers(
6428
bbox: BoundingBox,
6529
row_headers: List[BoundingBox] = [],
@@ -71,13 +35,13 @@ def _process_table_headers(
7135
c_row_section = False
7236

7337
for col_header in col_headers:
74-
if is_bbox_within(col_header, bbox):
38+
if bbox.intersection_over_self(col_header)>=0.5:
7539
c_column_header = True
7640
for row_header in row_headers:
77-
if is_bbox_within(row_header, bbox):
41+
if bbox.intersection_over_self(row_header)>=0.5:
7842
c_row_header = True
7943
for row_section in row_sections:
80-
if is_bbox_within(row_section, bbox):
44+
if bbox.intersection_over_self(row_section)>=0.5:
8145
c_row_section = True
8246
return c_column_header, c_row_header, c_row_section
8347

@@ -255,49 +219,49 @@ def regions_to_table(
255219
256220
Adds semantics for regions of row_headers, col_headers, row_section
257221
"""
258-
default_containment_thresh = 0.50
222+
default_containment_thresh = 0.5
259223
rows.extend(row_sections) # use row sections to compensate for missing rows
260-
rows = dedupe_bboxes(
224+
rows = _dedupe_bboxes(
261225
[
262226
e
263227
for e in rows
264-
if bbox_contains(e, table_bbox, threshold=default_containment_thresh)
228+
if e.intersection_over_self(table_bbox)>=default_containment_thresh
265229
]
266230
)
267-
cols = dedupe_bboxes(
231+
cols = _dedupe_bboxes(
268232
[
269233
e
270234
for e in cols
271-
if bbox_contains(e, table_bbox, threshold=default_containment_thresh)
235+
if e.intersection_over_self(table_bbox)>=default_containment_thresh
272236
]
273237
)
274-
merges = dedupe_bboxes(
238+
merges = _dedupe_bboxes(
275239
[
276240
e
277241
for e in merges
278-
if bbox_contains(e, table_bbox, threshold=default_containment_thresh)
242+
if e.intersection_over_self(table_bbox)>=default_containment_thresh
279243
]
280244
)
281245

282-
col_headers = dedupe_bboxes(
246+
col_headers = _dedupe_bboxes(
283247
[
284248
e
285249
for e in col_headers
286-
if bbox_contains(e, table_bbox, threshold=default_containment_thresh)
250+
if e.intersection_over_self(table_bbox)>=default_containment_thresh
287251
]
288252
)
289-
row_headers = dedupe_bboxes(
253+
row_headers = _dedupe_bboxes(
290254
[
291255
e
292256
for e in row_headers
293-
if bbox_contains(e, table_bbox, threshold=default_containment_thresh)
257+
if e.intersection_over_self(table_bbox)>=default_containment_thresh
294258
]
295259
)
296-
row_sections = dedupe_bboxes(
260+
row_sections = _dedupe_bboxes(
297261
[
298262
e
299263
for e in row_sections
300-
if bbox_contains(e, table_bbox, threshold=default_containment_thresh)
264+
if e.intersection_over_self(table_bbox)>=default_containment_thresh
301265
]
302266
)
303267

0 commit comments

Comments
 (0)