1111from docling_core .types .doc .document import TableCell , TableData
1212
1313
14- def bbox_fraction_inside (
15- inner : BoundingBox , outer : BoundingBox , * , eps : float = 1.0e-9
16- ) -> float :
17- """Return the fraction of ``inner`` area that lies inside ``outer``."""
18- area = inner .area ()
19- if area <= eps :
20- return 0.0
21- intersection = inner .intersection_area_with (outer )
22- return intersection / max (area , eps )
23-
24-
25- def bbox_contains (
26- inner : BoundingBox , outer : BoundingBox , * , threshold : float , eps : float = 1.0e-9
27- ) -> bool :
28- """Return ``True`` when ``inner`` is contained in ``outer`` above ``threshold``."""
29- return bbox_fraction_inside (inner , outer , eps = eps ) >= threshold
30-
31-
32- def bbox_iou (a : BoundingBox , b : BoundingBox , * , eps : float = 1.0e-6 ) -> float :
33- """Return the intersection over union between two bounding boxes."""
34- return a .intersection_over_union (b , eps = eps )
35-
36-
37- class HasBoundingBox (Protocol ):
38- """Protocol for objects exposing a bounding box."""
39-
40- bbox : BoundingBox
41-
42-
43- def dedupe_bboxes (
14+ def _dedupe_bboxes (
4415 elements : Sequence [BoundingBox ],
4516 * ,
4617 iou_threshold : float = 0.9 ,
4718) -> list [BoundingBox ]:
4819 """Return elements whose bounding boxes are unique within ``iou_threshold``."""
4920 deduped : list [BoundingBox ] = []
5021 for element in elements :
51- if all (bbox_iou ( element , kept ) < iou_threshold for kept in deduped ):
22+ if all (element . intersection_over_union ( kept ) < iou_threshold for kept in deduped ):
5223 deduped .append (element )
5324 return deduped
5425
5526
56- def is_bbox_within (
57- bbox_a : BoundingBox , bbox_b : BoundingBox , threshold : float = 0.5
58- ) -> bool :
59- """Return ``True`` when ``bbox_b`` lies within ``bbox_a`` above ``threshold``."""
60- return bbox_contains (bbox_b , bbox_a , threshold = threshold )
61-
62-
6327def _process_table_headers (
6428 bbox : BoundingBox ,
6529 row_headers : List [BoundingBox ] = [],
@@ -71,13 +35,13 @@ def _process_table_headers(
7135 c_row_section = False
7236
7337 for col_header in col_headers :
74- if is_bbox_within (col_header , bbox ) :
38+ if bbox . intersection_over_self (col_header ) >= 0.5 :
7539 c_column_header = True
7640 for row_header in row_headers :
77- if is_bbox_within (row_header , bbox ) :
41+ if bbox . intersection_over_self (row_header ) >= 0.5 :
7842 c_row_header = True
7943 for row_section in row_sections :
80- if is_bbox_within (row_section , bbox ) :
44+ if bbox . intersection_over_self (row_section ) >= 0.5 :
8145 c_row_section = True
8246 return c_column_header , c_row_header , c_row_section
8347
@@ -255,49 +219,49 @@ def regions_to_table(
255219
256220 Adds semantics for regions of row_headers, col_headers, row_section
257221 """
258- default_containment_thresh = 0.50
222+ default_containment_thresh = 0.5
259223 rows .extend (row_sections ) # use row sections to compensate for missing rows
260- rows = dedupe_bboxes (
224+ rows = _dedupe_bboxes (
261225 [
262226 e
263227 for e in rows
264- if bbox_contains ( e , table_bbox , threshold = default_containment_thresh )
228+ if e . intersection_over_self ( table_bbox ) >= default_containment_thresh
265229 ]
266230 )
267- cols = dedupe_bboxes (
231+ cols = _dedupe_bboxes (
268232 [
269233 e
270234 for e in cols
271- if bbox_contains ( e , table_bbox , threshold = default_containment_thresh )
235+ if e . intersection_over_self ( table_bbox ) >= default_containment_thresh
272236 ]
273237 )
274- merges = dedupe_bboxes (
238+ merges = _dedupe_bboxes (
275239 [
276240 e
277241 for e in merges
278- if bbox_contains ( e , table_bbox , threshold = default_containment_thresh )
242+ if e . intersection_over_self ( table_bbox ) >= default_containment_thresh
279243 ]
280244 )
281245
282- col_headers = dedupe_bboxes (
246+ col_headers = _dedupe_bboxes (
283247 [
284248 e
285249 for e in col_headers
286- if bbox_contains ( e , table_bbox , threshold = default_containment_thresh )
250+ if e . intersection_over_self ( table_bbox ) >= default_containment_thresh
287251 ]
288252 )
289- row_headers = dedupe_bboxes (
253+ row_headers = _dedupe_bboxes (
290254 [
291255 e
292256 for e in row_headers
293- if bbox_contains ( e , table_bbox , threshold = default_containment_thresh )
257+ if e . intersection_over_self ( table_bbox ) >= default_containment_thresh
294258 ]
295259 )
296- row_sections = dedupe_bboxes (
260+ row_sections = _dedupe_bboxes (
297261 [
298262 e
299263 for e in row_sections
300- if bbox_contains ( e , table_bbox , threshold = default_containment_thresh )
264+ if e . intersection_over_self ( table_bbox ) >= default_containment_thresh
301265 ]
302266 )
303267
0 commit comments