Skip to content

fix: Table model - optimizing align_table_cells_to_pdf in matching_post_cessor #93

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -468,112 +468,77 @@ def find_overlapping_pairs_indexes(bboxes):
return table_cells

def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
r"""
USED in 8.a step
NOT USED in 6. step

Align table cell bboxes with good matches
to encapsulate matching pdf cells

Parameters
----------
table_cells : list of dict
Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
pdf_cells : list of dict
List of PDF cells as defined by Docling
matches : dictionary of lists of table_cells
A dictionary which is indexed by the pdf_cell_id as key and the value is a list
of the table_cells that fall inside that pdf cell

Returns
-------
clean_table_cells : list of dict
Aligned and cleaned table cells
"""
# 6
# align table cells with matching pdf cells
new_table_cells = []

for pdf_cell_id in matches:
match_list = matches[pdf_cell_id]
one_table_cells = []
for i in range(len(match_list)):
otc = int(match_list[i]["table_cell_id"])
if otc not in one_table_cells:
one_table_cells.append(otc)

# Get bbox of pdf_cell:
pdf_cell_bbox = []
for pdf_cell in pdf_cells:
if pdf_cell["id"] == int(pdf_cell_id):
pdf_cell_bbox = pdf_cell["bbox"]

# Get bbox of pdf_cell:
for table_cell in table_cells:
if table_cell["cell_id"] in one_table_cells:
# Align bbox vertically to cover PDF cell
new_bbox = [
pdf_cell_bbox[0],
pdf_cell_bbox[1],
pdf_cell_bbox[2],
pdf_cell_bbox[3],
]
# We are sure cell is not empty,
# because we assign PDF cell to it
new_table_cell_class = "2"

if "cell_class" in table_cell:
new_table_cell_class = table_cell["cell_class"]

new_table_cell = {
"bbox": new_bbox,
"cell_id": table_cell["cell_id"],
"column_id": table_cell["column_id"],
"label": table_cell["label"],
"row_id": table_cell["row_id"],
"cell_class": new_table_cell_class,
}
Align table cell bboxes with good matches to encapsulate matching pdf cells
"""
pdf_cell_dict = {pdf_cell["id"]: pdf_cell["bbox"] for pdf_cell in pdf_cells}
table_cell_dict = {cell["cell_id"]: cell for cell in table_cells}

if "colspan_val" in table_cell:
new_table_cell["colspan_val"] = table_cell["colspan_val"]
if "rowspan_val" in table_cell:
new_table_cell["rowspan_val"] = table_cell["rowspan_val"]
new_table_cells.append(new_table_cell)
# Track unique cells we're going to add
processed_cells = set()

# Rebuild table_cells list deduplicating repeating cells,
# encapsulating all duplicate cells dimensions
# First pass - create initial new_table_cells with aligned bboxes
new_table_cells = []

for new_table_cell in new_table_cells:
cell_id_to_find = new_table_cell["cell_id"]
for pdf_cell_id, match_list in matches.items():
# Extract unique table cell ids from match_list
table_cell_ids = set(int(match["table_cell_id"]) for match in match_list)

x1s = []
y1s = []
x2s = []
y2s = []
# Get bbox of pdf_cell
pdf_cell_bbox = pdf_cell_dict.get(int(pdf_cell_id))
if not pdf_cell_bbox:
continue

found = 0
# Process each unique table cell
for cell_id in table_cell_ids:
if cell_id in processed_cells:
continue

table_cell = table_cell_dict.get(cell_id)
if not table_cell:
continue

# Create new table cell with aligned bbox
new_table_cell = table_cell.copy()
new_table_cell["bbox"] = list(pdf_cell_bbox)

# Set cell class
if "cell_class" not in new_table_cell:
new_table_cell["cell_class"] = "2"

new_table_cells.append(new_table_cell)
processed_cells.add(cell_id)

# Second pass - aggregate bboxes for duplicate cells
cell_to_bboxes = {}
for cell in new_table_cells:
cell_id = cell["cell_id"]
if cell_id not in cell_to_bboxes:
cell_to_bboxes[cell_id] = []
cell_to_bboxes[cell_id].append(cell["bbox"])

# Create final clean table cells
clean_table_cells = []
processed_ids = set()

for cell in new_table_cells:
cell_id = cell["cell_id"]
if cell_id in processed_ids:
continue

for found_cell in new_table_cells:
if found_cell["cell_id"] == cell_id_to_find:
found += 1
x1s.append(found_cell["bbox"][0])
y1s.append(found_cell["bbox"][1])
x2s.append(found_cell["bbox"][2])
y2s.append(found_cell["bbox"][3])
bboxes = cell_to_bboxes[cell_id]
if len(bboxes) > 1:
# Merge bboxes
x1s = [bbox[0] for bbox in bboxes]
y1s = [bbox[1] for bbox in bboxes]
x2s = [bbox[2] for bbox in bboxes]
y2s = [bbox[3] for bbox in bboxes]

min_x1 = min(x1s)
min_y1 = min(y1s)
max_x2 = max(x2s)
max_y2 = max(y2s)
cell["bbox"] = [min(x1s), min(y1s), max(x2s), max(y2s)]

if found > 1:
new_table_cell["bbox"] = [min_x1, min_y1, max_x2, max_y2]
clean_table_cells.append(cell)
processed_ids.add(cell_id)

clean_table_cells = [
i
for n, i in enumerate(new_table_cells)
if i not in new_table_cells[n + 1 :]
]
return clean_table_cells

def _deduplicate_cells(self, tab_columns, table_cells, iou_matches, ioc_matches):
Expand Down