def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
    r"""
    Align table cell bboxes with good matches to encapsulate matching pdf cells

    Every table cell referenced in `matches` is copied and its bbox is replaced
    by the bbox of the matching pdf cell. When the same table cell is matched by
    several pdf cells, its bbox becomes the union (min x1/y1, max x2/y2) of all
    of them, so the table cell encloses every pdf cell assigned to it.

    Parameters
    ----------
    table_cells : list of dict
        Each value is a dictionary with at least the keys "cell_id" and "bbox".
        Any other keys (e.g. "row_id", "column_id", "label", "cell_class",
        "colspan_val", "rowspan_val") are carried over unchanged.
    pdf_cells : list of dict
        PDF cells, each with the keys "id" and "bbox" ([x1, y1, x2, y2]).
    matches : dict of lists
        Indexed by the pdf cell id (anything `int()` accepts); each value is a
        list of dicts carrying a "table_cell_id" entry.

    Returns
    -------
    clean_table_cells : list of dict
        One aligned copy per matched table cell, in the order in which the
        table cells were first matched. Table cells without a match, matches
        that point to an unknown table cell and pdf cells that are unknown or
        have an empty bbox are skipped. The input structures are not modified.
    """
    pdf_bbox_by_id = {pdf_cell["id"]: pdf_cell["bbox"] for pdf_cell in pdf_cells}
    table_cell_by_id = {cell["cell_id"]: cell for cell in table_cells}

    # One aligned copy per table cell id. Dicts preserve insertion order, so
    # the result keeps the order in which table cells were first matched.
    aligned_by_id = {}

    for pdf_cell_id, match_list in matches.items():
        # A pdf cell may reference the same table cell more than once
        table_cell_ids = {int(match["table_cell_id"]) for match in match_list}

        pdf_cell_bbox = pdf_bbox_by_id.get(int(pdf_cell_id))
        if not pdf_cell_bbox:
            continue

        for cell_id in table_cell_ids:
            aligned_cell = aligned_by_id.get(cell_id)

            if aligned_cell is not None:
                # Already aligned to another pdf cell: grow the bbox so that
                # it encapsulates this pdf cell as well, instead of silently
                # keeping only the first match.
                current_bbox = aligned_cell["bbox"]
                aligned_cell["bbox"] = [
                    min(current_bbox[0], pdf_cell_bbox[0]),
                    min(current_bbox[1], pdf_cell_bbox[1]),
                    max(current_bbox[2], pdf_cell_bbox[2]),
                    max(current_bbox[3], pdf_cell_bbox[3]),
                ]
                continue

            table_cell = table_cell_by_id.get(cell_id)
            if not table_cell:
                continue

            # Shallow copy keeps optional keys (spans, label, ...) and leaves
            # the caller's table cell untouched.
            aligned_cell = table_cell.copy()
            aligned_cell["bbox"] = list(pdf_cell_bbox)
            # The cell has a pdf cell assigned to it, so it cannot be empty:
            # default to class "2" unless a class was already set.
            aligned_cell.setdefault("cell_class", "2")
            aligned_by_id[cell_id] = aligned_cell

    clean_table_cells = list(aligned_by_id.values())
    return clean_table_cells