diff --git a/CHANGELOG.md b/CHANGELOG.md index e3d615d1de..ef4236a8ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.18.32-dev0 + +### Enhancements +- Optimized `merge_out_layout_with_ocr_layout` (codeflash) + ## 0.18.31 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6f7befa14b..cae707a4c3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.31" # pragma: no cover +__version__ = "0.18.32-dev0" # pragma: no cover diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 95975040bd..a266fd87cc 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -471,13 +471,9 @@ def supplement_layout_with_ocr_elements( else: ocr_regions_to_add = ocr_layout else: - mask = ( - ~bboxes1_is_almost_subregion_of_bboxes2( - ocr_layout.element_coords, layout.element_coords, subregion_threshold - ) - .sum(axis=1) - .astype(bool) - ) + mask = ~bboxes1_is_almost_subregion_of_bboxes2( + ocr_layout.element_coords, layout.element_coords, subregion_threshold + ).any(axis=1) # add ocr regions that are not covered by layout ocr_regions_to_add = ocr_layout.slice(mask) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 991d5c5d6f..24d2dc29aa 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -812,31 +812,28 @@ def aggregate_embedded_text_by_block( if len(source_regions) == 0 or len(target_region) == 0: return "", None - mask = ( - bboxes1_is_almost_subregion_of_bboxes2( - source_regions.element_coords, - target_region.element_coords, - subregion_threshold, - ) - .sum(axis=1) - .astype(bool) - ) - - text = " ".join([text for text in source_regions.slice(mask).texts if text]) - - if sum(mask): - source_bboxes = source_regions.slice(mask).element_coords + mask = bboxes1_is_almost_subregion_of_bboxes2( + source_regions.element_coords, + target_region.element_coords, + subregion_threshold, + ).any(axis=1) + + if mask.any(): + sliced = source_regions.slice(mask) + text = " ".join([text for text in sliced.texts if text]) + source_bboxes = sliced.element_coords target_bboxes = target_region.element_coords iou = _aggregated_iou(source_bboxes, target_bboxes[0, :]) fully_filled = ( - all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array) + all(flag == IsExtracted.TRUE for flag in sliced.is_extracted_array) and iou > text_coverage_threshold ) is_extracted = IsExtracted.TRUE if fully_filled else IsExtracted.PARTIAL else: # if nothing is sliced then it is not extracted + text = "" is_extracted = IsExtracted.FALSE return text, is_extracted