From aa82531af2b6a162ef52cc62d455e9bbb5ea4697 Mon Sep 17 00:00:00 2001 From: JQQ Date: Wed, 9 Apr 2025 19:19:02 +0800 Subject: [PATCH 1/2] bug fix: - fix: https://github.com/Unstructured-IO/unstructured/issues/3983 --- unstructured/partition/pdf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index d38658ed64..a5fe0d279d 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -36,7 +36,7 @@ PageBreak, Text, Title, - process_metadata, + process_metadata, TableChunk, Table, ) from unstructured.errors import PageCountExceededError from unstructured.file_utils.filetype import add_metadata_with_filetype @@ -823,8 +823,9 @@ def _partition_pdf_or_image_local( if isinstance(el, Image): out_elements.append(cast(Element, el)) - # NOTE(crag): this is probably always a Text object, but check for the sake of typing - elif isinstance(el, Text): + # NOTE(crag): this is probably always a Text object, but check for the sake of typing. + # NOTE(JQQ): Escape RE for Table and TableChunk + elif isinstance(el, Text) and not isinstance(el, (Table, TableChunk)): el.text = re.sub( RE_MULTISPACE_INCLUDING_NEWLINES, " ", From 176ce32ec9a718078c7a5c7af8b2ace6c2491c20 Mon Sep 17 00:00:00 2001 From: JQQ Date: Wed, 9 Apr 2025 19:19:02 +0800 Subject: [PATCH 2/2] bug fix: - fix: https://github.com/Unstructured-IO/unstructured/issues/3983 --- CHANGELOG.md | 1 + unstructured/partition/pdf.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62ae488af3..40fa0639fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Two executions of the same code, on the same file, produce different results. The order of the elements is random. This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. 
- **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) +- **Whitespace normalization with `RE_MULTISPACE_INCLUDING_NEWLINES` was incorrectly applied to `Table` and `TableChunk` elements.** Newlines must not be removed from `Table` or `TableChunk` elements, so the `re.sub` call is now skipped for them. ## 0.17.5 diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index d38658ed64..a5fe0d279d 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -36,7 +36,7 @@ PageBreak, Text, Title, - process_metadata, + process_metadata, TableChunk, Table, ) from unstructured.errors import PageCountExceededError from unstructured.file_utils.filetype import add_metadata_with_filetype @@ -823,8 +823,9 @@ def _partition_pdf_or_image_local( if isinstance(el, Image): out_elements.append(cast(Element, el)) - # NOTE(crag): this is probably always a Text object, but check for the sake of typing - elif isinstance(el, Text): + # NOTE(crag): this is probably always a Text object, but check for the sake of typing. + # NOTE(JQQ): Escape RE for Table and TableChunk + elif isinstance(el, Text) and not isinstance(el, (Table, TableChunk)): el.text = re.sub( RE_MULTISPACE_INCLUDING_NEWLINES, " ",