diff --git a/CHANGELOG.md b/CHANGELOG.md index 62ae488af3..40fa0639fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Two executions of the same code, on the same file, produce different results. The order of the elements is random. This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) +- **`RE_MULTISPACE_INCLUDING_NEWLINES` was incorrectly applied to `Table` and `TableChunk` elements.** Newlines should not be removed from the text of `Table` or `TableChunk` elements, so the `re.sub` whitespace normalization is now skipped for these element types. ## 0.17.5 diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index d38658ed64..948cfa527e 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -34,6 +34,8 @@ Link, ListItem, PageBreak, + Table, + TableChunk, Text, Title, process_metadata, @@ -823,8 +825,9 @@ def _partition_pdf_or_image_local( if isinstance(el, Image): out_elements.append(cast(Element, el)) - # NOTE(crag): this is probably always a Text object, but check for the sake of typing - elif isinstance(el, Text): + # NOTE(crag): this is probably always a Text object, but check for the sake of typing. + # NOTE(JQQ): Escape RE for Table and TableChunk + elif isinstance(el, Text) and not isinstance(el, (Table, TableChunk)): el.text = re.sub( RE_MULTISPACE_INCLUDING_NEWLINES, " ",