diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d7adf2496..706c7ef751 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,11 @@ -## 0.18.33-dev0 +## 0.18.33-dev1 ### Enhancements - **Add `group_elements_by_parent_id` utility function**: Groups elements by their `parent_id` metadata field for easier document hierarchy traversal (fixes #1489) +### Fixes +- **Preserve newlines in Table/TableChunk elements during PDF partitioning**: For Table and TableChunk elements, collapse only runs of non-newline whitespace (spaces, tabs) to a single space instead of collapsing all whitespace, so newlines that carry structural meaning (row separation) are preserved (fixes #3983) + ## 0.18.32 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index be19c0c8c8..8b7be26adc 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.33-dev0" # pragma: no cover +__version__ = "0.18.33-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index cafe261fd6..fa37200d83 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -34,6 +34,8 @@ Link, ListItem, PageBreak, + Table, + TableChunk, Text, Title, ) @@ -823,11 +825,16 @@ def _partition_pdf_or_image_local( out_elements.append(cast(Element, el)) # NOTE(crag): this is probably always a Text object, but check for the sake of typing elif isinstance(el, Text): - el.text = re.sub( - RE_MULTISPACE_INCLUDING_NEWLINES, - " ", - el.text or "", - ).strip() + if isinstance(el, (Table, TableChunk)): + # For Table/TableChunk, preserve newlines (they carry structural meaning) + # but still collapse multiple horizontal whitespace (spaces, tabs) to single space + el.text = re.sub(r"[^\S\n]+", " ", el.text or "").strip() + else: + el.text = re.sub( + RE_MULTISPACE_INCLUDING_NEWLINES, + " ", + el.text or "", + ).strip() if el.text or isinstance(el, PageBreak): out_elements.append(cast(Element, el))