diff --git a/CHANGELOG.md b/CHANGELOG.md index 260842ee5f..97f88d7d8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ -## 0.18.30-dev3 +## 0.18.30-dev4 ### Enhancement - `is_text_embedded` now considers rotated text as low fidelity and and elements with no trivial amount of it are considered not embedded - Replace `pdf2image` with PyPDFium2 for PDF rendering +- Add new chunking option `isolate_tables`, which defaults to `False`. When `True`, table elements are never combined into a chunk with any other elements. ### Fixes - **Fix EN DASH not cleaned by `clean_bullets`**: Added EN DASH (`\u2013`) to `UNICODE_BULLETS` pattern so `clean_bullets` properly removes EN DASH bullet points without requiring `clean_dashes` (fixes #4105) diff --git a/test_unstructured/chunking/test_basic.py b/test_unstructured/chunking/test_basic.py index 88e01563fe..be31cb32e1 100644 --- a/test_unstructured/chunking/test_basic.py +++ b/test_unstructured/chunking/test_basic.py @@ -14,6 +14,7 @@ from unstructured.chunking.basic import chunk_elements from unstructured.documents.elements import CompositeElement, Text, Title from unstructured.partition.docx import partition_docx +from unstructured.partition.xlsx import partition_xlsx def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_partition_function(): @@ -152,17 +153,29 @@ class Describe_chunk_elements: ], ) def it_supports_the_include_orig_elements_option( - self, kwargs: dict[str, Any], expected_value: bool, _chunk_elements_: Mock + self, kwargs: dict[str, Any], expected_value: bool, mocked_chunk_elements_: Mock ): # -- this line would raise if "include_orig_elements" was not an available parameter on # -- `chunk_elements()`. 
chunk_elements([], **kwargs) - _, opts = _chunk_elements_.call_args.args + _, opts = mocked_chunk_elements_.call_args.args assert opts.include_orig_elements is expected_value # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture() - def _chunk_elements_(self, request: FixtureRequest): + def mocked_chunk_elements_(self, request: FixtureRequest): return function_mock(request, "unstructured.chunking.basic._chunk_elements") + + +def test_basic_chunk_isolates_tables(): + elements = partition_xlsx("example-docs/stanley-cups.xlsx") + assert elements[1].category == "Table" + assert elements[3].category == "Table" + chunks = chunk_elements(elements) + assert len(chunks) == 1 + assert isinstance(chunks[0], CompositeElement) + chunks = chunk_elements(elements, isolate_tables=True) + assert chunks[1].category == "Table" + assert chunks[3].category == "Table" diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 443b073755..c8ea000afd 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -44,6 +44,16 @@ def test_it_chunks_text_followed_by_table_together_when_both_fit(): assert isinstance(chunks[0], CompositeElement) +def test_isolate_table_works(): + elements = elements_from_json(input_path("chunking/title_table_200.json")) + + chunks = chunk_by_title(elements, combine_text_under_n_chars=0, isolate_tables=True) + + assert len(chunks) == 2 + assert isinstance(chunks[0], CompositeElement) + assert isinstance(chunks[1], Table) + + def test_it_chunks_table_followed_by_text_together_when_both_fit(): elements = elements_from_json(input_path("chunking/table_text_200.json")) @@ -456,19 +466,19 @@ class Describe_chunk_by_title: ], ) def it_supports_the_include_orig_elements_option( - self, kwargs: dict[str, Any], expected_value: bool, _chunk_by_title_: Mock + self, kwargs: dict[str, Any], expected_value: bool, mocked_chunk_by_title_: 
Mock ): # -- this line would raise if "include_orig_elements" was not an available parameter on # -- `chunk_by_title()`. chunk_by_title([], **kwargs) - _, opts = _chunk_by_title_.call_args.args + _, opts = mocked_chunk_by_title_.call_args.args assert opts.include_orig_elements is expected_value # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture() - def _chunk_by_title_(self, request: FixtureRequest): + def mocked_chunk_by_title_(self, request: FixtureRequest): return function_mock(request, "unstructured.chunking.title._chunk_by_title") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 86dd67ac48..07ffca09ea 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.30-dev3" # pragma: no cover +__version__ = "0.18.30-dev4" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index a54e66d63f..823a245df9 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -133,6 +133,14 @@ def hard_max(self) -> int: arg_value = self._kwargs.get("max_characters") return arg_value if arg_value is not None else CHUNK_MAX_CHARS_DEFAULT + @lazyproperty + def isolate_tables(self) -> bool: + """When True, tables are never combined with any other elements. + Defaults to False. + """ + arg_value = self._kwargs.get("isolate_tables") + return False if arg_value is None else bool(arg_value) + @lazyproperty def include_orig_elements(self) -> bool: """When True, add original elements from pre-chunk to `.metadata.orig_elements` of chunk. 
@@ -383,6 +391,11 @@ def will_fit(self, element: Element) -> bool: # -- an empty pre-chunk will accept any element (including an oversized-element) -- if len(self._elements) == 0: return True + # -- with `isolate_tables`, a table never shares a pre-chunk with another element -- + if self._opts.isolate_tables and ( + element.category == "Table" or self._elements[-1].category == "Table" + ): + return False # -- a pre-chunk that already exceeds the soft-max is considered "full" -- if self._text_length > self._opts.soft_max: return False @@ -448,6 +461,10 @@ def can_combine(self, pre_chunk: PreChunk) -> bool: """True when `pre_chunk` can be combined with this one without exceeding size limits.""" if len(self._text) >= self._opts.combine_text_under_n_chars: return False + if self._opts.isolate_tables and ( + self._elements[-1].category == "Table" or pre_chunk._elements[0].category == "Table" + ): + return False # -- avoid duplicating length computations by doing a trial-combine which is just as # -- efficient and definitely more robust than hoping two different computations of combined # -- length continue to get the same answer as the code evolves. Only possible because diff --git a/unstructured/chunking/basic.py b/unstructured/chunking/basic.py index eeaaf52614..f9d67fd76f 100644 --- a/unstructured/chunking/basic.py +++ b/unstructured/chunking/basic.py @@ -29,6 +29,7 @@ def chunk_elements( new_after_n_chars: Optional[int] = None, overlap: Optional[int] = None, overlap_all: Optional[bool] = None, + isolate_tables: bool = False, ) -> list[Element]: """Combine sequential `elements` into chunks, respecting specified text-length limits. 
@@ -71,6 +72,7 @@ def chunk_elements( new_after_n_chars=new_after_n_chars, overlap=overlap, overlap_all=overlap_all, + isolate_tables=isolate_tables, ) return _chunk_elements(elements, opts) diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index 53b50b5655..5d9f45767e 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -30,6 +30,7 @@ def chunk_by_title( new_after_n_chars: Optional[int] = None, overlap: Optional[int] = None, overlap_all: Optional[bool] = None, + isolate_tables: bool = False, ) -> list[Element]: """Uses title elements to identify sections within the document for chunking. @@ -80,6 +81,7 @@ def chunk_by_title( new_after_n_chars=new_after_n_chars, overlap=overlap, overlap_all=overlap_all, + isolate_tables=isolate_tables, ) return _chunk_by_title(elements, opts)