Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
## 0.18.30-dev3
## 0.18.30-dev4

### Enhancement
- `is_text_embedded` now considers rotated text as low fidelity, and elements with a non-trivial amount of it are considered not embedded
- Replace `pdf2image` with PyPDFium2 for PDF rendering
- Add new chunking option `isolate_tables`, which defaults to `False`. When `True`, table elements are not chunked with any other elements.

### Fixes
- **Fix EN DASH not cleaned by `clean_bullets`**: Added EN DASH (`\u2013`) to `UNICODE_BULLETS` pattern so `clean_bullets` properly removes EN DASH bullet points without requiring `clean_dashes` (fixes #4105)
Expand Down
19 changes: 16 additions & 3 deletions test_unstructured/chunking/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import CompositeElement, Text, Title
from unstructured.partition.docx import partition_docx
from unstructured.partition.xlsx import partition_xlsx


def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_partition_function():
Expand Down Expand Up @@ -152,17 +153,29 @@ class Describe_chunk_elements:
],
)
def it_supports_the_include_orig_elements_option(
self, kwargs: dict[str, Any], expected_value: bool, _chunk_elements_: Mock
self, kwargs: dict[str, Any], expected_value: bool, mocked_chunk_elements_: Mock
):
# -- this line would raise if "include_orig_elements" was not an available parameter on
# -- `chunk_elements()`.
chunk_elements([], **kwargs)

_, opts = _chunk_elements_.call_args.args
_, opts = mocked_chunk_elements_.call_args.args
assert opts.include_orig_elements is expected_value

# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture()
def _chunk_elements_(self, request: FixtureRequest):
def mocked_chunk_elements_(self, request: FixtureRequest):
return function_mock(request, "unstructured.chunking.basic._chunk_elements")


def test_basic_chunk_isolates_tables():
    """Basic chunking merges tables by default but keeps them apart with `isolate_tables`."""
    partitioned = partition_xlsx("example-docs/stanley-cups.xlsx")
    table_positions = (1, 3)

    # -- precondition: elements 1 and 3 of the partitioned workbook are tables --
    for idx in table_positions:
        assert partitioned[idx].category == "Table"

    # -- default behavior: everything is combined into a single composite chunk --
    combined = chunk_elements(partitioned)
    assert len(combined) == 1
    assert isinstance(combined[0], CompositeElement)

    # -- with isolation enabled, chunks 1 and 3 are still tables rather than composites --
    isolated = chunk_elements(partitioned, isolate_tables=True)
    for idx in table_positions:
        assert isolated[idx].category == "Table"
16 changes: 13 additions & 3 deletions test_unstructured/chunking/test_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,16 @@ def test_it_chunks_text_followed_by_table_together_when_both_fit():
assert isinstance(chunks[0], CompositeElement)


def test_isolate_table_works():
    """`chunk_by_title` emits the table as its own chunk when `isolate_tables=True`."""
    fixture_path = input_path("chunking/title_table_200.json")
    elements = elements_from_json(fixture_path)

    chunks = chunk_by_title(elements, isolate_tables=True, combine_text_under_n_chars=0)

    # -- exactly two chunks: a composite chunk first, then the stand-alone table --
    assert len(chunks) == 2
    leading_chunk, trailing_chunk = chunks
    assert isinstance(leading_chunk, CompositeElement)
    assert isinstance(trailing_chunk, Table)


def test_it_chunks_table_followed_by_text_together_when_both_fit():
elements = elements_from_json(input_path("chunking/table_text_200.json"))

Expand Down Expand Up @@ -456,19 +466,19 @@ class Describe_chunk_by_title:
],
)
def it_supports_the_include_orig_elements_option(
self, kwargs: dict[str, Any], expected_value: bool, _chunk_by_title_: Mock
self, kwargs: dict[str, Any], expected_value: bool, mocked_chunk_by_title_: Mock
):
# -- this line would raise if "include_orig_elements" was not an available parameter on
# -- `chunk_by_title()`.
chunk_by_title([], **kwargs)

_, opts = _chunk_by_title_.call_args.args
_, opts = mocked_chunk_by_title_.call_args.args
assert opts.include_orig_elements is expected_value

# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture()
def _chunk_by_title_(self, request: FixtureRequest):
def mocked_chunk_by_title_(self, request: FixtureRequest):
return function_mock(request, "unstructured.chunking.title._chunk_by_title")


Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.18.30-dev3" # pragma: no cover
__version__ = "0.18.30-dev4" # pragma: no cover
17 changes: 17 additions & 0 deletions unstructured/chunking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,14 @@ def hard_max(self) -> int:
arg_value = self._kwargs.get("max_characters")
return arg_value if arg_value is not None else CHUNK_MAX_CHARS_DEFAULT

@lazyproperty
def isolate_tables(self) -> bool:
    """When True, a table is never combined with any other element in a chunk.

    Read from the `isolate_tables` chunking option; resolves to `False` when that option is
    omitted or explicitly `None`. Any other value is coerced with `bool()`.
    """
    requested = self._kwargs.get("isolate_tables")
    if requested is None:
        return False
    return bool(requested)

@lazyproperty
def include_orig_elements(self) -> bool:
"""When True, add original elements from pre-chunk to `.metadata.orig_elements` of chunk.
Expand Down Expand Up @@ -383,6 +391,11 @@ def will_fit(self, element: Element) -> bool:
# -- an empty pre-chunk will accept any element (including an oversized-element) --
if len(self._elements) == 0:
return True
# -- table will only fit into an empty pre-chunk if isolate tables--
if self._opts.isolate_tables and (
element.category == "Table" or self._elements[-1].category == "Table"
):
return False
# -- a pre-chunk that already exceeds the soft-max is considered "full" --
if self._text_length > self._opts.soft_max:
return False
Expand Down Expand Up @@ -448,6 +461,10 @@ def can_combine(self, pre_chunk: PreChunk) -> bool:
"""True when `pre_chunk` can be combined with this one without exceeding size limits."""
if len(self._text) >= self._opts.combine_text_under_n_chars:
return False
if self._opts.isolate_tables and (
self._elements[-1].category == "Table" or pre_chunk._elements[0].category == "Table"
):
return False
# -- avoid duplicating length computations by doing a trial-combine which is just as
# -- efficient and definitely more robust than hoping two different computations of combined
# -- length continue to get the same answer as the code evolves. Only possible because
Expand Down
2 changes: 2 additions & 0 deletions unstructured/chunking/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def chunk_elements(
new_after_n_chars: Optional[int] = None,
overlap: Optional[int] = None,
overlap_all: Optional[bool] = None,
isolate_tables: bool = False,
) -> list[Element]:
"""Combine sequential `elements` into chunks, respecting specified text-length limits.

Expand Down Expand Up @@ -71,6 +72,7 @@ def chunk_elements(
new_after_n_chars=new_after_n_chars,
overlap=overlap,
overlap_all=overlap_all,
isolate_tables=isolate_tables,
)

return _chunk_elements(elements, opts)
Expand Down
2 changes: 2 additions & 0 deletions unstructured/chunking/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def chunk_by_title(
new_after_n_chars: Optional[int] = None,
overlap: Optional[int] = None,
overlap_all: Optional[bool] = None,
isolate_tables: bool = False,
) -> list[Element]:
"""Uses title elements to identify sections within the document for chunking.

Expand Down Expand Up @@ -80,6 +81,7 @@ def chunk_by_title(
new_after_n_chars=new_after_n_chars,
overlap=overlap,
overlap_all=overlap_all,
isolate_tables=isolate_tables,
)
return _chunk_by_title(elements, opts)

Expand Down
Loading