Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions test_unstructured/partition/test_xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,42 @@ def test_partition_xlsx_from_file_with_header():
assert e.metadata.text_as_html is not None


def test_partition_xlsx_no_future_warning_for_bytes():
"""Test that partition_xlsx doesn't raise FutureWarning when passing bytes to read_excel.

This test ensures that bytes are properly wrapped in BytesIO before being passed to
pd.read_excel(), preventing the deprecation warning.
"""
import warnings

with open("example-docs/stanley-cups.xlsx", "rb") as f:
file_bytes = f.read()

# Create a BytesIO object from bytes to simulate the scenario
file_like = io.BytesIO(file_bytes)
Comment on lines +180 to +184
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This points to a good solution to issue #4036: at the `pd.read_excel` call inside the XLSX partition function, we should open the file and wrap its contents in `BytesIO`.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I would suggest that this PR actually implement that solution instead of adding this test.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I would suggest this PR to actually implement that solution instead of adding this test
So, can you let me know what I have to do?


# Capture warnings
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
elements = partition_xlsx(file=file_like, include_header=False)

# Filter for FutureWarning related to read_excel
future_warnings = [
warning
for warning in w
if issubclass(warning.category, FutureWarning)
and "read_excel" in str(warning.message).lower()
and "bytes" in str(warning.message).lower()
]

# Assert no FutureWarning was raised
assert len(future_warnings) == 0, f"FutureWarning raised: {[str(w.message) for w in future_warnings]}"

# Verify the function still works correctly
assert len(elements) > 0
assert sum(isinstance(element, Table) for element in elements) == 2


def test_partition_xlsx_password_protected_raises_exception():
    """Expect `UnprocessableEntityError` when partitioning the password-protected example."""
    encrypted_path = "example-docs/password_protected.xlsx"
    # The call itself must raise; no elements are expected back.
    with pytest.raises(UnprocessableEntityError):
        partition_xlsx(filename=encrypted_path)
Expand Down
2 changes: 2 additions & 0 deletions unstructured/partition/xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ def sheets(self) -> dict[str, pd.DataFrame]:
if office_file.is_encrypted():
raise UnprocessableEntityError("XLSX file is password protected.")

# Wrap bytes in BytesIO to avoid FutureWarning from pd.read_excel
# See: https://github.com/Unstructured-IO/unstructured/issues/4036
return pd.read_excel(
io.BytesIO(self._file_bytes), sheet_name=None, header=self.header_row_idx
)
Expand Down