Skip to content

Commit 5f5da65

Browse files
Fix/handle-spooled-temp-file-eml (#800)
This PR makes the unstructured-api smoke tests pass.
1 parent 901ef16 commit 5f5da65

File tree

5 files changed

+36
-17
lines changed

5 files changed

+36
-17
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.7.8-dev1
1+
## 0.7.8
22

33
### Enhancements
44

@@ -8,6 +8,7 @@
88

99
### Fixes
1010

11+
* Updates the `parse_email` for `partition_eml` so that `unstructured-api` passes the smoke tests
1112
* `partition_email` now works if there is no message content
1213
* Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively
1314
* Adds recursive functionality to all fsspec connectors

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.8-dev1" # pragma: no cover
1+
__version__ = "0.7.8" # pragma: no cover

Diff for: unstructured/file_utils/encoding.py

+3-8
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import chardet
44

5+
from unstructured.partition.common import convert_to_bytes
6+
57
ENCODE_REC_THRESHOLD = 0.5
68

79
# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
@@ -46,14 +48,7 @@ def detect_file_encoding(
4648
with open(filename, "rb") as f:
4749
byte_data = f.read()
4850
elif file:
49-
if isinstance(file, bytes):
50-
byte_data = file
51-
else:
52-
if not hasattr(file, "mode") or "b" in file.mode:
53-
byte_data = file.read()
54-
else:
55-
with open(file.name, "rb") as f:
56-
byte_data = f.read()
51+
byte_data = convert_to_bytes(file)
5752
else:
5853
raise FileNotFoundError("No filename nor file were specified")
5954

Diff for: unstructured/partition/common.py

+21-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from __future__ import annotations
22

33
import subprocess
4-
from io import BytesIO
4+
from io import BufferedReader, BytesIO, TextIOWrapper
55
from tempfile import SpooledTemporaryFile
6-
from typing import TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
6+
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
77

88
from docx import table as docxtable
99
from tabulate import tabulate
@@ -184,6 +184,25 @@ def spooled_to_bytes_io_if_needed(
184184
return file_obj
185185

186186

187+
def convert_to_bytes(
188+
file: Optional[Union[bytes, SpooledTemporaryFile, IO]] = None,
189+
) -> bytes:
190+
if isinstance(file, bytes):
191+
f_bytes = file
192+
elif isinstance(file, SpooledTemporaryFile):
193+
file.seek(0)
194+
f_bytes = file.read()
195+
elif isinstance(file, BytesIO):
196+
f_bytes = file.getvalue()
197+
elif isinstance(file, (TextIOWrapper, BufferedReader)):
198+
with open(file.name, "rb") as f:
199+
f_bytes = f.read()
200+
else:
201+
raise ValueError("Invalid file-like object type")
202+
203+
return f_bytes
204+
205+
187206
def convert_ms_office_table_to_text(table: docxtable.Table, as_html: bool = True):
188207
"""
189208
Convert a table object from a Word document to an HTML table string using the tabulate library.

Diff for: unstructured/partition/email.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,18 @@
44
import sys
55
from email.message import Message
66
from functools import partial
7+
from tempfile import SpooledTemporaryFile
78
from typing import IO, Dict, List, Optional, Tuple, Union
89

910
from unstructured.file_utils.encoding import (
1011
COMMON_ENCODINGS,
1112
format_encoding_str,
1213
read_txt_file,
1314
)
14-
from unstructured.partition.common import exactly_one
15+
from unstructured.partition.common import (
16+
convert_to_bytes,
17+
exactly_one,
18+
)
1519

1620
if sys.version_info < (3, 8):
1721
from typing_extensions import Final
@@ -189,14 +193,14 @@ def find_embedded_image(
189193

190194
def parse_email(
191195
filename: Optional[str] = None,
192-
file: Optional[IO] = None,
196+
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
193197
) -> Tuple[Optional[str], Message]:
194198
if filename is not None:
195199
with open(filename, "rb") as f:
196200
msg = email.message_from_binary_file(f)
197201
elif file is not None:
198-
with open(file.name, "rb") as f:
199-
msg = email.message_from_binary_file(f)
202+
f_bytes = convert_to_bytes(file)
203+
msg = email.message_from_bytes(f_bytes)
200204
else:
201205
raise ValueError("Either 'filename' or 'file' must be provided.")
202206

@@ -216,7 +220,7 @@ def parse_email(
216220
@add_metadata_with_filetype(FileType.EML)
217221
def partition_email(
218222
filename: Optional[str] = None,
219-
file: Optional[IO] = None,
223+
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
220224
text: Optional[str] = None,
221225
content_source: str = "text/html",
222226
encoding: Optional[str] = None,

0 commit comments

Comments
 (0)