Skip to content

Commit b52638f

Browse files
chore: add support for SpooledTemporaryFiles (#569)
1 parent 19beb24 commit b52638f

File tree

11 files changed

+98
-25
lines changed

11 files changed

+98
-25
lines changed

Diff for: CHANGELOG.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
## 0.6.5-dev0
1+
## 0.6.5
22

33
### Enhancements
44

5-
* PLACEHOLDER - delete this line when there is an actual changelog item to report for 0.6.5
5+
* Added support for passing a `SpooledTemporaryFile` as the `file` argument to `partition_docx`, `partition_pdf`, and `partition_pptx`.
66

77
### Features
88

Diff for: test_unstructured/partition/test_docx.py

+15
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,21 @@ def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir):
6262
assert elements == expected_elements
6363

6464

65+
def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpdir):
    """Check that partition_docx accepts a SpooledTemporaryFile as its file argument."""
    # Kept as a function-local import, as in the original test.
    from tempfile import SpooledTemporaryFile

    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_document.save(filename)

    with open(filename, "rb") as test_file:
        # The context manager closes the spooled file even if partitioning raises,
        # so the test no longer leaks the temporary buffer.
        with SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(test_file.read())
            spooled_temp_file.seek(0)
            elements = partition_docx(file=spooled_temp_file)
    assert elements == expected_elements
78+
79+
6580
def test_partition_docx_with_file(mock_document, expected_elements, tmpdir):
6681
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
6782
mock_document.save(filename)

Diff for: test_unstructured/partition/test_pdf.py

+19
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
from tempfile import SpooledTemporaryFile
23
from unittest import mock
34

45
import pytest
@@ -172,6 +173,24 @@ def test_partition_pdf(url, api_called, local_called, monkeypatch):
172173
assert pdf._partition_pdf_or_image_local.called == local_called
173174

174175

176+
@pytest.mark.parametrize(
    ("strategy"),
    [("fast"), ("hi_res"), ("ocr_only")],
)
def test_partition_pdf_with_spooled_file(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Check that partition_pdf accepts a SpooledTemporaryFile for each strategy."""
    with open(filename, "rb") as test_file:
        # The context manager closes the spooled file even if partitioning raises,
        # so the test no longer leaks the temporary buffer.
        with SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(test_file.read())
            spooled_temp_file.seek(0)
            result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
    # Sanity check: the sample document should yield more than 10 items.
    assert len(result) > 10
192+
193+
175194
@pytest.mark.parametrize(
176195
("url", "api_called", "local_called"),
177196
[("fakeurl", True, False), (None, False, True)],

Diff for: test_unstructured/partition/test_pptx.py

+13
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,19 @@ def test_partition_pptx_from_filename():
3232
assert elements == EXPECTED_PPTX_OUTPUT
3333

3434

35+
def test_partition_pptx_with_spooled_file():
    """Check that partition_pptx accepts a SpooledTemporaryFile as its file argument."""
    # Kept as a function-local import, as in the original test.
    from tempfile import SpooledTemporaryFile

    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")

    with open(filename, "rb") as test_file:
        # The context manager closes the spooled file even if partitioning raises,
        # so the test no longer leaks the temporary buffer.
        with SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(test_file.read())
            spooled_temp_file.seek(0)
            elements = partition_pptx(file=spooled_temp_file)
    assert elements == EXPECTED_PPTX_OUTPUT
46+
47+
3548
def test_partition_pptx_from_file():
3649
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
3750
with open(filename, "rb") as f:

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.5-dev0" # pragma: no cover
1+
__version__ = "0.6.5" # pragma: no cover

Diff for: unstructured/partition/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
def _partition_via_api(
1010
filename: str = "",
11-
file: Optional[bytes] = None,
11+
file: Optional[Union[BinaryIO, bytes]] = None,
1212
url: str = "https://ml.unstructured.io/layout/pdf",
1313
token: Optional[str] = None,
1414
data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing

Diff for: unstructured/partition/common.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import subprocess
2-
from typing import List, Optional, Tuple, Union
2+
from io import BytesIO
3+
from tempfile import SpooledTemporaryFile
4+
from typing import BinaryIO, List, Optional, Tuple, Union
35

46
from unstructured.documents.elements import (
57
TYPE_TO_TEXT_ELEMENT_MAP,
@@ -157,3 +159,15 @@ def exactly_one(**kwargs) -> None:
157159
else:
158160
message = f"{names[0]} must be specified."
159161
raise ValueError(message)
162+
163+
164+
def spooled_to_bytes_io_if_needed(
    file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]],
) -> Optional[Union[bytes, BinaryIO]]:
    """Return a BytesIO copy of a SpooledTemporaryFile; pass anything else through.

    A SpooledTemporaryFile is rewound to its start and read in full, and the bytes
    are wrapped in a new BytesIO. Any other value (including None) is returned
    unchanged.
    """
    if not isinstance(file_obj, SpooledTemporaryFile):
        # Nothing to convert: hand back the original object untouched.
        return file_obj

    # Rewind first so the whole payload is copied regardless of the current position.
    file_obj.seek(0)
    return BytesIO(file_obj.read())

Diff for: unstructured/partition/docx.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import tempfile
3-
from typing import IO, List, Optional
3+
from tempfile import SpooledTemporaryFile
4+
from typing import IO, BinaryIO, List, Optional, Union, cast
45

56
import docx
67
import pypandoc
@@ -15,7 +16,7 @@
1516
Text,
1617
Title,
1718
)
18-
from unstructured.partition.common import exactly_one
19+
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
1920
from unstructured.partition.text_type import (
2021
is_bulleted_text,
2122
is_possible_narrative_text,
@@ -62,7 +63,7 @@
6263

6364
def partition_docx(
6465
filename: Optional[str] = None,
65-
file: Optional[IO] = None,
66+
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
6667
metadata_filename: Optional[str] = None,
6768
) -> List[Element]:
6869
"""Partitions Microsoft Word Documents in .docx format into its document elements.
@@ -85,7 +86,9 @@ def partition_docx(
8586
if filename is not None:
8687
document = docx.Document(filename)
8788
elif file is not None:
88-
document = docx.Document(file)
89+
document = docx.Document(
90+
spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
91+
)
8992

9093
metadata_filename = metadata_filename or filename
9194
elements: List[Element] = []

Diff for: unstructured/partition/pdf.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import re
22
import warnings
3-
from typing import BinaryIO, List, Optional, cast
3+
from tempfile import SpooledTemporaryFile
4+
from typing import BinaryIO, List, Optional, Union, cast
45

56
import pdf2image
67
import pytesseract
@@ -16,6 +17,7 @@
1617
add_element_metadata,
1718
document_to_element_list,
1819
exactly_one,
20+
spooled_to_bytes_io_if_needed,
1921
)
2022
from unstructured.partition.strategies import determine_pdf_or_image_strategy
2123
from unstructured.partition.text import partition_text
@@ -24,7 +26,7 @@
2426

2527
def partition_pdf(
2628
filename: str = "",
27-
file: Optional[bytes] = None,
29+
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
2830
url: Optional[str] = None,
2931
template: str = "layout/pdf",
3032
token: Optional[str] = None,
@@ -86,7 +88,7 @@ def partition_pdf(
8688

8789
def partition_pdf_or_image(
8890
filename: str = "",
89-
file: Optional[bytes] = None,
91+
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
9092
url: Optional[str] = "https://ml.unstructured.io/",
9193
template: str = "layout/pdf",
9294
token: Optional[str] = None,
@@ -122,7 +124,7 @@ def partition_pdf_or_image(
122124
warnings.simplefilter("ignore")
123125
layout_elements = _partition_pdf_or_image_local(
124126
filename=filename,
125-
file=file,
127+
file=spooled_to_bytes_io_if_needed(file),
126128
template=out_template,
127129
is_image=is_image,
128130
infer_table_structure=infer_table_structure,
@@ -133,7 +135,7 @@ def partition_pdf_or_image(
133135
elif strategy == "fast":
134136
return _partition_pdf_with_pdfminer(
135137
filename=filename,
136-
file=file,
138+
file=spooled_to_bytes_io_if_needed(file),
137139
include_page_breaks=include_page_breaks,
138140
encoding=encoding,
139141
)
@@ -159,7 +161,7 @@ def partition_pdf_or_image(
159161
# NOTE(alan): Remove "data=data" after different models are handled by routing
160162
layout_elements = _partition_via_api(
161163
filename=filename,
162-
file=file,
164+
file=cast(BinaryIO, file),
163165
url=url,
164166
token=token,
165167
data=data,
@@ -175,7 +177,7 @@ def partition_pdf_or_image(
175177

176178
def _partition_pdf_or_image_local(
177179
filename: str = "",
178-
file: Optional[bytes] = None,
180+
file: Optional[Union[bytes, BinaryIO]] = None,
179181
template: Optional[str] = None,
180182
is_image: bool = False,
181183
infer_table_structure: bool = False,
@@ -226,7 +228,7 @@ def _partition_pdf_or_image_local(
226228
@requires_dependencies("pdfminer", "local-inference")
227229
def _partition_pdf_with_pdfminer(
228230
filename: str = "",
229-
file: Optional[bytes] = None,
231+
file: Optional[BinaryIO] = None,
230232
include_page_breaks: bool = False,
231233
encoding: str = "utf-8",
232234
) -> List[Element]:
@@ -300,7 +302,7 @@ def _process_pdfminer_pages(
300302

301303
def _partition_pdf_or_image_with_ocr(
302304
filename: str = "",
303-
file: Optional[bytes] = None,
305+
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
304306
include_page_breaks: bool = False,
305307
ocr_languages: str = "eng",
306308
is_image: bool = False,

Diff for: unstructured/partition/pptx.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from typing import IO, List, Optional
1+
from tempfile import SpooledTemporaryFile
2+
from typing import IO, BinaryIO, List, Optional, Union, cast
23

34
import pptx
45

@@ -11,7 +12,7 @@
1112
Text,
1213
Title,
1314
)
14-
from unstructured.partition.common import exactly_one
15+
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
1516
from unstructured.partition.text_type import (
1617
is_possible_narrative_text,
1718
is_possible_title,
@@ -22,7 +23,7 @@
2223

2324
def partition_pptx(
2425
filename: Optional[str] = None,
25-
file: Optional[IO] = None,
26+
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
2627
include_page_breaks: bool = True,
2728
metadata_filename: Optional[str] = None,
2829
) -> List[Element]:
@@ -48,7 +49,9 @@ def partition_pptx(
4849
if filename is not None:
4950
presentation = pptx.Presentation(filename)
5051
elif file is not None:
51-
presentation = pptx.Presentation(file)
52+
presentation = pptx.Presentation(
53+
spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
54+
)
5255

5356
elements: List[Element] = []
5457
metadata_filename = metadata_filename or filename

Diff for: unstructured/partition/strategies.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from typing import BinaryIO, Dict, List, Optional, cast
1+
from tempfile import SpooledTemporaryFile
2+
from typing import BinaryIO, Dict, List, Optional, Union, cast
23

34
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
45
from pdfminer.utils import open_filename
@@ -31,7 +32,10 @@ def validate_strategy(strategy: str, filetype: str):
3132
raise ValueError(f"{strategy} is not a valid strategy for filetype {filetype}.")
3233

3334

34-
def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None):
35+
def is_pdf_text_extractable(
36+
filename: str = "",
37+
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
38+
):
3539
"""Checks to see if the text from a PDF document is extractable. Sometimes the
3640
text is not extractable due to PDF security settings."""
3741
exactly_one(filename=filename, file=file)
@@ -56,7 +60,7 @@ def _fp_is_extractable(fp):
5660
def determine_pdf_or_image_strategy(
5761
strategy: str,
5862
filename: str = "",
59-
file: Optional[bytes] = None,
63+
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
6064
is_image: bool = False,
6165
):
6266
"""Determines what strategy to use for processing PDFs or images, accounting for fallback

0 commit comments

Comments
 (0)