Skip to content

Commit 71e035c

Browse files
amanda103 and cragwolfe authored
Adding content_type and file_filename to autopartition (#394)
Co-authored-by: cragwolfe <[email protected]>
1 parent 8ffd310 commit 71e035c

File tree

6 files changed

+110
-23
lines changed

6 files changed

+110
-23
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
## 0.5.7-dev3
1+
## 0.5.7
22

33
### Enhancements
44

55
* Refactored codebase using `exactly_one`
66
* Adds ability to pass headers when passing a url in partition_html()
7+
* Added optional `content_type` and `file_filename` parameters to `partition()` to bypass file detection
78

89
### Features
910

Diff for: docs/source/bricks.rst

+3-2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ In cases where ``libmagic`` is not available, filetype detection will fall back
3030
As shown in the examples below, the ``partition`` function accepts both filenames and file-like objects as input.
3131
``partition`` also has some optional kwargs.
3232
For example, if you set ``include_page_breaks=True``, the output will include ``PageBreak`` elements if the filetype supports it.
33+
Additionally, you can bypass the filetype detection logic with the optional ``content_type`` argument, which may be specified with either the ``filename`` or the file-like object, ``file``.
3334
You can find a full listing of optional kwargs in the documentation below.
3435

3536
.. code:: python
@@ -38,7 +39,7 @@ You can find a full listing of optional kwargs in the documentation below.
3839
3940
4041
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
41-
elements = partition(filename=filename)
42+
elements = partition(filename=filename, content_type="application/pdf")
4243
print("\n\n".join([str(el) for el in elements][:10]))
4344
4445
@@ -57,7 +58,7 @@ The ``unstructured`` library also includes partitioning bricks targeted at speci
5758
The ``partition`` brick uses these document-specific partitioning bricks under the hood.
5859
There are a few reasons you may want to use a document-specific partitioning brick instead of ``partition``:
5960

60-
* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly will make your program run faster.
61+
* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly, or passing in the ``content_type``, will make your program run faster.
6162
* Fewer dependencies. You don't need to install ``libmagic`` for filetype detection if you're only using document-specific bricks.
6263
* Additional features. The API for partition is the least common denominator for all document types. Certain document-specific bricks include extra features that you may want to take advantage of. For example, ``partition_html`` allows you to pass in a URL so you don't have to store the ``.html`` file locally. See the documentation below to learn about the options available in each partitioning brick.
6364

Diff for: test_unstructured/partition/test_auto.py

+61-15
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,27 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
105105
assert elements == expected_docx_elements
106106

107107

108-
def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
108+
@pytest.mark.parametrize(
109+
("pass_file_filename", "content_type"),
110+
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
111+
)
112+
def test_auto_partition_doc_with_filename(
113+
mock_docx_document,
114+
expected_docx_elements,
115+
tmpdir,
116+
pass_file_filename,
117+
content_type,
118+
):
109119
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
110120
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
111121
mock_docx_document.save(docx_filename)
112122
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
113-
114-
elements = partition(filename=doc_filename)
123+
file_filename = doc_filename if pass_file_filename else None
124+
elements = partition(
125+
filename=doc_filename,
126+
file_filename=file_filename,
127+
content_type=content_type,
128+
)
115129
assert elements == expected_docx_elements
116130
assert elements[0].metadata.filename == doc_filename
117131

@@ -130,17 +144,27 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
130144
assert elements == expected_docx_elements
131145

132146

133-
def test_auto_partition_html_from_filename():
147+
@pytest.mark.parametrize(
148+
("pass_file_filename", "content_type"),
149+
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
150+
)
151+
def test_auto_partition_html_from_filename(pass_file_filename, content_type):
134152
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
135-
elements = partition(filename=filename)
153+
file_filename = filename if pass_file_filename else None
154+
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
136155
assert len(elements) > 0
137156
assert elements[0].metadata.filename == filename
138157

139158

140-
def test_auto_partition_html_from_file():
159+
@pytest.mark.parametrize(
160+
("pass_file_filename", "content_type"),
161+
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
162+
)
163+
def test_auto_partition_html_from_file(pass_file_filename, content_type):
141164
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
165+
file_filename = filename if pass_file_filename else None
142166
with open(filename) as f:
143-
elements = partition(file=f)
167+
elements = partition(file=f, file_filename=file_filename, content_type=content_type)
144168
assert len(elements) > 0
145169

146170

@@ -177,9 +201,15 @@ def test_auto_partition_text_from_file():
177201
assert elements == EXPECTED_TEXT_OUTPUT
178202

179203

180-
def test_auto_partition_pdf_from_filename():
204+
@pytest.mark.parametrize(
205+
("pass_file_filename", "content_type"),
206+
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
207+
)
208+
def test_auto_partition_pdf_from_filename(pass_file_filename, content_type):
181209
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
182-
elements = partition(filename=filename)
210+
file_filename = filename if pass_file_filename else None
211+
212+
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
183213

184214
assert isinstance(elements[0], Title)
185215
assert elements[0].text.startswith("LayoutParser")
@@ -207,10 +237,16 @@ def test_auto_partition_pdf_with_fast_strategy():
207237
)
208238

209239

210-
def test_auto_partition_pdf_from_file():
240+
@pytest.mark.parametrize(
241+
("pass_file_filename", "content_type"),
242+
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
243+
)
244+
def test_auto_partition_pdf_from_file(pass_file_filename, content_type):
211245
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
246+
file_filename = filename if pass_file_filename else None
247+
212248
with open(filename, "rb") as f:
213-
elements = partition(file=f)
249+
elements = partition(file=f, file_filename=file_filename, content_type=content_type)
214250

215251
assert isinstance(elements[0], Title)
216252
assert elements[0].text.startswith("LayoutParser")
@@ -230,16 +266,26 @@ def test_partition_pdf_doesnt_raise_warning():
230266
partition(filename=filename)
231267

232268

233-
def test_auto_partition_jpg():
269+
@pytest.mark.parametrize(
270+
("pass_file_filename", "content_type"),
271+
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
272+
)
273+
def test_auto_partition_jpg(pass_file_filename, content_type):
234274
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
235-
elements = partition(filename=filename)
275+
file_filename = filename if pass_file_filename else None
276+
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
236277
assert len(elements) > 0
237278

238279

239-
def test_auto_partition_jpg_from_file():
280+
@pytest.mark.parametrize(
281+
("pass_file_filename", "content_type"),
282+
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
283+
)
284+
def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
240285
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
286+
file_filename = filename if pass_file_filename else None
241287
with open(filename, "rb") as f:
242-
elements = partition(file=f)
288+
elements = partition(file=f, file_filename=file_filename, content_type=content_type)
243289
assert len(elements) > 0
244290

245291

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.7-dev3" # pragma: no cover
1+
__version__ = "0.5.7" # pragma: no cover

Diff for: unstructured/file_utils/filetype.py

+31-3
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,24 @@ def __lt__(self, other):
111111
return self.name < other.name
112112

113113

114+
STR_TO_FILETYPE = {
115+
"application/pdf": FileType.PDF,
116+
"application/msword": FileType.DOC,
117+
"image/jpeg": FileType.JPG,
118+
"image/png": FileType.PNG,
119+
"text/markdown": FileType.MD,
120+
"text/x-markdown": FileType.MD,
121+
"application/epub": FileType.EPUB,
122+
"application/epub+zip": FileType.EPUB,
123+
"text/html": FileType.HTML,
124+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
125+
"application/vnd.ms-excel": FileType.XLS,
126+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
127+
"application/vnd.ms-powerpoint": FileType.PPT,
128+
"application/xml": FileType.XML,
129+
}
130+
131+
114132
EXT_TO_FILETYPE = {
115133
".pdf": FileType.PDF,
116134
".docx": FileType.DOCX,
@@ -138,18 +156,26 @@ def __lt__(self, other):
138156

139157
def detect_filetype(
140158
filename: Optional[str] = None,
159+
content_type: Optional[str] = None,
141160
file: Optional[IO] = None,
161+
file_filename: Optional[str] = None,
142162
) -> Optional[FileType]:
143163
"""Use libmagic to determine a file's type. Helps determine which partition brick
144164
to use for a given file. A return value of None indicates a non-supported file type."""
145165
exactly_one(filename=filename, file=file)
146166

147-
if filename:
148-
_, extension = os.path.splitext(filename)
167+
if content_type:
168+
filetype = STR_TO_FILETYPE.get(content_type)
169+
if filetype:
170+
return filetype
171+
172+
if filename or file_filename:
173+
_, extension = os.path.splitext(filename or file_filename or "")
149174
extension = extension.lower()
150175
if LIBMAGIC_AVAILABLE:
151-
mime_type = magic.from_file(filename, mime=True)
176+
mime_type = magic.from_file(filename or file_filename, mime=True) # type: ignore
152177
else:
178+
# might not need this
153179
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
154180
elif file is not None:
155181
extension = None
@@ -164,6 +190,8 @@ def detect_filetype(
164190
"Filetype detection on file-like objects requires libmagic. "
165191
"Please install libmagic and try again.",
166192
)
193+
else:
194+
raise ValueError("No filename, file, nor file_filename were specified.")
167195

168196
if mime_type == "application/pdf":
169197
return FileType.PDF

Diff for: unstructured/partition/auto.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717

1818
def partition(
1919
filename: Optional[str] = None,
20+
content_type: Optional[str] = None,
2021
file: Optional[IO] = None,
22+
file_filename: Optional[str] = None,
2123
include_page_breaks: bool = False,
2224
strategy: str = "hi_res",
2325
encoding: str = "utf-8",
@@ -31,8 +33,12 @@ def partition(
3133
----------
3234
filename
3335
A string defining the target filename path.
36+
content_type
37+
A string defining the file's content type as a MIME type, e.g. "application/pdf".
3438
file
3539
A file-like object using "rb" mode --> open(filename, "rb").
40+
file_filename
41+
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
3642
include_page_breaks
3743
If True, the output will include page breaks if the filetype supports it
3844
strategy
@@ -42,7 +48,12 @@ def partition(
4248
encoding
4349
The encoding method used to decode the text input. If None, utf-8 will be used.
4450
"""
45-
filetype = detect_filetype(filename=filename, file=file)
51+
filetype = detect_filetype(
52+
filename=filename,
53+
file=file,
54+
file_filename=file_filename,
55+
content_type=content_type,
56+
)
4657

4758
if file is not None:
4859
file.seek(0)

0 commit comments

Comments
 (0)