Skip to content

Commit 6710df0

Browse files
authored
fix/Fix MS Office filetype errors and harden docker smoketest (#436)
# Changes

**Fix for docx and other office files returning `{"detail":"File type None is not supported."}`**

After moving to the wolfi base image, the `mimetypes` lib no longer knows about these file extensions. To avoid issues like this, let's add an explicit mapping for all the file extensions we care about. I added a `filetypes.py` and moved `get_validated_mimetype` over. When this file is imported, we'll call `mimetypes.add_type` for all file extensions we support.

**Update smoke test coverage**

This bug snuck past because we were already providing the mimetype in the docker smoke test. I updated `test_happy_path` to test against the container with and without passing `content_type`. I added some missing filetypes, and sorted the test params by extension so we can see when new types are missing.

# Testing

The new smoke test will verify that all filetypes are working. You can also `make docker-build && make docker-start-api`, and test out the docx in the sample docs dir. On `main`, this file will give you the error above.

```
curl 'http://localhost:8000/general/v0/general' \
  --form 'files=@"fake.docx"'
```
1 parent d5a878f commit 6710df0

File tree

9 files changed

+187
-114
lines changed

9 files changed

+187
-114
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.0.72
2+
3+
* Fix certain filetypes failing mimetype lookup in the new base image
4+
15
## 0.0.71
26

37
* replace rockylinux with chainguard/wolfi as a base image for `amd64`

prepline_general/api/app.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
app = FastAPI(
1414
title="Unstructured Pipeline API",
1515
summary="Partition documents with the Unstructured library",
16-
version="0.0.71",
16+
version="0.0.72",
1717
docs_url="/general/docs",
1818
openapi_url="/general/openapi.json",
1919
servers=[

prepline_general/api/filetypes.py

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import mimetypes
2+
import os
3+
from fastapi import UploadFile, HTTPException
4+
from typing import Optional
5+
6+
# MIME types accepted when the deployer has not supplied their own
# UNSTRUCTURED_ALLOWED_MIMETYPES. Order and the repeated entries
# ("text/html", "image/png") are kept exactly as they were so the resulting
# string is unchanged.
_DEFAULT_MIMETYPE_ENTRIES = (
    "application/pdf",
    "application/msword",
    "image/jpeg",
    "image/png",
    "text/markdown",
    "text/x-markdown",
    "text/html",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/json",
    "application/vnd.ms-powerpoint",
    "text/html",
    "message/rfc822",
    "text/plain",
    "image/png",
    "application/epub",
    "application/epub+zip",
    "application/rtf",
    "text/rtf",
    "application/vnd.oasis.opendocument.text",
    "text/csv",
    "text/x-csv",
    "application/csv",
    "application/x-csv",
    "text/comma-separated-values",
    "text/x-comma-separated-values",
    "application/xml",
    "text/xml",
    "text/x-rst",
    "text/prs.fallenstein.rst",
    "text/tsv",
    "text/tab-separated-values",
    "application/x-ole-storage",
    "application/vnd.ms-outlook",
    "application/yaml",
    "application/x-yaml",
    "text/x-yaml",
    "text/yaml",
    "image/bmp",
    "image/heic",
    "image/tiff",
    "text/org",
)

# Comma-separated form; every entry is followed by "," so the string ends with a comma.
DEFAULT_MIMETYPES = "".join(f"{entry}," for entry in _DEFAULT_MIMETYPE_ENTRIES)

# Fall back to the defaults when the variable is unset or set to an empty string.
if not os.getenv("UNSTRUCTURED_ALLOWED_MIMETYPES"):
    os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
36+
37+
38+
def _load_mimetypes() -> None:
    """Register every expected file extension with the stdlib `mimetypes` registry.

    Intended to run once on startup so that `mimetypes.guess_type()` can resolve
    these extensions even when the host's own MIME database does not list them.
    """
    extension_to_mimetype = {
        ".bmp": "image/bmp",
        ".csv": "application/csv",
        ".doc": "application/msword",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".eml": "message/rfc822",
        ".epub": "application/epub",
        ".gz": "application/gzip",
        ".heic": "image/heic",
        ".html": "text/html",
        ".jpeg": "image/jpeg",
        ".jpg": "image/jpeg",
        ".json": "application/json",
        ".md": "text/markdown",
        ".msg": "application/x-ole-storage",
        ".odt": "application/vnd.oasis.opendocument.text",
        ".org": "text/org",
        ".pdf": "application/pdf",
        ".png": "image/png",
        ".ppt": "application/vnd.ms-powerpoint",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".rst": "text/prs.fallenstein.rst",
        ".rtf": "application/rtf",
        ".tiff": "image/tiff",
        ".tsv": "text/tab-separated-values",
        ".txt": "text/plain",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xml": "text/xml",
    }

    for extension, mimetype in extension_to_mimetype.items():
        mimetypes.add_type(mimetype, extension)


# Populate the registry as a side effect of importing this module.
_load_mimetypes()
76+
77+
78+
def get_validated_mimetype(file: UploadFile) -> Optional[str]:
    """Return the MIME type of `file`, validated against the configured allow-list.

    The type is taken from `file.content_type`. When that is missing or is the
    generic "application/octet-stream", it is guessed from the filename via the
    `mimetypes` lib instead.

    If UNSTRUCTURED_ALLOWED_MIMETYPES is set, it is read as a comma-separated list.
    Whitespace around entries and empty entries are ignored, so a value such as
    "application/pdf, text/plain" behaves as expected.

    Args:
        file: The uploaded file; only `content_type` and `filename` are read.

    Returns:
        The resolved MIME type. May be None only when no allow-list is configured
        and the type could not be determined.

    Raises:
        HTTPException: status 400 when an allow-list is configured and the resolved
            type (including None) is not in it.
    """
    content_type = file.content_type
    filename = str(file.filename)  # -- "None" when file.filename is None --
    if not content_type or content_type == "application/octet-stream":
        content_type = mimetypes.guess_type(filename)[0]

        # Some filetypes may be missing from the mimetypes lib; hardcode a fallback.
        if not content_type:
            if filename.endswith(".md"):
                content_type = "text/markdown"
            elif filename.endswith(".msg"):
                # NOTE(review): `_load_mimetypes()` registers ".msg" as
                # "application/x-ole-storage", which differs from this fallback.
                # Confirm which value is intended.
                content_type = "message/rfc822"

    allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
    if allowed_mimetypes_str is not None:
        # Tolerate "a, b" style lists and trailing commas in the configured value.
        allowed_mimetypes = {
            entry.strip() for entry in allowed_mimetypes_str.split(",") if entry.strip()
        }

        if content_type not in allowed_mimetypes:
            raise HTTPException(
                status_code=400,
                detail=(f"File type {content_type} is not supported."),
            )

    return content_type

prepline_general/api/general.py

+3-65
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from starlette.types import Send
3535

3636
from prepline_general.api.models.form_params import GeneralFormParams
37+
from prepline_general.api.filetypes import get_validated_mimetype
3738
from unstructured.documents.elements import Element
3839
from unstructured.partition.auto import partition
3940
from unstructured.staging.base import (
@@ -59,37 +60,6 @@ def is_compatible_response_type(media_type: str, response_type: type) -> bool:
5960

6061
logger = logging.getLogger("unstructured_api")
6162

62-
DEFAULT_MIMETYPES = (
63-
"application/pdf,application/msword,image/jpeg,image/png,text/markdown,"
64-
"text/x-markdown,text/html,"
65-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
66-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
67-
"application/vnd.ms-excel,application/vnd.openxmlformats-officedocument."
68-
"presentationml.presentation,"
69-
"application/json,"
70-
"application/vnd.ms-powerpoint,"
71-
"text/html,message/rfc822,text/plain,image/png,"
72-
"application/epub,application/epub+zip,"
73-
"application/rtf,text/rtf,"
74-
"application/vnd.oasis.opendocument.text,"
75-
"text/csv,text/x-csv,application/csv,application/x-csv,"
76-
"text/comma-separated-values,text/x-comma-separated-values,"
77-
"application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst,"
78-
"text/tsv,text/tab-separated-values,"
79-
"application/x-ole-storage,application/vnd.ms-outlook,"
80-
"application/yaml,"
81-
"application/x-yaml,"
82-
"text/x-yaml,"
83-
"text/yaml,"
84-
"image/bmp,"
85-
"image/heic,"
86-
"image/tiff,"
87-
"text/org,"
88-
)
89-
90-
if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None):
91-
os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
92-
9363

9464
def get_pdf_splits(pdf_pages: Sequence[PageObject], split_size: int = 1):
9565
"""Given a pdf (PdfReader) with n pages, split it into pdfs each with split_size # of pages.
@@ -609,38 +579,6 @@ def _set_pdf_infer_table_structure(
609579
return strategy in ("hi_res", "auto") and pdf_infer_table_structure
610580

611581

612-
def get_validated_mimetype(file: UploadFile) -> Optional[str]:
613-
"""The MIME-type of `file`.
614-
615-
The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too
616-
generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
617-
return HTTP 400 for an invalid type.
618-
"""
619-
content_type = file.content_type
620-
filename = str(file.filename) # -- "None" when file.filename is None --
621-
if not content_type or content_type == "application/octet-stream":
622-
content_type = mimetypes.guess_type(filename)[0]
623-
624-
# Some filetypes missing for this library, just hardcode them for now
625-
if not content_type:
626-
if filename.endswith(".md"):
627-
content_type = "text/markdown"
628-
elif filename.endswith(".msg"):
629-
content_type = "message/rfc822"
630-
631-
allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
632-
if allowed_mimetypes_str is not None:
633-
allowed_mimetypes = allowed_mimetypes_str.split(",")
634-
635-
if content_type not in allowed_mimetypes:
636-
raise HTTPException(
637-
status_code=400,
638-
detail=(f"File type {content_type} is not supported."),
639-
)
640-
641-
return content_type
642-
643-
644582
class MultipartMixedResponse(StreamingResponse):
645583
CRLF = b"\r\n"
646584

@@ -713,7 +651,7 @@ def return_content_type(filename: str):
713651

714652

715653
@router.get("/general/v0/general", include_in_schema=False)
716-
@router.get("/general/v0.0.71/general", include_in_schema=False)
654+
@router.get("/general/v0.0.72/general", include_in_schema=False)
717655
async def handle_invalid_get_request():
718656
raise HTTPException(
719657
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -728,7 +666,7 @@ async def handle_invalid_get_request():
728666
description="Description",
729667
operation_id="partition_parameters",
730668
)
731-
@router.post("/general/v0.0.71/general", include_in_schema=False)
669+
@router.post("/general/v0.0.72/general", include_in_schema=False)
732670
def general_partition(
733671
request: Request,
734672
# cannot use annotated type here because of a bug described here:

preprocessing-pipeline-family.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.71
2+
version: 0.0.72

sample-docs/DA-1p.bmp

1.44 MB
Binary file not shown.

sample-docs/DA-1p.heic

94.2 KB
Binary file not shown.
1.85 MB
Binary file not shown.

scripts/smoketest.py

+71-47
Original file line numberDiff line numberDiff line change
@@ -49,72 +49,96 @@ def send_document(
4949

5050

5151
@pytest.mark.parametrize(
52-
"example_filename, content_type",
52+
("extension", "example_filename", "content_type"),
5353
[
54-
# Note(yuming): Please sort filetypes alphabetically according to
55-
# https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/auto.py#L14
56-
("stanley-cups.csv", "application/csv"),
57-
("fake.doc", "application/msword"),
58-
("fake.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
59-
("alert.eml", "message/rfc822"),
60-
("announcement.eml", "message/rfc822"),
61-
("fake-email-attachment.eml", "message/rfc822"),
62-
("fake-email-image-embedded.eml", "message/rfc822"),
63-
("fake-email.eml", "message/rfc822"),
64-
("family-day.eml", "message/rfc822"),
65-
("winter-sports.epub", "application/epub"),
66-
("fake-html.html", "text/html"),
67-
pytest.param(
68-
"layout-parser-paper-fast.jpg",
69-
"image/jpeg",
70-
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
71-
),
72-
("spring-weather.html.json", "application/json"),
73-
("README.md", "text/markdown"),
74-
("fake-email.msg", "application/x-ole-storage"),
75-
("fake.odt", "application/vnd.oasis.opendocument.text"),
76-
# Note(austin) The two inference calls will hang on mac with unsupported hardware error
77-
# Skip these with SKIP_INFERENCE_TESTS=true make docker-test
78-
pytest.param(
79-
"layout-parser-paper.pdf.gz",
80-
"application/gzip",
81-
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
82-
),
83-
pytest.param(
84-
"layout-parser-paper.pdf",
85-
"application/pdf",
86-
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
54+
(".bmp", "DA-1p.bmp", "image/bmp"),
55+
(".csv", "stanley-cups.csv", "application/csv"),
56+
(".doc", "fake.doc", "application/msword"),
57+
(
58+
".docx",
59+
"fake.docx",
60+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
8761
),
88-
("fake-power-point.ppt", "application/vnd.ms-powerpoint"),
62+
(".eml", "fake-email-attachment.eml", "message/rfc822"),
63+
(".epub", "winter-sports.epub", "application/epub"),
64+
(".heic", "DA-1p.heic", "image/heic"),
65+
(".html", "fake-html.html", "text/html"),
66+
(".jpeg", "layout-parser-paper-fast.jpg", "image/jpeg"),
67+
(".md", "README.md", "text/markdown"),
68+
(".msg", "fake-email.msg", "application/x-ole-storage"),
69+
(".odt", "fake.odt", "application/vnd.oasis.opendocument.text"),
70+
(".pdf", "layout-parser-paper.pdf", "application/pdf"),
71+
(".png", "english-and-korean.png", "image/png"),
72+
(".ppt", "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
8973
(
74+
".pptx",
9075
"fake-power-point.pptx",
9176
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
9277
),
93-
("README.rst", "text/prs.fallenstein.rst"),
94-
("fake-doc.rtf", "application/rtf"),
95-
("fake-text.txt", "text/plain"),
96-
("stanley-cups.tsv", "text/tab-separated-values"),
78+
(".rst", "README.rst", "text/prs.fallenstein.rst"),
79+
(".rtf", "fake-doc.rtf", "application/rtf"),
80+
(".tiff", "layout-parser-paper-fast.tiff", "image/tiff"),
81+
(".tsv", "stanley-cups.tsv", "text/tab-separated-values"),
82+
(".txt", "fake-text.txt", "text/plain"),
9783
(
84+
".xlsx",
9885
"stanley-cups.xlsx",
9986
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
10087
),
101-
("fake-xml.xml", "text/xml"),
88+
(".xml", "fake-xml.xml", "text/xml"),
89+
(".json", "spring-weather.html.json", "application/json"),
90+
(
91+
".gz",
92+
"layout-parser-paper.pdf.gz",
93+
"application/gzip",
94+
),
10295
],
10396
)
104-
def test_happy_path(example_filename: str, content_type: str):
97+
def test_happy_path_all_types(extension, example_filename: str, content_type: str):
10598
"""
10699
For the files in sample-docs, verify that we get a 200
107100
and some structured response
108101
"""
102+
# The auto strategy will run ocr on these files
103+
# This doesn't always work on our macs
104+
if skip_inference_tests and extension in [
105+
".bmp",
106+
".heic",
107+
".jpeg",
108+
".pdf",
109+
".png",
110+
".tiff",
111+
".gz", # Since we're using a gzipped pdf...
112+
]:
113+
pytest.skip("emulated hardware")
114+
109115
test_file = str(Path("sample-docs") / example_filename)
110-
print(f"sending {content_type}")
111-
json_response = send_document(filenames=[test_file], content_type=content_type)
112-
assert json_response.status_code == 200
113-
assert len(json_response.json()) > 0
114-
assert len("".join(elem["text"] for elem in json_response.json())) > 20
115116

117+
# Verify we can send with explicit content type
118+
response = send_document(filenames=[test_file], content_type=content_type)
119+
120+
if response.status_code != 200:
121+
assert False, response.text
122+
123+
assert len(response.json()) > 0
124+
assert len("".join(elem["text"] for elem in response.json())) > 20
125+
126+
# Verify we can infer the filetype on the server
127+
response = send_document(filenames=[test_file], content_type=None)
128+
129+
if response.status_code != 200:
130+
assert False, response.text
131+
132+
assert len(response.json()) > 0
133+
assert len("".join(elem["text"] for elem in response.json())) > 20
134+
135+
json_response = response
136+
137+
# Verify we can set output type to csv
116138
csv_response = send_document(
117-
filenames=[test_file], content_type=content_type, output_format="text/csv"
139+
filenames=[test_file],
140+
content_type=content_type,
141+
output_format="text/csv",
118142
)
119143
assert csv_response.status_code == 200
120144
assert len(csv_response.text) > 0

0 commit comments

Comments
 (0)